diff --git a/1_data_cleaning.R b/1_data_cleaning.R index 30f0cb184843da02582a7d012338eb721d052fd9..c09b940a989a81bf515690fbbe41ff771b2218eb 100644 --- a/1_data_cleaning.R +++ b/1_data_cleaning.R @@ -51,7 +51,10 @@ d$yield[id_NA] <- NA d$fertilizer <- stringi::stri_escape_unicode(d$fertilizer) # replace the unicode representation by "e" yielding "Efficiencie" -d$fertilizer <- str_replace(d$fertilizer, "\\\\u00c3\\\\u0192\\\\u00c2\\\\u00ab", "e") +#d$fertilizer <- str_replace(d$fertilizer, "\\\\u00c3\\\\u00ab\\\\u00eb", "e") +##changed to gsub, which should work on Mac +d$fertilizer <- gsub("\\\\u00eb", "e",d$fertilizer) + ## Convert yield in ton per ha (tha), # here I use dplyr::mutate as an example diff --git a/Master_Script.R b/Master_Script.R index 82080bf11e4c4bf9890c5af33c777f29cdb8030c..1d653e6ec9c5e18927e512913b92f119b471b0f9 100644 --- a/Master_Script.R +++ b/Master_Script.R @@ -1,4 +1,3 @@ - ##install all required packages # Install required packages if necessary @@ -184,7 +183,15 @@ units[contains.unit]=unlist(lapply(colnames[contains.unit],function(x) x[2])) colnames=unlist(lapply(colnames,function(x) x[1])) -new.frame= t1[(head.row+1):nrow(t1),1:length(colnames)] +###now re-read files starting with first data row (to allow proper data type) +if(grepl(".xlsx",wb)){ +new.frame <-read.xlsx(file,detectDates=T,sheet=st, colNames=F, startRow=(head.row+1)) +} + +if(grepl(".csv",wb)){ +new.frame<-read.csv(file, header =F,as.is =T, skip=(head.row)) +} + colnames(new.frame)= colnames @@ -207,9 +214,9 @@ var.frame=data.frame("workbook"= wb.name,"sheet"= sheet.names,"variable"= var.na #mid.names=c("ID","Country","Name region","Name site","Minimum latitude","Maximum latitude", "Minimum longitude","Maximum longitude","Experiment/survey","Type of experiment","Type of survey","On-farm/on-station","Crops","Animals","Soil type") #meta.data.frame=read.xlsx(paste(path.data.op,"Required_Metadata.xlsx",sep="/")) -field.vec=c("Data ID", "Official title of the dataset", "Project 
name", "Description of project", "Author", "Author ID(ORCID)", "Contributor(s)", "Subject matter of research/Vocabulary", "Data origin", "Funder(s) or sponsor(s) of project", "creation date (m/d/yyyy)", "Embargo end date", "Citation", "keywords (AGROVOC)", "Country(ies) covered", "Point longitude coord. in Dec. Degrees ", "Agro-Ecological Zone(s)(FAO) covered", "Years covered by data", "Crops covered by data", "Animals covered by data", "Start date of data collection ", "End date of data collection ", "License (default=CC-BY)", "Permission given by email", "Rights", "Contact email") +field.vec=c("Data ID", "Official title of the dataset", "Project name", "Description of project", "Author", "Author ID(ORCID)", "Contributor(s)", "Subject matter of research/Vocabulary", "Data origin", "Funder(s) or sponsor(s) of project", "creation date (m/d/yyyy)", "Embargo end date", "Citation", "keywords (AGROVOC)", "Country(ies) covered", "Point geographic coordinates in Dec. Degrees ", "Agro-Ecological Zone(s)(FAO) covered", "Years covered by data", "Crops covered by data", "Animals covered by data", "Start date of data collection ", "End date of data collection ", "License (default=CC-BY)", "Permission given by email", "Rights", "Contact email") -field_name.vec=c("data.id", "data.title", "project.name", "project.description", "author", "orcid", "contributors", "subject.research", "data.origin", "donor", "date.creation", "date.embargo", "citation", "keywords.agrovoc", "countries", "longitude", "aez", "years", "crops", "animals", "date.collect.start", "date.collect.end", "licence", "permission", "rights", "contact.mail") +field_name.vec=c("data.id", "data.title", "project.name", "project.description", "author", "orcid", "contributors", "subject.research", "data.origin", "donor", "date.creation", "date.embargo", "citation", "keywords.agrovoc", "countries", "longitude.latitude", "aez", "years", "crops", "animals", "date.collect.start", "date.collect.end", "licence", "permission", 
"rights", "contact.mail") meta.data.frame=data.frame(field= field.vec,field_name= field_name.vec,values=NA, stringsAsFactors = F)