Skip to content
Snippets Groups Projects
Unverified Commit 81a13cf7 authored by Joost van Heerwaarden's avatar Joost van Heerwaarden Committed by GitHub
Browse files

Update Master_Script_V1.0.R

parent d3c4e621
No related branches found
No related tags found
No related merge requests found
require(openxlsx) require(openxlsx)
###add ontology read ###add ontology read
##add final metadata, implement automated inference ##add final metadata, implement automated inference
##make sure metdata and variable definitions are not overwritten if present!!!!! ##make sure metdata and variable definitions are not overwritten if present!!!!!
### ###
###set working directory ###set working directory
wd=getwd() wd=getwd()
wd=gsub("scripts","", wd) wd=gsub("scripts","", wd)
setwd(wd) setwd(wd)
####make directories for data ####make directories for data
path.data.raw=paste(getwd(),"/data/raw/",sep="") path.data.raw=paste(getwd(),"/data/raw/",sep="")
path.data.proc=paste(getwd(),"/data/processed/",sep="") path.data.proc=paste(getwd(),"/data/processed/",sep="")
path.data.op=paste(getwd(),"/data/definitions_protocols/",sep="") path.data.op=paste(getwd(),"/data/definitions_protocols/",sep="")
path.writing=paste(getwd(),"/writing/",sep="") path.writing=paste(getwd(),"/writing/",sep="")
path.results=paste(getwd(),"/results/",sep="") path.results.raw=paste(getwd(),"/results/raw",sep="")
path.scripts=paste(getwd(),"/scripts/",sep="") path.results.tab=paste(getwd(),"/results/tables",sep="")
path.results.im=paste(getwd(),"/results/images",sep="")
path.scripts=paste(getwd(),"/scripts/",sep="")
dir.create(path.data.raw, showWarnings=F, recursive=T)
dir.create(path.data.proc, showWarnings=F, recursive=T)
dir.create(path.data.op, showWarnings=F, recursive=T) dir.create(path.data.raw, showWarnings=F, recursive=T)
dir.create(path.writing, showWarnings=F, recursive=T) dir.create(path.data.proc, showWarnings=F, recursive=T)
dir.create(path.results, showWarnings=F, recursive=T) dir.create(path.data.op, showWarnings=F, recursive=T)
dir.create(path.scripts, showWarnings=F, recursive=T) dir.create(path.writing, showWarnings=F, recursive=T)
dir.create(path.results.raw, showWarnings=F, recursive=T)
dir.create(path.results.tab, showWarnings=F, recursive=T)
####check presence of data files and list dir.create(path.results.im, showWarnings=F, recursive=T)
data.list.wd= list.files(wd) dir.create(path.scripts, showWarnings=F, recursive=T)
data.list.wd = data.list.wd[grep("metadata", data.list.wd,invert=T)]
data.list.wd = data.list.wd[grep("readme", data.list.wd,invert=T)]
writing.list.wd=data.list.wd[unique(c(grep(".doc", data.list.wd),grep(".docx", data.list.wd),grep(".txt", data.list.wd),grep(".rtf", data.list.wd)))] ####check presence of data files and list
data.list.wd= data.list.wd[unique(c(grep(".csv", data.list.wd),grep(".xls", data.list.wd)))] data.list.wd= list.files(wd)
data.list.wd = data.list.wd[grep("metadata", data.list.wd,invert=T)]
data.list.raw= list.files(path.data.raw) data.list.wd = data.list.wd[grep("readme", data.list.wd,invert=T)]
writing.list= list.files(path.writing) writing.list.wd=data.list.wd[unique(c(grep(".doc", data.list.wd),grep(".docx", data.list.wd),grep(".txt", data.list.wd),grep(".rtf", data.list.wd)))]
##remove metedata data.list.wd= data.list.wd[unique(c(grep(".csv", data.list.wd),grep(".xls", data.list.wd)))]
data.list.raw= data.list.raw[grep("metadata",data.list.raw,invert=T)]
data.list.proc= list.files(path.data.proc) data.list.raw= list.files(path.data.raw)
writing.list= list.files(path.writing)
##if raw folder is empty, move data from main folder to raw ##remove metedata
data.list.raw= data.list.raw[grep("metadata",data.list.raw,invert=T)]
if(length(data.list.raw)==0){ data.list.proc= list.files(path.data.proc)
file.copy(data.list.wd, path.data.raw)
file.remove(data.list.wd) ##if raw folder is empty, move data from main folder to raw
data.list.raw= list.files(path.data.raw) if(length(data.list.raw)==0){
##remove metedata file.copy(data.list.wd, path.data.raw)
data.list.raw= data.list.raw[grep("metadata",data.list.raw,invert=T)] file.remove(data.list.wd)
} data.list.raw= list.files(path.data.raw)
##remove metedata
data.list.raw= data.list.raw[grep("metadata",data.list.raw,invert=T)]
if(length(writing.list)==0){
}
file.copy(writing.list.wd, path.writing)
file.remove(writing.list.wd)
if(length(writing.list)==0){
}
file.copy(writing.list.wd, path.writing)
file.remove(writing.list.wd)
##now read all excel workbooks and sheets within workbooks }
for(wb in data.list.raw){
##now read all excel workbooks and sheets within workbooks
if(grepl(".xlsx",wb)){
for(wb in data.list.raw){
wb.name=gsub(".xlsx","", wb)
file=paste(path.data.raw, wb,sep="/")
sheets=getSheetNames(file) if(grepl(".xlsx",wb)){
wb.name=gsub(".xlsx","", wb)
file=paste(path.data.raw, wb,sep="/")
sheets=getSheetNames(file)
}
if(grepl(".csv",wb)){ }
wb.name=gsub(".csv","", wb)
file=paste(path.data.raw, wb,sep="/")
sheets="data" if(grepl(".csv",wb)){
} wb.name=gsub(".csv","", wb)
file=paste(path.data.raw, wb,sep="/")
sheets="data"
##create workbook to store data and metadata }
meta.wb.name=paste(wb.name,"_metadata",sep="")
meta.wb <- createWorkbook(meta.wb.name)
##create workbook to store data and metadata
##store variable names meta.wb.name=paste(wb.name,"_metadata",sep="")
var.names=c() meta.wb <- createWorkbook(meta.wb.name)
unit.vec=c()
sheet.names=c()
##store variable names
for(st in sheets){ var.names=c()
unit.vec=c()
if(grepl(".xlsx",wb)){ sheet.names=c()
t1<-read.xlsx(file,detectDates=T,sheet=st, colNames=F)
} for(st in sheets){
if(grepl(".csv",wb)){ if(grepl(".xlsx",wb)){
t1<-read.csv(file, header =F,as.is =T) t1<-read.xlsx(file,detectDates=T,sheet=st, colNames=F)
} }
##skip empty worksheets if(grepl(".csv",wb)){
if(is.null(t1)) next t1<-read.csv(file, header =F,as.is =T)
}
##count NA to detect header in excel sheet
#na count function that sets "" to NA (for csv) ##skip empty worksheets
na.count.fun=function(x){ if(is.null(t1)) next
x[which(x=="")]=NA
out=sum(!is.na(x)) ##count NA to detect header in excel sheet
return(out) #na count function that sets "" to NA (for csv)
} na.count.fun=function(x){
x[which(x=="")]=NA
na.count=apply(t1,1, na.count.fun) out=sum(!is.na(x))
ncol=max(na.count) return(out)
head.row=which(na.count==ncol)[1] }
##make dataframe from sheet with colum names na.count=apply(t1,1, na.count.fun)
colnames=as.character(t1[head.row,]) ncol=max(na.count)
head.row=which(na.count==ncol)[1]
###remove NA from names
miss.name=which(is.na(colnames)|colnames=="NA") ##make dataframe from sheet with colum names
colnames[miss.name]=paste("X",1:length(miss.name),sep="") colnames=as.character(t1[head.row,])
###fish units from names using $ ##when units flag that specification of methods and defenitions of measurements (e.g. reported vs measured, dr vs fresh weight.) ###remove NA from names
colnames=lapply(colnames,function(x) unlist(strsplit(x,split="\\$"))) miss.name=which(is.na(colnames)|colnames=="NA")
colnames[miss.name]=paste("X",1:length(miss.name),sep="")
contains.unit=unlist(lapply(colnames,function(x) length(x)>1 ))
###fish units from names using $ ##when units flag that specification of methods and defenitions of measurements (e.g. reported vs measured, dr vs fresh weight.)
units=rep("",length(colnames)) colnames=lapply(colnames,function(x) unlist(strsplit(x,split="\\$")))
units[contains.unit]=unlist(lapply(colnames[contains.unit],function(x) x[2])) contains.unit=unlist(lapply(colnames,function(x) length(x)>1 ))
colnames=unlist(lapply(colnames,function(x) x[1])) units=rep("",length(colnames))
new.frame= t1[(head.row+1):nrow(t1),1:length(colnames)] units[contains.unit]=unlist(lapply(colnames[contains.unit],function(x) x[2]))
colnames(new.frame)= colnames
colnames=unlist(lapply(colnames,function(x) x[1]))
var.names=c(var.names, colnames) new.frame= t1[(head.row+1):nrow(t1),1:length(colnames)]
unit.vec=c(unit.vec, units) colnames(new.frame)= colnames
sheet.names=c(sheet.names,rep(st,length(colnames)))
var.names=c(var.names, colnames)
addWorksheet(meta.wb, st) unit.vec=c(unit.vec, units)
writeData(meta.wb, st, new.frame) sheet.names=c(sheet.names,rep(st,length(colnames)))
} addWorksheet(meta.wb, st)
writeData(meta.wb, st, new.frame)
##make data frame with workbook name, sheet names and variables and add column for definitions
var.frame=data.frame("workbook"= wb.name,"sheet"= sheet.names,"variable"= var.names,"unit"=unit.vec,"definition"=NA,"unique identifier"=0,"personal information"=0, stringsAsFactors = F) }
#set up metadata ##make data frame with workbook name, sheet names and variables and add column for definitions
#mid.names=c("ID","Country","Name region","Name site","Minimum latitude","Maximum latitude", "Minimum longitude","Maximum longitude","Experiment/survey","Type of experiment","Type of survey","On-farm/on-station","Crops","Animals","Soil type")
#meta.data.frame=read.xlsx(paste(path.data.op,"Required_Metadata.xlsx",sep="/")) var.frame=data.frame("workbook"= wb.name,"sheet"= sheet.names,"variable"= var.names,"unit"=unit.vec,"definition"=NA,"unique identifier"=0,"personal information"=0, stringsAsFactors = F)
field.vec=c("Data ID", "Official title of the dataset", "Project name", "Description of project", "Author", "Author ID(ORCID)", "Contributor(s)", "Subject matter of research/Vocabulary", "Data origin", "Funder(s) or sponsor(s) of project", "creation date (m/d/yyyy)", "Embargo end date", "Citation", "keywords (AGROVOC)", "Country(ies) covered", "Point longitude coord. in Dec. Degrees ", "Agro-Ecological Zone(s)(FAO) covered", "Years covered by data", "Crops covered by data", "Animals covered by data", "Start date of data collection ", "End date of data collection ", "License (default=CC-BY)", "Permission given by email", "Rights", "Contact email") #set up metadata
#mid.names=c("ID","Country","Name region","Name site","Minimum latitude","Maximum latitude", "Minimum longitude","Maximum longitude","Experiment/survey","Type of experiment","Type of survey","On-farm/on-station","Crops","Animals","Soil type")
field_name.vec=c("data.id", "data.title", "project.name", "project.description", "author", "orcid", "contributors", "subject.research", "data.origin", "donor", "date.creation", "date.embargo", "citation", "keywords.agrovoc", "countries", "longitude", "aez", "years", "crops", "animals", "date.collect.start", "date.collect.end", "licence", "permission", "rights", "contact.mail") #meta.data.frame=read.xlsx(paste(path.data.op,"Required_Metadata.xlsx",sep="/"))
meta.data.frame=data.frame(field= field.vec,field_name= field_name.vec,values=NA, stringsAsFactors = F) field.vec=c("Data ID", "Official title of the dataset", "Project name", "Description of project", "Author", "Author ID(ORCID)", "Contributor(s)", "Subject matter of research/Vocabulary", "Data origin", "Funder(s) or sponsor(s) of project", "creation date (m/d/yyyy)", "Embargo end date", "Citation", "keywords (AGROVOC)", "Country(ies) covered", "Point longitude coord. in Dec. Degrees ", "Agro-Ecological Zone(s)(FAO) covered", "Years covered by data", "Crops covered by data", "Animals covered by data", "Start date of data collection ", "End date of data collection ", "License (default=CC-BY)", "Permission given by email", "Rights", "Contact email")
field_name.vec=c("data.id", "data.title", "project.name", "project.description", "author", "orcid", "contributors", "subject.research", "data.origin", "donor", "date.creation", "date.embargo", "citation", "keywords.agrovoc", "countries", "longitude", "aez", "years", "crops", "animals", "date.collect.start", "date.collect.end", "licence", "permission", "rights", "contact.mail")
###check if workbook with variable definitions and metadata already present in processed and make sure not overwritten
exist.wb= data.list.proc[grep(wb.name, data.list.proc)] meta.data.frame=data.frame(field= field.vec,field_name= field_name.vec,values=NA, stringsAsFactors = F)
if(length(exist.wb)>0){
##read.data and extract ###check if workbook with variable definitions and metadata already present in processed and make sure not overwritten
exist.file=paste(path.data.proc,exist.wb,sep="/") exist.wb= data.list.proc[grep(wb.name, data.list.proc)]
existing.varframe=read.xlsx(exist.file,detectDates=T,sheet= "variable definitions")
existing.meta.data.frame=read.xlsx(exist.file,detectDates=T,sheet= "meta data") if(length(exist.wb)>0){
##read.data and extract
existing.meta.data.frame.id=apply(existing.meta.data.frame[,1:2],1,paste,collapse=";") exist.file=paste(path.data.proc,exist.wb,sep="/")
meta.data.frame.id=apply(meta.data.frame[,1:2],1,paste,collapse=";") existing.varframe=read.xlsx(exist.file,detectDates=T,sheet= "variable definitions")
existing.meta.data.frame=read.xlsx(exist.file,detectDates=T,sheet= "meta data")
existing.varframe.id=apply(existing.varframe[,1:3],1,paste,collapse=";") existing.meta.data.frame.id=apply(existing.meta.data.frame[,1:2],1,paste,collapse=";")
var.frame.id=apply(var.frame[,1:3],1,paste,collapse=";") meta.data.frame.id=apply(meta.data.frame[,1:2],1,paste,collapse=";")
##overwrite new frames with existing values existing.varframe.id=apply(existing.varframe[,1:3],1,paste,collapse=";")
meta.data.frame[na.omit(match(existing.meta.data.frame.id,meta.data.frame.id)),]<-existing.meta.data.frame[na.omit(match(meta.data.frame.id,existing.meta.data.frame.id)),] var.frame.id=apply(var.frame[,1:3],1,paste,collapse=";")
var.frame[na.omit(match(existing.varframe.id,var.frame.id)),]<-existing.varframe[na.omit(match( var.frame.id,existing.varframe.id)),]
##rbind additional rows ##overwrite new frames with existing values
meta.data.frame=rbind(meta.data.frame, existing.meta.data.frame[which(!existing.meta.data.frame.id%in% meta.data.frame.id),]) meta.data.frame[na.omit(match(existing.meta.data.frame.id,meta.data.frame.id)),]<-existing.meta.data.frame[na.omit(match(meta.data.frame.id,existing.meta.data.frame.id)),]
var.frame =rbind(var.frame, existing.varframe[which(!existing.varframe.id%in%var.frame.id),]) var.frame[na.omit(match(existing.varframe.id,var.frame.id)),]<-existing.varframe[na.omit(match( var.frame.id,existing.varframe.id)),]
##rbind additional rows
} meta.data.frame=rbind(meta.data.frame, existing.meta.data.frame[which(!existing.meta.data.frame.id%in% meta.data.frame.id),])
var.frame =rbind(var.frame, existing.varframe[which(!existing.varframe.id%in%var.frame.id),])
##now add metadata to metadata workbook
addWorksheet(meta.wb, "meta data") }
addWorksheet(meta.wb, "variable definitions")
##now add metadata to metadata workbook
addWorksheet(meta.wb, "meta data")
addWorksheet(meta.wb, "variable definitions")
writeData(meta.wb, "meta data", meta.data.frame) writeData(meta.wb, "meta data", meta.data.frame)
writeData(meta.wb, "variable definitions", var.frame) writeData(meta.wb, "variable definitions", var.frame)
##now save workbook ##now save workbook
saveWorkbook(meta.wb,file = paste(path.data.proc,meta.wb.name,".xlsx",sep=""), overwrite = T) ##later do not overrwrite but make sure merged/aggregated saveWorkbook(meta.wb,file = paste(path.data.proc,meta.wb.name,".xlsx",sep=""), overwrite = T) ##later do not overrwrite but make sure merged/aggregated
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment