From 47470f76d11d2b31923c200dc7590aa09a729eb6 Mon Sep 17 00:00:00 2001 From: Sterken <mark.sterken@wur.nl> Date: Wed, 2 Aug 2023 21:59:58 +0200 Subject: [PATCH] added all custom functions --- Function_scripts/H2.broad.R | 150 ++++++++++++++++++ Function_scripts/NEMA.get.functions.R | 54 +++++++ Function_scripts/QTL.data.prep.R | 73 +++++++++ Function_scripts/QTL.eQTL.call.TBs.R | 60 +++++++ Function_scripts/QTL.eQTL.table.R | 53 +++++++ Function_scripts/QTL.eQTL.table.addTB.R | 77 +++++++++ Function_scripts/QTL.map.1.FDR.R | 92 +++++++++++ Function_scripts/QTL.map.1.R | 87 ++++++++++ Function_scripts/QTL.map.1.perm.R | 85 ++++++++++ Function_scripts/QTL.map.1.sim.R | 124 +++++++++++++++ Function_scripts/QTL.map1.dataframe.R | 58 +++++++ Function_scripts/QTL.peak.finder.R | 60 +++++++ Function_scripts/QTL.sim.data.per.marker.R | 67 ++++++++ Function_scripts/QTL.table.addR2.R | 54 +++++++ Function_scripts/R.sq.R | 15 ++ Function_scripts/enrich.easy.R | 52 ++++++ Function_scripts/enrich.get.geneIDs.R | 32 ++++ Function_scripts/enrich.more.R | 28 ++++ Function_scripts/enrich.one.R | 35 ++++ Function_scripts/fast.lm.1.R | 21 +++ Function_scripts/fast.summ.R | 33 ++++ Function_scripts/fast.summ.transcriptomics.R | 33 ++++ Function_scripts/genetic.df.to.mat.R | 67 ++++++++ Function_scripts/genetic.distance.R | 81 ++++++++++ .../genetic.informative.markers.R | 27 ++++ Function_scripts/lm.1.R | 29 ++++ Function_scripts/permutate.traits.R | 20 +++ Function_scripts/plot.genotype.strain.R | 108 +++++++++++++ Function_scripts/prep.ggplot.QTL.profile.R | 20 +++ Function_scripts/prep.ggplot.eQTL.table.R | 36 +++++ Function_scripts/prep.ggplot.genetic.map.R | 52 ++++++ .../transcriptomics.age.predictor.R | 49 ++++++ Function_scripts/transcriptomics.df.to.list.R | 48 ++++++ Function_scripts/transcriptomics.lm.R | 88 ++++++++++ Function_scripts/transgression.calc.R | 73 +++++++++ mainscript_aS_eQTL.R | 4 +- 36 files changed, 2043 insertions(+), 2 deletions(-) create mode 100644 Function_scripts/H2.broad.R create mode 100644 Function_scripts/NEMA.get.functions.R create mode 100644 Function_scripts/QTL.data.prep.R create mode 100644 Function_scripts/QTL.eQTL.call.TBs.R create mode 100644 Function_scripts/QTL.eQTL.table.R create mode 100644 Function_scripts/QTL.eQTL.table.addTB.R create mode 100644 Function_scripts/QTL.map.1.FDR.R create mode 100644 Function_scripts/QTL.map.1.R create mode 100644 Function_scripts/QTL.map.1.perm.R create mode 100644 Function_scripts/QTL.map.1.sim.R create mode 100644 Function_scripts/QTL.map1.dataframe.R create mode 100644 Function_scripts/QTL.peak.finder.R create mode 100644 Function_scripts/QTL.sim.data.per.marker.R create mode 100644 Function_scripts/QTL.table.addR2.R create mode 100644 Function_scripts/R.sq.R create mode 100644 Function_scripts/enrich.easy.R create mode 100644 Function_scripts/enrich.get.geneIDs.R create mode 100644 Function_scripts/enrich.more.R create mode 100644 Function_scripts/enrich.one.R create mode 100644 Function_scripts/fast.lm.1.R create mode 100644 Function_scripts/fast.summ.R create mode 100644 Function_scripts/fast.summ.transcriptomics.R create mode 100644 Function_scripts/genetic.df.to.mat.R create mode 100644 Function_scripts/genetic.distance.R create mode 100644 Function_scripts/genetic.informative.markers.R create mode 100644 Function_scripts/lm.1.R create mode 100644 Function_scripts/permutate.traits.R create mode 100644 Function_scripts/plot.genotype.strain.R create mode 100644 Function_scripts/prep.ggplot.QTL.profile.R create mode 100644 Function_scripts/prep.ggplot.eQTL.table.R create mode 100644 Function_scripts/prep.ggplot.genetic.map.R create mode 100644 Function_scripts/transcriptomics.age.predictor.R create mode 100644 Function_scripts/transcriptomics.df.to.list.R create mode 100644 Function_scripts/transcriptomics.lm.R create mode 100644 Function_scripts/transgression.calc.R diff --git a/Function_scripts/H2.broad.R b/Function_scripts/H2.broad.R new file mode 100644 index 0000000..9ef6160 --- /dev/null +++ b/Function_scripts/H2.broad.R @@ -0,0 +1,150 @@ +###function for broad-sense heritability +#currently 3 methods are supported: +# ANOVA (very basic method) [ref?] +# REML (based on lme4 package) [ref?] +# Keurentjes_2007 (based on Brem, 2005 and Keurentjes, 2007) +# all methods support permutation + +###input +#trait.value: a vector with trait values +#strain: a vector with strain names matching the trait values +#trait: a vector with trait names (e.g you can enter multiple traits this way) +#method: one of three methods: ANOVA, REML, or Keurentjes_2007 (standard is ANOVA because it's quick) +#parental.strain: names of parental strains (only for Keurentjes; otherwise parental strains should be removed from the analysis) +#n.perm: number of permutations to be done (standard 1000) +#Vg.factor: fraction of the measured genotypic variance should be counted towards total variance (standard is 1) + +###output +#a dataframe with traits per row and columns with the heritability, underlying values and an FDR0.05 threshold + +###update +#made a function that determines the fdr based on the actual value (not only if it's significant anymore) + + H2.broad <- function(trait.value,strain,trait,method,parental.strain,n.perm,Vg.factor){ + if(missing(trait.value)){ stop("requires trait value")} + if(missing(strain)){ stop("requires strain values")} + if(missing(trait)){ trait <- rep("A",length(trait.value))} + if(missing(method)){ method <- "ANOVA"} + if(!method %in% c("ANOVA","REML","Keurentjes_2007")){ stop("method should be either ANOVA, REML, or Keurentjes_2007")} + if(method=="Keurentjes_2007"){ + if(missing(parental.strain)){ stop("when using Keurentjes_2007, parental strain names should be provided")} + if(sum(parental.strain %in% strain) != length(parental.strain)){ + stop("not all parental strains are in the data")} + } + if(missing(n.perm)){ n.perm <- 1000} + if(missing(Vg.factor)){ Vg.factor <- 1} + + + ###calculates H2 based on REML (part of H2.broad) + H2.REML <- function(input,Vg.factor){ + if(missing(Vg.factor)){Vg.factor <- 1} + + ##test if lme4 package is installed + if(!require("lme4")){ + install.packages("lme4") + library("lme4") + } + + REML <- lmer(trait.value ~ 1 + (1|strain),data=input) + Variations <- as.data.frame(VarCorr(REML))$vcov + + H2 <- c((Variations[1]*Vg.factor)/sum(Variations*c(Vg.factor,1)),(Variations*c(Vg.factor,1))) + names(H2) <- c("H2_REML","Vg","Ve") + return(H2) + } + + ###Calculates H2 based on ANOVA (part of H2.broad) + H2.ANOVA <- function(input,Vg.factor){ + if(missing(Vg.factor)){Vg.factor <- 1} + + ANOVA <- anova(lm(trait.value~strain,data=input)) + Variations <- ANOVA[[2]] + + H2 <- c((Variations[1]*Vg.factor)/sum(Variations*c(Vg.factor,1)),(Variations*c(Vg.factor,1))) + names(H2) <- c("H2_ANOVA","Vg","Ve") + return(H2) + } + + ###Calculates H2 as described in Keurentjes, 2007, in case error has to be estimated on limited number of strains + ###Permutation is based on Brem, 2005 + H2.Keurentjes <- function(input,parental.strain,Vg.factor){ + if(missing(Vg.factor)){Vg.factor <- 1} + + selc.parental <- input$strain %in% parental.strain + + ###Ve is estimated based on parental replicates + VAR.PL <- tapply(input$trait.value[selc.parental],as.character(unlist(input$strain[selc.parental])),var) + N.PL <- tapply(input$trait.value[selc.parental],as.character(unlist(input$strain[selc.parental])),length) + VAR.E <- sum(VAR.PL*(N.PL-1))/sum((N.PL-1)) + + ##Vt is estimated based on RILs + VAR.T <- var(input$trait.value[!selc.parental]) + + H2 <- c(((VAR.T-VAR.E)*Vg.factor)/VAR.T,((VAR.T-VAR.E)*Vg.factor),VAR.E) + names(H2) <- c("H2_keurentjes","Vg","Ve") + return(H2) + } + + permfunct <- function(x,val){ + out <- NULL + for(j in 1:ncol(x)){ + out <- c(out,max(which(rev(sort(c(val[j],x[,j])) >=val[j])))/(length(x[,j])+1)) + } + return(out) + } + + ###Prep the data + input <- data.frame(cbind(trait.value=trait.value,strain=strain)) + input$trait.value <- as.numeric(as.character(unlist(input$trait.value))) + input$strain <- as.factor(as.character(unlist(input$strain))) + + input <- split(input,trait) + + if(method == "ANOVA"){ + H2 <- t(sapply(input,H2.ANOVA,Vg.factor)) + + perm <- NULL + for(i in 1:n.perm){ + input.perm <- lapply(input,function(x){x[,1] <- x[order(runif(nrow(x))),1];return(x)}) + H2.perm <- t(sapply(input.perm,H2.ANOVA,Vg.factor)) + perm <- rbind(perm,H2.perm[,1]) + } + + H2 <- cbind(H2,FDR=permfunct(perm,H2[,1])) + } + if(method == "REML"){ + H2 <- t(sapply(input,H2.REML,Vg.factor)) + + perm <- NULL + for(i in 1:n.perm){ + input.perm <- lapply(input,function(x){x[,1] <- x[order(runif(nrow(x))),1];return(x)}) + H2.perm <- t(sapply(input.perm,H2.REML,Vg.factor)) + perm <- rbind(perm,H2.perm[,1]) + } + + + H2 <- cbind(H2,FDR=permfunct(perm,H2[,1])) + } + if(method == "Keurentjes_2007"){ + H2 <- t(sapply(input,H2.Keurentjes,parental.strain = parental.strain,Vg.factor)) + + perm <- NULL + for(i in 1:n.perm){ + input.perm <- lapply(input,function(x){x[,1] <- x[order(runif(nrow(x))),1];return(x)}) + H2.perm <- t(sapply(input.perm,H2.Keurentjes,parental.strain = parental.strain,Vg.factor)) + perm <- rbind(perm,H2.perm[,1]) + } + + H2 <- cbind(H2,FDR=permfunct(perm,H2[,1])) + } + + output <- as.data.frame(cbind(trait=names(input),H2)) + output[,1] <- as.character(unlist(output[,1])) + output[,2] <- round(as.numeric(as.character(unlist(output[,2]))),digits=3) + output[,3] <- round(as.numeric(as.character(unlist(output[,3]))),digits=3) + output[,4] <- round(as.numeric(as.character(unlist(output[,4]))),digits=3) + output[,5] <- as.numeric(as.character(unlist(output[,5]))) + + return(output) + } + diff --git a/Function_scripts/NEMA.get.functions.R b/Function_scripts/NEMA.get.functions.R new file mode 100644 index 0000000..233d2f6 --- /dev/null +++ b/Function_scripts/NEMA.get.functions.R @@ -0,0 +1,54 @@ +###Writes all custom NEMA_functions scripts to a specified folder + +###input +#file_script (script of which you want to get the NEMA_functions from) +#NEMA_functions_dir (location of the nema functions) +#write_dir (location where to write to, standard is current wd to a folder called "Function_scripts") + +###output +#a folder with functions + +###Description + +NEMA.get.functions <- function(file_script,NEMA_functions_dir,write_dir="Function_scripts"){ + if(missing(file_script)){ stop("requires a source script")} + if(missing(NEMA_functions_dir)){ NEMA_functions_dir <- "E:/Nemawork/Projects_R_zone/Git/NEMA_functions"} + + ###get write_dir + dir.create("Function_scripts", showWarnings = F) + write_dir <- paste(getwd(),write_dir,sep="/") + + ####compile a list of the nema functions as currently exist + functionlist <- dir(paste(NEMA_functions_dir,"/R/",sep="")) + ###remove the .R + functionlist <- gsub(".R$","",functionlist) + + ###load source script + sourcescript <- read.table(file = file_script,sep="\n",comment.char="",blank.lines.skip = FALSE,quote="") + + ###search in source script + output <- NULL + for(i in 1:length(functionlist)){ + output <- c(output,grepl(functionlist[i],sourcescript)) + } + + ###Go through five times for nested functions + ###load source script + for(k in 1:5){ + for(j in which(output)){ + sourcescript <- read.table(file = paste(NEMA_functions_dir,"/R/",functionlist[j],".R",sep=""),sep="\n",comment.char="",blank.lines.skip = FALSE,quote="") + + ###search in source script + for(i in 1:length(functionlist)){ + if(grepl(functionlist[i],sourcescript)){ + output[i] <- TRUE + } + } + } + } + + ###copy functions to folder + for(i in which(output)){ + file.copy(from = paste(NEMA_functions_dir,"/R/",functionlist[i],".R",sep=""), to = write_dir) + } +} diff --git a/Function_scripts/QTL.data.prep.R b/Function_scripts/QTL.data.prep.R new file mode 100644 index 0000000..8e6b570 --- /dev/null +++ b/Function_scripts/QTL.data.prep.R @@ -0,0 +1,73 @@ +###Data preparation for QTL mapping; start function for most of subsequent functions + +###input +#trait.matrix (matrix; traits per row and strains per column) +#strain.trait (vector with strain names per trait. Should match strain.map) +#strain.map (markers in rows, genotypes in columns) +#strain.marker (markers in rows, with columns name, chromosome and position) + +###output +#a list witht the entries Trait, Map, and Marker. Where the Traits are aligned with the map + +###Description +#Arranges the trait and the map data so they are aligned and runs some basic checks on the input data +#removes the non-present strains it warns about + +###See also +#QTL.map.1 + + + +###Data preparation + QTL.data.prep <- function(trait.matrix,strain.trait,strain.map,strain.marker){ + if(missing(trait.matrix)){ stop("requires trait data, traits per row and strains per column")} + if(missing(strain.trait)){ stop("requires the strain names matching the trait data")} + if(missing(strain.map)){ stop("requires the genetic map of the population")} + if(missing(strain.marker)){ stop("requires the marker info (name, chromosome, basepair) of the map")} + if(ncol(trait.matrix) != length(strain.trait)){ stop("the number of strains does not correspond with the trait matrix")} + if(sum(tolower(strain.trait) %in% tolower(colnames(strain.map))) < length(strain.trait)){ + warning(paste("The strains: ",paste(strain.trait[!tolower(strain.trait) %in% tolower(colnames(strain.map))],collapse=", ")," are not in the strain.map file")) + } + if(nrow(strain.map) != nrow(strain.marker)){ stop("the number of markers (strain.map) does not equal the number of marker descriptions (strain.marker)")} + if(ncol(strain.marker) != 3){ stop("make sure the strain.marker file contains a column with the marker name, chromosome, and bp location")} + if(nrow(trait.matrix)==1){ small <- TRUE} + if(nrow(trait.matrix)!=1){ small <- FALSE} + + strain.map <- data.matrix(strain.map) + + ###remove strains not present + if(small){ + rowna <- rownames(trait.matrix) + trait.matrix <- trait.matrix[1,which(tolower(strain.trait) %in% tolower(colnames(strain.map)))] + colna <- names(trait.matrix) + trait.matrix <- t(matrix(trait.matrix)) + colnames(trait.matrix) <- colna + rownames(trait.matrix) <- rowna + } else { + trait.matrix <- trait.matrix[,which(tolower(strain.trait) %in% tolower(colnames(strain.map)))] + strain.trait <- strain.trait[tolower(strain.trait) %in% tolower(colnames(strain.map))] + } + + ###Make matrix + map.nu <- matrix(NA,nrow=nrow(strain.map),ncol=length(strain.trait)) + for(j in 1:length(strain.trait)){ + map.nu[,j] <- strain.map[,tolower(colnames(strain.map)) == tolower(strain.trait[j])] + } + colnames(map.nu) <- strain.trait + rownames(map.nu) <- rownames(strain.map) + + ###add rownames to the trait matrix + if(length(rownames(trait.matrix)) == 0){ + rownames(trait.matrix) <- 1:nrow(trait.matrix) + } + + ###Make output + output <- NULL + output[[1]] <- trait.matrix + output[[2]] <- map.nu + output[[3]] <- data.frame(strain.marker) + names(output) <- c("Trait","Map","Marker") + + ###return output + return(output) + } \ No newline at end of file diff --git a/Function_scripts/QTL.eQTL.call.TBs.R b/Function_scripts/QTL.eQTL.call.TBs.R new file mode 100644 index 0000000..e64b308 --- /dev/null +++ b/Function_scripts/QTL.eQTL.call.TBs.R @@ -0,0 +1,60 @@ +###Calls transbands based on a poisson distribution +#Brem, Rachel B., and Leonid Kruglyak. "The landscape of genetic complexity across 5,700 gene expression traits in yeast." Proceedings of the National Academy of Sciences 102.5 (2005): 1572-1577. + +###input +#eQTL.table.file (output from QTL.eQTL.table) +#window (interval size in which QTL are counted) +#chromosomes (names of the chromosomes; standard is C. elegans) +#chromosome_size (sizes of the chromosomes; standard is C. elegans) + +###output +#a dataframe with sifnificances for cis- and trans-eQTL enrichments + +###Description + + + +QTL.eQTL.call.TBs <- function(eQTL.table.file,window,chromosomes,chromosome_size){ + if(missing(eQTL.table.file)){ stop("need a eQTL.table.file")} + if(missing(window)){ window <- 1.0e6; warning("set default window-size, might not be optimal")} + if(missing(chromosomes)){ chromosomes <- c("I","II","III","IV","V","X")} + if(missing(chromosome_size)){ chromosome_size <- c(15072434,15279421,13783801,17493829,20924180,17718942)} + if(length(chromosomes) != length(chromosome_size)){ stop("chromosome names and sizes should have the same length")} + + maxsize <- max(chromosome_size+window) + chrnum <- length(chromosomes) + + ###call locations with many eQTL + transband.id <- dplyr::mutate(eQTL.table.file,Interval=findInterval(qtl_bp,seq(1,maxsize,by=window))) %>% + dplyr::filter(!is.na(qtl_type)) %>% + dplyr::group_by(qtl_chromosome,Interval,qtl_type) %>% + dplyr::summarise(n.ct=length(unique(trait))) %>% + data.frame() %>% + dplyr::group_by(qtl_type) %>% + dplyr::mutate(exp.ct=mean(as.numeric(unlist(n.ct)))) %>% + data.frame() %>% + dplyr::mutate(transband_significance=ppois(n.ct,lambda=exp.ct,lower.tail=F)) + + ###Add locations that have no cis/trans eQTL + transband.id <- rbind(transband.id,data.frame(cbind(qtl_chromosome=rep(chromosomes,each=ceiling(maxsize/window)*2),Interval=rep(1:ceiling(maxsize/window),times=chrnum*2), + qtl_type=rep(rep(c("cis","trans"),each=ceiling(maxsize/window)),times=chrnum), + n.ct=0, + exp.ct=rep(rep(c(filter(transband.id,!duplicated(exp.ct),qtl_type=="cis")$exp.ct,filter(transband.id,!duplicated(exp.ct),qtl_type=="trans")$exp.ct),each=ceiling(maxsize/window)),times=chrnum), + transband_significance=1))) %>% + dplyr::mutate(tester=paste(qtl_chromosome,Interval,qtl_type)) %>% + dplyr::filter(!duplicated(tester)) %>% + dplyr::mutate(Interval=as.numeric(as.character(unlist(Interval))),transband_significance=as.numeric(transband_significance)) %>% + dplyr::mutate(Interval_left=(Interval-1)*window,Interval_right=1+Interval*window) %>% + dplyr::arrange(qtl_type,qtl_chromosome,Interval) + + ###resize to chromosome sizes + output <- NULL + for(i in 1:chrnum){ + output <- rbind(output,filter(transband.id,qtl_chromosome==chromosomes[i],Interval_right<(chromosome_size[i]+window))) + } + for(i in c(2,4,5,6,8,9)){ + output[,i] <- as.numeric(as.character(unlist(output[,i]))) + } + + return(output) + } \ No newline at end of file diff --git a/Function_scripts/QTL.eQTL.table.R b/Function_scripts/QTL.eQTL.table.R new file mode 100644 index 0000000..74c4f8c --- /dev/null +++ b/Function_scripts/QTL.eQTL.table.R @@ -0,0 +1,53 @@ +###function for making an eQTL table + +###input +#QTL.peak.dataframe (output from QTL.peak.finder function) +#trait.annotation (dataframe with columns: trait, gene_chromosome, gene_bp, gene_database_ID, gene_name_1, gene_name_2) +#cis.trans.window (how cis and trans eQTL are distinguised: physical, LOD.drop, or both) +#cis.trans.window.size (physical window size, standard 1000000) +#colnames.trait.annotations (names for the trait annotations added) + + +###output +#a dataframe listing the eQTL per trait. + +###Description +# Makes a table with eQTL peaks v1.1 +# QTL.peak.dataframe: output from peak.finder +# trait.annotation: gene ID's of the mapped traits +# cis.trans.window: Cis-trans windowd can be defined based on physical distance "physical", based on LOD-drop interval "LOD.drop", or on both "both" + +QTL.eQTL.table <- function(QTL.peak.dataframe, trait.annotation,cis.trans.window,cis.trans.window.size,colnames.trait.annotations){ + if(missing(QTL.peak.dataframe)){ stop("need an eQTL list file")} + if(missing(trait.annotation)){ stop("need annotation file with order: trait, gene_chromosome, gene_bp, gene_WBID, gene_sequence_name, gene_public_name")} + if(missing(cis.trans.window)){ cis.trans.window <- "physical"} + if(!cis.trans.window %in% c("physical","LOD.drop","both")){ stop("cis.trans.window should be in physical, LOD.drop, or both")} + if(missing(cis.trans.window.size)){ cis.trans.window.size <- 1000000} + if(missing(colnames.trait.annotations)){ colnames.trait.annotations <- c("trait","gene_chromosome","gene_bp","gene_WBID","gene_sequence_name","gene_public_name")} + + ###Rename colnames traits + colnames(trait.annotation) <- colnames.trait.annotations + + ###Take only the peaks + if(cis.trans.window =="physical"){ + output <- dplyr::filter(QTL.peak.dataframe,!is.na(qtl_peak)) %>% + merge(trait.annotation,by.x=1,by.y=1) %>% + dplyr::mutate(qtl_type = ifelse((qtl_chromosome==gene_chromosome & abs(qtl_bp-gene_bp) < cis.trans.window.size),"cis","trans")) + } + + if(cis.trans.window =="LOD.drop"){ + output <- dplyr::filter(QTL.peak.dataframe,!is.na(qtl_peak)) %>% + merge(trait.annotation,by.x=1,by.y=1) %>% + dplyr::mutate(qtl_type = ifelse((qtl_chromosome==gene_chromosome & (gene_bp > qtl_bp_left & gene_bp < qtl_bp_right)),"cis","trans")) + } + + if(cis.trans.window =="both"){ + output <- dplyr::filter(QTL.peak.dataframe,!is.na(qtl_peak)) %>% + merge(trait.annotation,by.x=1,by.y=1) %>% + dplyr::mutate(qtl_type = ifelse((qtl_chromosome==gene_chromosome & abs(qtl_bp-gene_bp) < cis.trans.window.size) | + ((qtl_chromosome==gene_chromosome & abs(qtl_bp-gene_bp) < cis.trans.window.size) & (gene_bp > qtl_bp_left & gene_bp < qtl_bp_right)),"cis","trans")) + } + + ###Return file + return(output) + } diff --git a/Function_scripts/QTL.eQTL.table.addTB.R b/Function_scripts/QTL.eQTL.table.addTB.R new file mode 100644 index 0000000..5b08e98 --- /dev/null +++ b/Function_scripts/QTL.eQTL.table.addTB.R @@ -0,0 +1,77 @@ +###Adds whether a QTL is located in a trans-band (or cis-enriched area) + +###input +#eQTL.table.file (output from QTL.eQTL.table) +#eQTL.call.TBs.file (output from QTL.eQTL.call.TBs) +#transband_thr (statistical threshold for calling a trans-band) +#qtl_type_select (type of qtl to call enrichment for) +#merge_condition (how adjacent chromosomal intervals should be for merging trans-bands) +#digits_Mb (amount of digits behind the Mb should appear in TB-name) + +###output +#the eQTL.table.file with a trans_band column added + +###Description + +###version +#Fixed issue with merging transbands + +QTL.eQTL.table.addTB <- function(eQTL.table.file,call.transbands.file,transband_thr,qtl_type_select,merge_condition,digits_Mb){ + if(missing(eQTL.table.file)){ stop("need a eQTL.table.file")} + if(missing(call.transbands.file)){ stop("need a call.transbands.file")} + if(missing(transband_thr)){ transband_thr <- 0.0001; warning("set default significance, might not be optimal")} + if(missing(qtl_type_select)){ qtl_type_select <- "trans"} + if(missing(merge_condition)){ merge_condition <- 2} + if(missing(digits_Mb)){ digits_Mb <- 1} + + transbands <- dplyr::filter(call.transbands.file,transband_significance < transband_thr,qtl_type==qtl_type_select) %>% + dplyr::group_by(qtl_chromosome) %>% + dplyr::mutate(mergeornot=c(0,diff(Interval))) %>% + data.frame() + + transbands.merged <- NULL + for(i in 1:length(unique(transbands$qtl_chromosome))){ + tmp <- dplyr::filter(transbands,qtl_chromosome == unique(transbands$qtl_chromosome)[i]) + tmp2 <- NULL + if(nrow(tmp) > 1){ + for(j in 2:nrow(tmp)){ + if(tmp$mergeornot[j] <= merge_condition){ + tmp2 <- data.frame(rbind(tmp2,cbind(qtl_chromosome=tmp[j,1], qtl_type=tmp[j,3], n.ct=sum(tmp[c((j-1),j),4]), exp.ct=sum(tmp[c((j-1),j),5]), + transband_significance=max(tmp[j,6]), Interval_left=tmp[j-1,8], Interval_right=tmp[j,9]))) + for(k in c(3,4,5,6,7)){tmp2[,k] <- as.numeric(as.character(unlist(tmp2[,k])))} + } else { + if(j==2){ + tmp3 <- tmp[1:2,-c(2,7,10)] + for(k in c(3,4,5,6,7)){tmp3[,k] <- as.numeric(as.character(unlist(tmp3[,k])))} + tmp2 <- rbind(tmp2,tmp3) + for(k in c(3,4,5,6,7)){tmp2[,k] <- as.numeric(as.character(unlist(tmp2[,k])))} + } else { + tmp3 <- tmp[j,-c(2,7,10)] + for(k in c(3,4,5,6,7)){tmp3[,k] <- as.numeric(as.character(unlist(tmp3[,k])))} + tmp2 <- rbind(tmp2,tmp3) + for(k in c(3,4,5,6,7)){tmp2[,k] <- as.numeric(as.character(unlist(tmp2[,k])))} + } + } + } + } else { + tmp2 <- tmp[,-c(2,7,10)] + for(k in c(3,4,5,6,7)){tmp2[,k] <- as.numeric(as.character(unlist(tmp2[,k])))} + } + transbands.merged <- data.frame(rbind(transbands.merged,tmp2)) + for(k in c(3,4,5,6,7)){transbands.merged[,k] <- as.numeric(as.character(unlist(transbands.merged[,k])))} + } + for(k in c(1,2)){transbands.merged[,k] <- as.character(unlist(transbands.merged[,k]))} + transbands.merged <- mutate(transbands.merged,Name=paste(qtl_chromosome,":",round(Interval_left/1e6,digits=digits_Mb),"-",round(Interval_right/1e6,digits=digits_Mb),"Mb",sep="")) + + output <- dplyr::mutate(eQTL.table.file,trans_band = "none") + + for(i in 1:nrow(transbands.merged)){ + output[output$qtl_type == transbands.merged$qtl_type[i] & + output$qtl_chromosome == transbands.merged$qtl_chromosome[i] & + output$qtl_bp > transbands.merged$Interval_left[i] & + output$qtl_bp <= transbands.merged$Interval_right[i] & + !is.na(output$qtl_type) & !is.na(output$qtl_chromosome) & !is.na(output$qtl_bp),]$trans_band <- transbands.merged$Name[i] + } + + return(output) + } diff --git a/Function_scripts/QTL.map.1.FDR.R b/Function_scripts/QTL.map.1.FDR.R new file mode 100644 index 0000000..c587f75 --- /dev/null +++ b/Function_scripts/QTL.map.1.FDR.R @@ -0,0 +1,92 @@ +###FDR calculation function for single marker mapping +#Snoek & Sterken, 2017; Sterken, 2017 + +###input +#map1.output (output from the QTL.map.1 function) +#filenames.perm (files containing permutated data) +#FDR_dir (location of the files with permutated data) +#q.value (desired q-value) +#small (logic; set to TRUE for a small set) + +###output +#a list with names: LOD, Effect, Trait_perm, Map, and Marker. + +###Description +# Based on Benjamini and Yakuteili multiple testing under dependency + + + +QTL.map.1.FDR <- function(map1.output,filenames.perm,FDR_dir,q.value,small){ + if(missing(map1.output)){ stop("Missing map1.output ([[LOD]],[[Effect]],[[Trait]],[[Map]],[[Marker]])")} + if(missing(filenames.perm)){ stop("Need a vector with filenames for permutation")} + if(missing(FDR_dir)){ FDR_dir <- getwd()} + if(missing(q.value)){ q.value <- 0.025} + if(missing(small)){ small <- FALSE} + + output <- as.list(NULL) + + output[[2]] <- matrix(0,ncol=length(filenames.perm),nrow=5000) + rownames(output[[2]]) <- seq(0.1,500,by=0.1) + colnames(output[[2]]) <- filenames.perm + + ###in case of one trait + if(small){ + if(length(filenames.perm) > 1){ + stop("needs one permutatation file, with n permutations") + } + tmp <- load(file=paste(FDR_dir,filenames.perm,sep="")) + ###No NA's + tmp <- get(tmp)[[1]] + tmp[is.na(tmp)] <- 0.1 + + ###no Non-traits + tmp <- tmp[rownames(tmp) != "",] + output[[2]] <- apply(tmp,1,max,na.rm=T) + + if((length(output[[2]]) * q.value) < 10){ + warning(paste("inadvisable to calculate q=",q.value,"based on ",length(output[[2]])," permutations. Increase to at least ", ceiling(10/q.value))) + } + + ###take the q-cutoff, *10 is to be consistent with eQTL methodology (is per 0.1) + output[[1]] <- rev(sort(output[[2]]))[ceiling(q.value*length(output[[2]]))]*10 + + ###again, to be consistent; + output[[3]] <- mean(output[[2]],na.rm = TRUE) + + ###real data + output[[4]] <- max(map1.output$LOD,na.rm=TRUE) + + names(output) <- c("Significance_threshold","False_discoveries","False_discoveries_average","Real_discoveries") + } + if(!small){ + for(i in 1:length(filenames.perm)){ + tmp <- load(file=paste(FDR_dir,filenames.perm[i],sep="")) + ###No NA's + tmp <- get(tmp)[[1]] + tmp[is.na(tmp)] <- 0.1 + + tmp <- table(apply(round(tmp,digits=1),1,max,na.rm=T)) + + output[[2]][(as.numeric(names(tmp))*10),i] <- tmp + output[[2]][,i] <- rev(cumsum(rev(output[[2]][,i]))) + } + ###Take the average over permutations + output[[3]] <- apply(output[[2]],1,mean) + + ###Calculate the real discoveries, no NA's + tmp <- map1.output$LOD + tmp[is.na(tmp)] <- 0.1 + + RDS.tmp <- table(round(apply(tmp,1,max,na.rm=T),digits=1)) + RDS <- rep(0,times=5000); RDS[(as.numeric(names(RDS.tmp))*10)] <- RDS.tmp + RDS <- rev(cumsum(rev(RDS))) + + output[[4]] <- RDS + + output[[1]] <- which(round(((dim(map1.output$LOD)[1]-RDS)/dim(map1.output$LOD)[1])*q.value*log10(dim(map1.output$LOD)[1]),digits=3)-round(output[[3]]/output[[4]],digits=3)>0)[1] + names(output) <- c("Significance_threshold","False_discoveries","False_discoveries_average","Real_discoveries") + } + + ###Return + return(output) + } diff --git a/Function_scripts/QTL.map.1.R b/Function_scripts/QTL.map.1.R new file mode 100644 index 0000000..958a1b9 --- /dev/null +++ b/Function_scripts/QTL.map.1.R @@ -0,0 +1,87 @@ +###function for single marker mapping +#Snoek & Sterken, 2017; Sterken, 2017 + +###input +#trait.matrix (traits in rows, genotypes in columns) +#strain.map (markers in rows, genotypes in columns) +#strain.marker (markers in rows, with columns name, chromosome and position) + +###output +#a list with names: LOD, Effect, Trait, Map, and Marker. + +###Description +# map.1 is optimized for dataset in which many markers are un-informative +# these markers are not mapped but the p-vals and effect are copied from the previous marker. + +QTL.map.1 <- function(trait.matrix,strain.map,strain.marker){ + if(missing(trait.matrix)){ stop("specify trait.matrix, matrix with traits in rows and genotypes in columns")} + if(missing(strain.map)){ stop("specify strain.map (genetic map)")} + if(ncol(strain.map) != dim(as.matrix(trait.matrix))[2] & + ncol(strain.map) != prod(dim(as.matrix(trait.matrix)))){ stop("number of strains is not equal to number of genotypes in map")} + if(missing(strain.marker)){ strain.marker <- data.frame(cbind(name=1:dim(strain.map)[1],chromosome=NA,bp=1:dim(strain.map)[1])) + rownames(strain.marker) <- 1:dim(strain.map)[1]} + + ###Check for NAs + NAs <- sum(is.na(trait.matrix))>0 | sum(is.na(strain.map))>0 + + small <- FALSE + if(dim(as.matrix(trait.matrix))[1] ==1){ + traitnames <- rownames(trait.matrix) + trait.matrix <- rbind(trait.matrix,trait.matrix) + rownames(trait.matrix) <- c("A","B") + small <- TRUE + } + + eff.out <- matrix(NA,nrow(trait.matrix),nrow(strain.map)) + pval.out <- matrix(NA,nrow(trait.matrix),nrow(strain.map)) + + for(i in 1:nrow(strain.map)){ + noseg <- length(unique(strain.map[i,])) ==1 + if(noseg){ + output.tmp <- matrix(c(0,0),byrow=TRUE,ncol=2,nrow=nrow(trait.matrix)) + } + if( i == 1 & !noseg){ + if(!NAs){output.tmp <- fast.lm.1(trait.matrix,strain.map[i,])} + if(NAs){output.tmp <- lm.1(trait.matrix,strain.map[i,])} + } + if( i != 1 & sum(abs(strain.map[i-1,]-strain.map[i,]),na.rm=T) != 0 & !noseg){ + if(!NAs){output.tmp <- fast.lm.1(trait.matrix,strain.map[i,])} + if(NAs){output.tmp <- lm.1(trait.matrix,strain.map[i,])} + } + if( i != 1 & sum(abs(strain.map[i-1,]-strain.map[i,]),na.rm=T) == 0 & !noseg){ + output.tmp <- output.tmp + } + eff.out[,i] <- output.tmp[,2] + pval.out[,i] <- output.tmp[,1] + } + if(!small){ + colnames(eff.out) <- rownames(strain.marker) + rownames(eff.out) <- rownames(trait.matrix) + colnames(pval.out) <- rownames(strain.marker) + rownames(pval.out) <- rownames(trait.matrix) + } + if(small){ + eff.out <- matrix(eff.out[1,],nrow=1) + rownames(eff.out) <- traitnames + colnames(eff.out) <- rownames(strain.map) + pval.out <- matrix(pval.out[1,],nrow=1) + rownames(pval.out) <- traitnames + colnames(pval.out) <- rownames(strain.map) + trait.matrix <- matrix(trait.matrix[1,],nrow=1) + rownames(trait.matrix) <- traitnames + colnames(trait.matrix) <- colnames(strain.map) + } + + output <- NULL; output <- as.list(output) + output[[1]] <- round(pval.out,digits=2) + output[[2]] <- round(eff.out,digits=3) + output[[3]] <- trait.matrix + output[[4]] <- strain.map + output[[5]] <- strain.marker + names(output) <- c("LOD","Effect","Trait","Map","Marker") + return(output) + } + + + + diff --git a/Function_scripts/QTL.map.1.perm.R b/Function_scripts/QTL.map.1.perm.R new file mode 100644 index 0000000..5226111 --- /dev/null +++ b/Function_scripts/QTL.map.1.perm.R @@ -0,0 +1,85 @@ +###Permutation function for single marker mapping +#Snoek & Sterken, 2017; Sterken, 2017 + +###input +#trait.matrix (traits in rows, genotypes in columns) +#strain.map (markers in rows, genotypes in columns) +#strain.marker (markers in rows, with columns name, chromosome and position) +#n.perm (the number of times each trait should be permutated) + +###output +#a list with names: LOD, Effect, Trait_perm, Map, and Marker. + +###Description +# map.1 is optimized for dataset in which many markers are un-informative +# these markers are not mapped but the p-vals and effect are copied from the previous marker. + +QTL.map.1.perm <- function(trait.matrix,strain.map,strain.marker,n.perm){ + if(missing(trait.matrix)){ stop("specify trait.matrix, matrix with traits in rows and genotypes in columns")} + if(missing(strain.map)){ stop("specify strain.map (genetic map)")} + if(ncol(strain.map) != dim(as.matrix(trait.matrix))[2] & + ncol(strain.map) != prod(dim(as.matrix(trait.matrix)))){ stop("number of strains is not equal to number of genotypes in map")} + if(missing(strain.marker)){ strain.marker <- data.frame(cbind(name=1:dim(strain.map)[1],chromosome=NA,bp=1:dim(strain.map)[1])) + rownames(strain.marker) <- 1:dim(strain.map)[1]} + if(missing(n.perm)){ n.perm <- 1000} + if(n.perm == 1 & dim(as.matrix(trait.matrix))[2] ==1){ stop("cannot conduct 1 permutation on 1 trait, set n.perm > 1")} + + + NAs <- sum(is.na(trait.matrix))>0 | sum(is.na(strain.map))>0 + + small <- FALSE + if(dim(as.matrix(trait.matrix))[1] ==1){ + traitnames <- rownames(trait.matrix) + trait.matrix <- rbind(trait.matrix,trait.matrix) + rownames(trait.matrix) <- c(traitnames,"B") + small <- TRUE + } + + traitnames <- rownames(trait.matrix) + trait.matrix <- trait.matrix[rep(1:nrow(trait.matrix),each=n.perm),] + trait.matrix <- permutate.traits(trait.matrix) + rownames(trait.matrix) <- paste(rep(traitnames,each=n.perm),1:n.perm,sep="_") + + if(small){ + trait.matrix <- trait.matrix[1:n.perm,] + } + + eff.out <- matrix(NA,nrow(trait.matrix),nrow(strain.map)) + pval.out <- matrix(NA,nrow(trait.matrix),nrow(strain.map)) + + for(i in 1:nrow(strain.map)){ + noseg <- length(unique(strain.map[i,])) ==1 + if(noseg){ + output.tmp <- matrix(c(0,0),byrow=TRUE,ncol=2,nrow=nrow(trait.matrix)) + } + if( i == 1 & !noseg){ + if(!NAs){output.tmp <- fast.lm.1(trait.matrix,strain.map[i,])} + if(NAs){output.tmp <- lm.1(trait.matrix,strain.map[i,])} + } + if( i != 1 & sum(abs(strain.map[i-1,]-strain.map[i,]),na.rm=T) != 0 & !noseg){ + if(!NAs){output.tmp <- fast.lm.1(trait.matrix,strain.map[i,])} + if(NAs){output.tmp <- lm.1(trait.matrix,strain.map[i,])} + } + if( i != 1 & sum(abs(strain.map[i-1,]-strain.map[i,]),na.rm=T) == 0 & !noseg){ + output.tmp <- output.tmp + } + eff.out[,i] <- output.tmp[,2] + pval.out[,i] <- output.tmp[,1] + } + + colnames(eff.out) <- rownames(strain.marker) + rownames(eff.out) <- rownames(trait.matrix) + colnames(pval.out) <- rownames(strain.marker) + rownames(pval.out) <- rownames(trait.matrix) + + + output <- NULL; output <- as.list(output) + output[[1]] <- round(pval.out,digits=2) + output[[2]] <- round(eff.out,digits=3) + output[[3]] <- trait.matrix + output[[4]] <- strain.map + output[[5]] <- strain.marker + names(output) <- c("LOD","Effect","Trait_perm","Map","Marker") + + return(output) + } diff --git a/Function_scripts/QTL.map.1.sim.R b/Function_scripts/QTL.map.1.sim.R new file mode 100644 index 0000000..700be0d --- /dev/null +++ b/Function_scripts/QTL.map.1.sim.R @@ -0,0 +1,124 @@ +###Conduct mapping simulation to investigate power +#Sterken, 2017 + +###input +#strain.map (markers in rows, genotypes in columns) +#strain.marker (markers in rows, with columns name, chromosome and position) +#n_per_marker (number of simulated QTL per marker) +#sigma_peak (standard deviation of QTL peak; standard normal) +#sigma_error (standard deviation of noise; standard normal) + +###output +#a dataframe with per rows the outcomes of the simulated peak-sizes. The number of true, false, and undetected +# QTL; quantiles of the QTL effect-size estimation; quantiles of the QTL location estimation + +###Description +# Noise based on standard normal +# Fractional genotypes are transformed to integer numbers using round for simulating QTL +# Heterozygousity may be simulated using '0' or NA; these are ignored in assigning QTL effects but instead are given +# an effect based on a uniform distribution. + +###Updates +# QTL detection was very strict (peak at location of peak, changed that to a peak within a percentage of the genetic map (2.5%)) + + +QTL.map.1.sim <- function(strain.map,strain.marker,n_per_marker,sigma_peak,sigma_error,threshold,peak_within_perc){ + if(missing(strain.map)){ stop("specify strain.map (genetic map)")} + if(missing(strain.marker)){ strain.marker <- data.frame(cbind(name=1:dim(strain.map)[1],chromosome=NA,bp=1:dim(strain.map)[1])) + rownames(strain.marker) <- 1:dim(strain.map)[1]} + if(missing(n_per_marker)){ n_per_marker <-10} + if(missing(sigma_peak)){ sigma_peak <- c(1,1.15,1.3,1.47,1.64,1.82,2,2.2,2.47,2.73,3.05,3.45,4)} + if(missing(sigma_error)){ sigma_error <- 1} + if(missing(threshold)){ threshold <- c(3.5)} + if(missing(peak_within_perc)){ peak_within_perc = 0.0125} + ###1 sigma difference on 1 sigma noise is 20% of variance + ###2 sigma difference on 1 sigma noise is 50% of variance + ###3 sigma difference on 1 sigma noise is 69% of variance + ###4 sigma difference on 1 sigma noise is 80% of variance; summary(lm(c(rnorm(10000,0,1),rnorm(10000,4,1))~rep(c(-1,1),each=10000))) + ###cbind(var_expl=seq(5,95,by=5),sigma=c(0.46,0.67,0.84,1,1.15,1.3,1.47,1.64,1.82,2,2.2,2.47,2.73,3.05,3.45,4,4.75,6.0,8.75)) + + which.max.rev <- function(x){length(x)-which.max(rev(x))+1} + + out <- NULL + for(i in sigma_peak){ + ###Generate peaked data with random error + sim.map.file <- QTL.sim.data.per.marker(strain.map,strain.marker,n_per_marker,sigma_peak=i,sigma_error) + sim.emp.file <- QTL.sim.data.per.marker(strain.map,strain.marker,n_per_marker,sigma_peak=0,sigma_error) + + ###Map it + sim.map.file <- QTL.map.1(sim.map.file[[1]],sim.map.file[[2]],sim.map.file[[3]]) + + ###locate peak + loc_peak1 <- as.numeric(as.character(unlist(apply(sim.map.file$LOD,1,which.max)))) + loc_peak2 <- as.numeric(as.character(unlist(apply(sim.map.file$LOD,1,which.max.rev)))) + loc_peak <- floor(apply(cbind(loc_peak1,loc_peak2),1,median)) + + ###effects and lod + lod_peak <- as.numeric(t(sim.map.file$LOD))[loc_peak+seq(0,((nrow(sim.map.file$Map)^2)*n_per_marker)-nrow(sim.map.file$Map),by=nrow(sim.map.file$Map))] + eff_peak <- as.numeric(t(sim.map.file$Effect))[loc_peak+seq(0,((nrow(sim.map.file$Map)^2)*n_per_marker)-nrow(sim.map.file$Map),by=nrow(sim.map.file$Map))] + + ###effects and location of simulated peak + loc_sim <- rep(1:nrow(sim.map.file$Map),each=n_per_marker) + + ###peak is true if: above threshold and within % of total genetic map + ###peak is false if: above threshold and outside % of total genetic map + ###peak is not detected if: no significant peak + out.sim <- NULL + out.sim <- c(out.sim,detected_true=sum(lod_peak>=threshold & abs(loc_peak-loc_sim) < floor(nrow(sim.map.file$Map)*peak_within_perc))) + out.sim <- c(out.sim,detected_false=sum(lod_peak>=threshold & abs(loc_peak-loc_sim) >= floor(nrow(sim.map.file$Map)*peak_within_perc))) + out.sim <- c(out.sim,detected_not = sum(lod_peak<threshold)) + selc <- lod_peak>=threshold & abs(loc_peak-loc_sim) < floor(nrow(sim.map.file$Map)*peak_within_perc) + out.sim <- c(out.sim,detected_true_eff_est=quantile(abs(eff_peak[selc])/(i/2),probs=c(0.05,0.25,0.5,0.75,0.95))) + out.sim <- c(out.sim,detected_true_loc_est=quantile(abs(as.numeric(as.character(unlist(sim.map.file$Marker[loc_sim,3])))-as.numeric(as.character(unlist(sim.map.file$Marker[loc_peak+ceiling((loc_peak2-loc_peak)/2),3]))))[selc],probs=c(0.05,0.25,0.5,0.75,0.95))) + + ###Map False file + sim.map.file <- QTL.map.1(sim.emp.file[[1]],sim.emp.file[[2]],sim.emp.file[[3]]) + + loc_peak <- as.numeric(as.character(unlist(apply(sim.map.file$LOD,1,which.max)))) + lod_peak <- as.numeric(as.character(unlist(apply(sim.map.file$LOD,1,max,na.rm=TRUE)))) + eff_peak <- t(sim.map.file$Effect)[(ncol(sim.map.file$Effect)*(0:(nrow(sim.map.file$Effect)-1)))+loc_peak] + + loc_sim <- rep(1:nrow(sim.map.file$Map),each=n_per_marker) + + out.emp <- NULL + out.emp <- c(out.emp,detected_true=0) + out.emp <- c(out.emp,detected_false=sum(lod_peak >= threshold)) + out.emp <- c(out.emp,detected_not = sum(lod_peak < threshold)) + + out.emp <- c(out.emp,detected_true_eff_est=quantile(rep(NA,times=100),na.rm=T,probs=c(0.05,0.25,0.5,0.75,0.95))) + out.emp <- c(out.emp,detected_true_loc_est=quantile(rep(NA,times=100),na.rm=T,probs=c(0.05,0.25,0.5,0.75,0.95))) + ###Combine and summarize + out.sim <- data.frame(cbind(name=c(paste("peak_sim_",i,sep=""),paste("noise_sim_",sigma_error,sep="")), + var_explained=c(round(summary(lm(c(rnorm(100000,0,sigma_error),rnorm(100000,i,sigma_error))~rep(c(-1,1),each=100000)))$adj.r.squared,digits=2),round(summary(lm(c(rnorm(100000,0,sigma_error),rnorm(100000,0,sigma_error))~rep(c(-1,1),each=100000)))$adj.r.squared,digits=2)), + rbind(out.sim,out.emp))) + + out <- rbind(out,out.sim) + } + + ###Make sure numbers are numeric and characters are characteristic + for(i in 2:ncol(out)){ + out[,i] <- as.numeric(as.character(unlist(out[,i]))) + } + out[,1] <- as.character(unlist(out[,1])) + + + ###summarize all noise simulations + out[max(grep("noise_sim",out[,1])),-1] <- apply(out[grep("noise_sim",out[,1]),-1],2,sum) + out <- out[c(grep("peak_sim",out[,1]),max(grep("noise_sim",out[,1]))),] + + ##Add percentage detected & percentage false + output <- cbind(out, + percentage_detected=round(out$detected_true/(out$detected_true+out$detected_false+out$detected_not),digits=3)*100, + percentage_false=round(out$detected_false/(out$detected_true+out$detected_false+out$detected_not),digits=3)*100, + type=unlist(strsplit(out$name,split="_"))[seq(1,nrow(out)*3,by=3)], + sigma=unlist(strsplit(out$name,split="_"))[seq(3,nrow(out)*3,by=3)]) + + output <- output[,c(18,19,2,3:5,16,17,6:15)] + colnames(output)[grepl("_est.",colnames(output))] <- paste("quantile_",unlist(strsplit(colnames(output)[grepl("_est.",colnames(output))],split="_est."))[seq(2,2*sum(grepl("_est.",colnames(output))),by=2)],sep="") + colnames(output) <- gsub("\\.","",colnames(output)) + + colnames(output) <- paste(c(rep("Simulation",times=3),rep("QTL_detection",times=5),rep("Effect_size_estimation_(quantiles)",times=5),rep("QTL-true_peak_location_distance_(quantiles)",times=5)),colnames(output),sep=".") + + + return(output) +} diff --git a/Function_scripts/QTL.map1.dataframe.R b/Function_scripts/QTL.map1.dataframe.R new file mode 100644 index 0000000..eec8c50 --- /dev/null +++ b/Function_scripts/QTL.map1.dataframe.R @@ -0,0 +1,58 @@ +###function for transforming output of QTL.map.1 to a dataframe + +###input +#QTL.map.1 output +#small (logic) if a single trait was mapped set to TRUE + +###output +#dataframe with the columns trait, qtl_chromosome, qtl_bp, qtl_marker, qtl_significance, and qtl_effect + +###Description +#This transforms the relevant output to a dataframe, making handling in dplyr and plotting using ggplot easier +#This takes output of map.1 functions (list file with LOD, effect, Trait, and Map and converts it to a list. +#Optimized (a bit) by generating an empty matrix and filling the columns one-by-one. +#added an option for single traits + + QTL.map1.dataframe <- function(map1.output,small){ + if(missing(map1.output)){ stop("Missing map1.output ([[LOD]],[[Effect]],[[Trait]],[[Map]],[[Marker]])")} + if(missing(small)){ small <- FALSE} + + if(is.null(dim(map1.output[[1]]))){ + small <- TRUE + } + + if(small){ + ###Split up to long format + output <- matrix(NA,length(map1.output[[1]]),6) + output[,1] <- rep(rownames(map1.output[[3]])[1],each=length(map1.output[[1]])) + output[,2] <- as.character(unlist(map1.output[[5]][,2])) + output[,3] <- as.numeric(as.character(unlist(map1.output[[5]][,3]))) + output[,4] <- c(1:length(map1.output[[1]])) + output[,5] <- c(map1.output[[1]]) + output[,6] <- c(map1.output[[2]]) + output <- as.data.frame(output) + } + if(!small){ + ###Split up to long format + output <- matrix(NA,ncol(map1.output[[1]])*nrow(map1.output[[1]]),6) + output[,1] <- rep(rownames(map1.output[[1]]),each=ncol(map1.output[[1]])) + output[,2] <- rep(as.character(unlist(map1.output[[5]][,2])),times=nrow(map1.output[[1]])) + output[,3] <- rep(as.numeric(as.character(unlist(map1.output[[5]][,3]))),times=nrow(map1.output[[1]])) + output[,4] <- rep(c(1:ncol(map1.output[[1]])),times=nrow(map1.output[[1]])) + output[,5] <- c(t(map1.output[[1]])) + output[,6] <- c(t(map1.output[[2]])) + output <- as.data.frame(output) + } + + ###overwrite dataframe formats + output[,1] <- as.character(unlist(output[,1])) + output[,2] <- as.character(unlist(output[,2])) + output[,3] <- as.numeric(as.character(unlist(output[,3]))) + output[,4] <- as.numeric(as.character(unlist(output[,4]))) + output[,5] <- as.numeric(as.character(unlist(output[,5]))) + output[,6] <- as.numeric(as.character(unlist(output[,6]))) + colnames(output) <- c("trait","qtl_chromosome","qtl_bp","qtl_marker","qtl_significance","qtl_effect") + + ###Return + return(output) + } diff --git a/Function_scripts/QTL.peak.finder.R b/Function_scripts/QTL.peak.finder.R new file mode 100644 index 0000000..9011547 --- /dev/null +++ b/Function_scripts/QTL.peak.finder.R @@ -0,0 +1,60 @@ +###function for single marker mapping +#Snoek & Sterken, 2017; Sterken, 2017 + +###input +#map1.dataframe; output of the QTL.map1.dataframe function +#threshold; the significance treshold for calling QTL (in -log10(p)) +#LOD.drop; the decrease in LOD-score compared to the peak to call the boundaries of the QTL. Standard is 1.5 + +###output +#the original dataframe with the added columns: qtl_peak, qtl_marker_left, qtl_bp_left, qtl_marker_right, qtl_bp_right +#these columns are only non-NA where a QTL peak is located. + +###Description +#This takes output of mapping.to.list function and identiefies peaks and confidence intervals v1.3 +#Based on a given threshold and a 1.5 LOD-drop. +#v1.2 made the function more efficient, the peak borders are selected in one processing +#v1.3 fixed the problem calling the peak if the peak borders the edge of the chromosome +#v1.4 fixed 'larger then, smaller then' bug +#v1.5 fixed 'incompatible types' bug; NA is not recognized as numeric +#v1.6 rewritten the code, so it is fully in dplyr framework + +QTL.peak.finder <- function(map1.dataframe,threshold,LOD.drop){ + if(missing(map1.dataframe)){ stop("Missing map1.dataframe (trait, qtl_chromosome, qtl_bp, qtl_marker, qtl_significance, qtl_effect)")} + if(missing(threshold)){ stop("Missing threshold")} + if(missing(LOD.drop)){ LOD.drop <- 1.5} + + ###locate the peak + output <- dplyr::group_by(map1.dataframe,trait,qtl_chromosome) %>% + dplyr::mutate(qtl_peak_tmp=ifelse(qtl_significance == max(qtl_significance) & max(qtl_significance)>=threshold,1,NA)) %>% + dplyr::group_by(trait,qtl_chromosome,qtl_peak_tmp) %>% + dplyr::mutate(qtl_peak=ifelse(is.na(qtl_peak_tmp),NA, + ifelse(abs(qtl_marker - round(mean(qtl_marker),digits = 0)) == min(abs(qtl_marker - round(mean(qtl_marker),digits = 0))),qtl_marker,NA))) %>% + dplyr::group_by(trait,qtl_chromosome) %>% + dplyr::mutate(Border=ifelse((qtl_significance < max(qtl_significance)-LOD.drop & max(qtl_significance)>=threshold),T, + ifelse((qtl_bp == min(qtl_bp) & min(qtl_bp) %in% qtl_bp[qtl_significance >= max(qtl_significance)-LOD.drop & max(qtl_significance)>=threshold]),T, + ifelse((qtl_bp == max(qtl_bp) & max(qtl_bp) %in% qtl_bp[qtl_significance >= max(qtl_significance)-LOD.drop & max(qtl_significance)>=threshold]),T,F))), + peakloc=ifelse(sum(!is.na(qtl_peak)) == 1,qtl_peak[!is.na(qtl_peak)],NA)) %>% + dplyr::group_by(trait,qtl_chromosome,Border) %>% + dplyr::mutate(qtl_marker_left=as.numeric(ifelse(Border,ifelse(qtl_marker == max(qtl_marker[qtl_marker <= peakloc]),qtl_marker,NA),NA)), + qtl_bp_left=as.numeric(ifelse(Border,ifelse(qtl_bp == max(qtl_bp[qtl_marker <= peakloc]),qtl_bp,NA),NA)), + qtl_marker_right=as.numeric(ifelse(Border,ifelse(qtl_marker == min(qtl_marker[qtl_marker >= peakloc]),qtl_marker,NA),NA)), + qtl_bp_right=as.numeric(ifelse(Border,ifelse(qtl_bp == min(qtl_bp[qtl_marker >= peakloc]),qtl_bp,NA),NA))) %>% + dplyr::group_by(trait,qtl_chromosome) %>% + dplyr::mutate(qtl_peak_tmp=ifelse(sum(!is.na(qtl_peak))==1,1,0)) %>% + dplyr::group_by(trait,qtl_chromosome,qtl_peak_tmp) %>% + dplyr::mutate(qtl_marker_left=ifelse(qtl_peak_tmp==1,qtl_marker_left[!is.na(qtl_marker_left)],NA), + qtl_bp_left=ifelse(qtl_peak_tmp==1,qtl_bp_left[!is.na(qtl_bp_left)],NA), + qtl_marker_right=ifelse(qtl_peak_tmp==1,qtl_marker_right[!is.na(qtl_marker_right)],NA), + qtl_bp_right=ifelse(qtl_peak_tmp==1,qtl_bp_right[!is.na(qtl_bp_right)],NA)) %>% + dplyr::ungroup() %>% + dplyr::select(trait,qtl_chromosome,qtl_bp,qtl_marker,qtl_significance,qtl_effect,qtl_peak,qtl_marker_left,qtl_bp_left,qtl_marker_right,qtl_bp_right) %>% + dplyr::mutate(qtl_marker_left=ifelse(!is.na(qtl_peak),qtl_marker_left,NA), + qtl_bp_left=ifelse(!is.na(qtl_peak),qtl_bp_left,NA), + qtl_marker_right=ifelse(!is.na(qtl_peak),qtl_marker_right,NA), + qtl_bp_right=ifelse(!is.na(qtl_peak),qtl_bp_right,NA)) + + ###return called peaks + return(output) + } + diff --git a/Function_scripts/QTL.sim.data.per.marker.R b/Function_scripts/QTL.sim.data.per.marker.R new file mode 100644 index 0000000..1f6bc50 --- /dev/null +++ b/Function_scripts/QTL.sim.data.per.marker.R @@ -0,0 +1,67 @@ +###Simulate simple additive QTL +#Sterken, 2017 + +###input +#strain.map (markers in rows, genotypes in columns) +#strain.marker (markers in rows, with columns name, chromosome and position) +#n_per_marker (number of simulated QTL per marker) +#sigma_peak (standard deviation of QTL peak; standard normal) +#sigma_error (standard deviation of noise; standard normal) + +###output +#a list with: Simulated traits ([[1]]), Map ([[2]]), and Marker ([[3]]). + +###Description +# Noise based on standard normal +# Fractional genotypes are transformed to integer numbers using round for simulating QTL +# Heterozygousity may be simulated using '0' or NA; these are ignored in assigning QTL effects but instead are given +# an effect based on a uniform distribution. + + +QTL.sim.data.per.marker <- function(strain.map,strain.marker,n_per_marker,sigma_peak,sigma_error,replicates){ + if(missing(replicates)){ replicates <- 1} + + if(sum(strain.map%%1>0,na.rm=T)>0){ + ###fractional loci are treated as closest genotype + strain.map.use <- round(strain.map,digits = 0) + warning("set fractional genotypes to closest genotype using round() for simulating QTL") + } else { + strain.map.use <- strain.map + } + + sim.map.file <- list() + + sim.map.file[[2]] <- data.matrix(strain.map.use) + types <- unique(as.numeric(as.character(unlist(sim.map.file[[2]])))) + types <- types[!is.na(types)] + if(length(types) > 2){ + types <- sort(types)[c(1,length(types))] + warning(paste("more than 2 genotypes detected, only simulating with the most extreme numbers:",paste(types,collapse=" "),sep=" ")) + } + sim.map.file[[2]] <- sim.map.file[[2]][rep(1:nrow(sim.map.file[[2]]),each=n_per_marker),] + + for(i in 1:replicates){ + tmp <- sim.map.file[[2]] + tmp[sim.map.file[[2]]==types[1]] <- sigma_peak + tmp[sim.map.file[[2]]==types[2]] <- 0 + ###markers of unknown genotype get filled with a random number + tmp[!sim.map.file[[2]]%in%types] <- runif(sum(!sim.map.file[[2]]%in%types),0,sigma_peak) + tmp <- tmp + rnorm(n=length(tmp),mean=0,sd=sigma_error) + rownames(tmp) <- paste(rownames(tmp),rep(1:n_per_marker,times=nrow(sim.map.file[[2]])/n_per_marker),sep="_") + + if(i == 1){ + sim.map.file[[1]] <- tmp + } else { + sim.map.file[[1]] <- tmp + sim.map.file[[1]] + } + } + ###average + sim.map.file[[1]] <- sim.map.file[[1]] / replicates + + ###paste original strain map back + sim.map.file[[2]] <- strain.map + + sim.map.file[[3]] <- strain.marker + + return(sim.map.file) + } diff --git a/Function_scripts/QTL.table.addR2.R b/Function_scripts/QTL.table.addR2.R new file mode 100644 index 0000000..4a35038 --- /dev/null +++ b/Function_scripts/QTL.table.addR2.R @@ -0,0 +1,54 @@ +###function for determining variance explained by QTL + +###input +#QTL.table.file (should contain marker locations: qtl_marker, qtl_marker_1, qtl_marker_2 and trait names) +#QTL.list.file (list with QTL data, should contain entries Map and Trait) + +###output +#the R-squared is added to the QTL.table.file + +###Description + +# This function will add the sum of squares of the used model to a QTL table output, it also calculates the r-squared of the model + + + QTL.table.addR2 <- function(QTL.table.file,QTL.list.file){ + if(missing(QTL.table.file)){ stop("Give a QTL summary table, should contain marker locations (qtl_marker, qtl_marker_1, qtl_marker_2) and trait names")} + if(missing(QTL.list.file)){ stop("Give the input on which was mapped (output of QTL.data.prep)")} + + ###R-squared function + R.sq <- function(x,y){return(cor(x,y,use="na.or.complete")^2)} + + ###True if single marker mapping + if("qtl_marker" %in% colnames(QTL.table.file)){ + output <- NULL + for(i in 1:nrow(QTL.table.file)){ + ###get marker + mrk <- QTL.list.file$Map[as.numeric(as.character(unlist(QTL.table.file$qtl_marker[i]))),] + ###get expression + y <- unlist(QTL.list.file$Trait[rownames(QTL.list.file$Trait)==as.character(unlist(QTL.table.file$trait[i])),]) + ###run model + output <- c(output,R.sq(y,mrk)) + } + output <- data.frame(cbind(QTL.table.file,qtl_R2_sm=output)) + } + + ###True if multiple marker mapping + if("marker_1" %in% colnames(QTL.table.file) & "marker_2" %in% colnames(QTL.table.file)){ + output <- cbind(QTL.table.file,SS_int_mrk1=NA,SS_int_mrk2=NA,SS_int=NA,SS_int_res=NA,SS_add_mrk1=NA,SS_add_mrk2=NA,SS_add_res=NA) + for(i in 1:nrow(QTL.table.file)){ + ###get marker + mrk1 <- QTL.list.file$Map[QTL.table.file$marker_1[i],] + mrk2 <- QTL.list.file$Map[QTL.table.file$marker_2[i],] + ###get expression + y <- unlist(QTL.list.file$Trait[rownames(QTL.list.file$Trait)==QTL.table.file$trait[i],]) + ###run model + output[i,colnames(output)%in%c("SS_int_mrk1","SS_int_mrk2","SS_int","SS_int_res")] <- anova(lm(y ~ mrk1*mrk2))$"Sum Sq" + output[i,colnames(output)%in%c("SS_add_mrk1","SS_add_mrk2","SS_add_res")] <- anova(lm(y ~ mrk1+mrk2))$"Sum Sq" + + } + output <- dplyr::mutate(output,r2_add=(SS_add_mrk1+SS_add_mrk2)/(SS_add_mrk1+SS_add_mrk2+SS_add_res),r2_int=SS_int/(SS_int_mrk1+SS_int_mrk2+SS_int+SS_int_res)) + } + return(output) + } + diff --git a/Function_scripts/R.sq.R b/Function_scripts/R.sq.R new file mode 100644 index 0000000..1a2d822 --- /dev/null +++ b/Function_scripts/R.sq.R @@ -0,0 +1,15 @@ +###calculates R-squared as in a linear model + +###input +#x (vector) +#y (vector) + + +###output +#squared correlation coeffcient + +###Description + + R.sq <- function(x,y){return(cor(x,y,use="na.or.complete")^2)} + + \ No newline at end of file diff --git a/Function_scripts/enrich.easy.R b/Function_scripts/enrich.easy.R new file mode 100644 index 0000000..6e06882 --- /dev/null +++ b/Function_scripts/enrich.easy.R @@ -0,0 +1,52 @@ +###main enrichment function + +###input +#gene_list (list of genes: WBID, public names, etc.) +#gene_group (group genes belong to; if not defined assumed to be a single group) +#gene_total (all genes detected with platform (e.g. Microarray)) +#database_dir (location of Enrich.DB.Rdata file) +#group_filter_n (minimum size of group; standard is 3) +#overlap_filter_n (minimum overlap; standard is 2) + +###output +#dataframe with enrichment, columns: Term, Annotation, Group, Genes_in_group, Overlap_expected, Overlap, Significance, Bonferroni + +###Description +# Main enrichment function, calls on enrich.one, enrich.get.geneIDs, and enrich.more + + +enrich.easy <- function(gene_list,gene_group,gene_total,database_dir,group_filter_n = 3,overlap_filter_n = 2){ + if(missing(gene_list)){ stop("provide vector of genes to be enriched")} + if(missing(gene_group)){ gene_group <- rep("A",times=length(gene_list)); + warning("genes assumed to belong to a single group")} + if(missing(gene_total)){ stop("provide vector of total set of genes (e.g. present on MA)")} + ###Check input + if(length(gene_list) != length(gene_group)){ stop("gene_list and gene_group should match in length")} + ###Database loading + if(missing(database_dir)){ stop("Give location of enrichment database (or built it using 'built.enrich.DB'")} + load(paste(database_dir,"/obj_Enrich.DB.Rdata",sep="")) + load(paste(database_dir,"/obj_gene.identifiers.Rdata",sep="")) + + ###Process input + gene_list <- enrich.get.geneIDs(gene_list,gene.identifiers) + gene_total <- enrich.get.geneIDs(unique(gene_total),gene.identifiers) + + tmp <- tapply(gene_list,gene_group,enrich.more,gene_total=gene_total,Enrich.DB=Enrich.DB) + output <- NULL + for(i in 1:length(tmp)){ + output <- rbind(output,tmp[[i]]) + } + output <- cbind(Term=rep(names(tmp),lapply(tmp,nrow)),output) + ###Filter on low amounts + output <- output[output[,4]> group_filter_n & output[,6]>overlap_filter_n,] + if(length(output) == 0){ stop("no significant enrichments")} + + ###Add bonferroni and category names + output <- cbind(output,Bonferroni=p.adjust(output[,7],method="bonferroni"),Annotation=Enrich.DB[[1]][match(output[,2],Enrich.DB[[1]][,1]),2]) + output <- output[,c(1,9,3,4,5,6,7,8)] + colnames(output) <- c("Term","Annotation","Group","Genes_in_group","Overlap_expected","Overlap","Significance","Bonferroni") + + output <- dplyr::mutate(output,Fold_enrichment=log2(Overlap/Overlap_expected)) %>% + dplyr::select(Term,Annotation,Group,Genes_in_group,Overlap_expected,Overlap,Fold_enrichment,Significance,Bonferroni) + return(output) + } \ No newline at end of file diff --git a/Function_scripts/enrich.get.geneIDs.R b/Function_scripts/enrich.get.geneIDs.R new file mode 100644 index 0000000..6fcd197 --- /dev/null +++ b/Function_scripts/enrich.get.geneIDs.R @@ -0,0 +1,32 @@ +###recodes a set of genes into unique IDs + +###input +#gene_list (list of genes (e.g. WBID)) +#gene_identifiers (dataframe with various identifiers per gene) + +###output +#vector with uniform identifiers + +###Description +# Basic enrichment function, normally not used directly. + + +enrich.get.geneIDs <- function(gene_list,gene_identifiers){ + if(missing(gene_list)){ stop("Gene list (e.g. wormbase IDs)")} + if(missing(gene_identifiers)){ stop("Provide a dataframe with gene identifiers (ID, public name, sequence name, transcript name")} + + ###Remove empty rows + gene_list[gene_list == "" | gene_list == "NA" | gene_list == "N/A"] <- NA + + tmp <- matrix(c(match(gene_list,gene_identifiers[,1]),match(gene_list,gene_identifiers[,2]),match(gene_list,gene_identifiers[,3]),match(gene_list,gene_identifiers[,4])),ncol=4) + NAs <- which(apply(is.na(tmp),1,sum)==4) + if(length(NAs) > 0){ + tmp[NAs,] <- 1 + } + output <- gene_identifiers[apply(tmp,1,function(x){unique(x[!is.na(x)])}),1] + if(length(NAs) > 0){ + output[NAs] <- "none" + warning(paste("there are",length(NAs),"non-matching identifiers in your query")) + } + return(output) + } diff --git a/Function_scripts/enrich.more.R b/Function_scripts/enrich.more.R new file mode 100644 index 0000000..f671a47 --- /dev/null +++ b/Function_scripts/enrich.more.R @@ -0,0 +1,28 @@ +###enrichment for all sets in database + +###input +#gene_list (list of genes in ID format (e.g. WBID)) +#gene_total (all genes detected with platform (e.g. Microarray)) +#anno_sets (annotation sets to be enriched for) + +###output +#dataframe with enrichment, columns: Anno_type, Group, Genes_in_group, Overlap_expected, Overlap, Significance + +###Description +# Basic enrichment function, normally not used directly. +# Calls enrich.one for doing the enrichment per group + +enrich.more <- function(gene_list,gene_total,anno_sets,Enrich.DB){ + if(missing(gene_list)){ stop("genes, ID format")} + if(missing(gene_total)){ stop("total set of genes (e.g. present on MA), ID format")} + if(missing(anno_sets)){ anno_sets <- Enrich.DB[[1]][-1,1]} + + output <- NULL + for(i in 1:length(anno_sets)){ + anno_group <- Enrich.DB[[which(Enrich.DB[[1]][,1] == anno_sets[i])]] + output <- rbind(output,cbind(anno_sets[i],enrich.one(unique(gene_list),unique(gene_total),anno_group))) + } + colnames(output) <- c("Anno_Type","Group","Genes_in_group","Overlap_expected","Overlap","Significance") + + return(output) + } \ No newline at end of file diff --git a/Function_scripts/enrich.one.R b/Function_scripts/enrich.one.R new file mode 100644 index 0000000..3cc101a --- /dev/null +++ b/Function_scripts/enrich.one.R @@ -0,0 +1,35 @@ +###enrichment for 1 set + +###input +#gene_list (list of genes in ID format (e.g. WBID)) +#gene_total (all genes detected with platform (e.g. Microarray)) +#anno_group (annotation-group to be enriched for) + +###output +#dataframe with enrichment, columns: Group, Genes_in_group, Overlap_expected, Overlap, Significance + +###Description +# Basic enrichment function, normally not used directly. + +enrich.one <- function(gene_list,gene_total,anno_group){ + if(missing(gene_list)){ stop("genes, ID format")} + if(missing(gene_total)){ stop("total set of genes (e.g. present on MA), ID format")} + if(missing(anno_group)){ stop("Give an annotation-group file")} + + ###make absolutely sure that we work with unique genes + anno_group[[1]] <- anno_group[[1]][!duplicated(paste(as.character(unlist(anno_group[[1]][,1])),as.character(unlist(anno_group[[1]][,2])))),] + anno_group[[1]][,2] <- as.factor(anno_group[[1]][,2]) + gene_list <- unique(gene_list) + gene_total <- unique(gene_total) + + wb.drawn <- table(anno_group[[1]][anno_group[[1]][,1] %in% gene_list,2]) + tot.wb <- table(anno_group[[1]][anno_group[[1]][,1] %in% gene_total,2]) + tot.bb <- length(unique(gene_total))- tot.wb + tot.drawn <- length(gene_list) + enr.test <- phyper(wb.drawn,tot.wb,tot.bb,tot.drawn,lower.tail=F) + + output <- data.frame(rownames(tot.wb),as.numeric(tot.wb),as.numeric(tot.wb)*(sum(gene_list %in% gene_total)/length(gene_total)),as.numeric(wb.drawn),as.numeric(enr.test)) + colnames(output) <- c("Group","Genes_in_group","Overlap_expected","Overlap","Significance") + + return (output) +} diff --git a/Function_scripts/fast.lm.1.R b/Function_scripts/fast.lm.1.R new file mode 100644 index 0000000..df5df3c --- /dev/null +++ b/Function_scripts/fast.lm.1.R @@ -0,0 +1,21 @@ +###fast lm function + +###input +#traits (matrix) +#variable (vector) + +###output +#matrix with -log10 transformed p-values and effects + +###Description +# Fast linear model, 1 variable +# Uses fast.summ, can't handle NA's! + + fast.lm.1 <- function(traits,variable){ + if(missing(traits)){ stop("specify traits")} + if(missing(variable)){ stop("specify variable")} + + output <- fast.summ(lm(t(traits)~variable)) + return(output) + } + diff --git a/Function_scripts/fast.summ.R b/Function_scripts/fast.summ.R new file mode 100644 index 0000000..3fbf64d --- /dev/null +++ b/Function_scripts/fast.summ.R @@ -0,0 +1,33 @@ +###fast summary function + +###input +#output of lm() + +###output +#matrix with -log10 transformed p-values and effects + +###Description +# Fast summary function +# This function cannot handle NA's! + + + fast.summ <- function(input){ + p <- input$rank + rdf <- input$df.residual + Qr <- input$qr + n <- nrow(Qr$qr) + + p1 <- 1L:p + r <- input$residuals + rss <- colSums(r^2) + + resvar <- rss/rdf + R <- chol2inv(Qr$qr[p1, p1, drop = FALSE]) + se <- sqrt(rep(diag(R),each=length(resvar)) * rep(resvar,times=length(diag(R)))) + est <- input$coefficients[Qr$pivot[p1],] + tval <- t(t(est)/se) + + pvals <- round(-log10(2 * pt(abs(tval),rdf, lower.tail = FALSE)),digits=2) + output <- cbind(t(pvals)[,-1],t(round(est,digits=5))[,-1]) + return(output) + } \ No newline at end of file diff --git a/Function_scripts/fast.summ.transcriptomics.R b/Function_scripts/fast.summ.transcriptomics.R new file mode 100644 index 0000000..5c81e07 --- /dev/null +++ b/Function_scripts/fast.summ.transcriptomics.R @@ -0,0 +1,33 @@ +###fast summary function for transcriptomics + +###input +#output of lm() + +###output +#matrix with -log10 transformed p-values and effects + +###Description +# Fast summary function +# This function cannot handle NA's! + + +fast.summ.transcriptomics <- function(input){ + p <- input$rank + rdf <- input$df.residual + Qr <- input$qr + n <- nrow(Qr$qr) + + p1 <- 1L:p + r <- input$residuals + rss <- colSums(r^2) + + resvar <- rss/rdf + R <- chol2inv(Qr$qr[p1, p1, drop = FALSE]) + se <- sqrt(rep(diag(R),each=length(resvar)) * rep(resvar,times=length(diag(R)))) + est <- input$coefficients[Qr$pivot[p1],] + tval <- t(t(est)/se) + + pvals <- 2 * pt(abs(tval),rdf, lower.tail = FALSE) + output <- cbind(t(pvals)[,-1],t(round(est,digits=5))[,-1]) + return(output) + } \ No newline at end of file diff --git a/Function_scripts/genetic.df.to.mat.R b/Function_scripts/genetic.df.to.mat.R new file mode 100644 index 0000000..46fd1ee --- /dev/null +++ b/Function_scripts/genetic.df.to.mat.R @@ -0,0 +1,67 @@ +###Transform a dataframe genetic map format to a matrix +#Mark Sterken, 26/7/2020 + +###input +#genotype.dataframe (dataframe; organized by strain, chromosome, position_start, position_end, genotype) +#chromosomes (chromosome names of organism; standard is C. elegans) +#chromosome_size (sizes of chromoosmes; standard is C. elegans) +#marker_spacing (number of markers to generate; standars is one marker per 10kb) + +###output +#a list with [[1]] a matrix with the genetic map and [[2]] a dataframe with the information on the markers + +###Description +#translates a long-format genetic map to a wide-format genetic map useful for further analyses. + +###See also + +genetic.df.to.mat <- function(genotype.dataframe,chromosomes,chromosome_size,marker_spacing){ + if(missing(genotype.dataframe)){ stop("requires a dataframe of crossover coordinates: strain, chromosome, position_start, position_end, genotype")} + if(missing(chromosomes)){ chromosomes <- c("I","II","III","IV","V","X")} + if(missing(chromosome_size)){ chromosome_size <- c(15072434,15279421,13783801,17493829,20924180,17718942)} + if(length(chromosomes) != length(chromosome_size)){ stop("chromosome names and sizes should have the same length")} + if(missing(marker_spacing)){ marker_spacing <- 10000} + + + ###reformat colnames to lowercase + colnames(genotype.dataframe) <- tolower(colnames(genotype.dataframe)) + + ###Make empty coordinate system based on input parameters + ###Always include base 1 and last base per chromosome + tmp <- tapply(chromosome_size,chromosomes,function(x,marker_spacing){c(1,seq(marker_spacing,x,by=marker_spacing),x)},marker_spacing=marker_spacing) + + ###go through markers, strain by strain + genotype.matrix <- NULL + for(i in 1:length(unique(genotype.dataframe$strain))){ + tmp.nu <- lapply(tmp,function(x){NA[!is.na(x)]}) + strain.nu <- genotype.dataframe[genotype.dataframe$strain == unique(genotype.dataframe$strain)[i],] + strain.nu <- strain.nu[strain.nu$chromosome %in% chromosomes,] + for(j in 1:nrow(strain.nu)){ + chr.nu <- which(names(tmp) == strain.nu$chromosome[j]) + tmp.nu[[chr.nu]][tmp[[chr.nu]] %in% strain.nu$position_start[j]:strain.nu$position_end[j]] <- strain.nu$genotype[j] + } + + genotype.matrix <- cbind(genotype.matrix,unlist(tmp.nu)) + colnames(genotype.matrix)[i] <- unique(genotype.dataframe$strain)[i] + } + rownames(genotype.matrix) <- paste(rep(names(tmp),times=unlist(lapply(tmp,length))),as.character(unlist(tmp)),sep="") + + informative <- genetic.informative.markers(genotype.matrix) + + ###Built marker file + strain.marker <- data.frame(cbind(name=paste(rep(names(tmp),times=unlist(lapply(tmp,length))),as.character(unlist(tmp)),sep=""), + chromosome=rep(names(tmp),times=unlist(lapply(tmp,length))), + position=unlist(tmp), + informative_marker=as.numeric(informative))) + strain.marker[,1] <- as.character(unlist(strain.marker[,1])) + strain.marker[,2] <- as.character(unlist(strain.marker[,2])) + strain.marker[,3] <- as.numeric(as.character(unlist(strain.marker[,3]))) + strain.marker[,4] <- as.numeric(as.character(unlist(strain.marker[,4]))) + + output <- list(NULL) + output[[1]] <- genotype.matrix + output[[2]] <- strain.marker + names(output) <- c("strain.map","strain.marker") + + return(output) + } \ No newline at end of file diff --git a/Function_scripts/genetic.distance.R b/Function_scripts/genetic.distance.R new file mode 100644 index 0000000..a0e8c18 --- /dev/null +++ b/Function_scripts/genetic.distance.R @@ -0,0 +1,81 @@ +###calculate genetic genetic distance between markers + + +###input +#strain.map (of the investigated strains: markers in rows, genotypes in columns) +#strain.marker (of the invesigated strains: markers in rows, with columns name, chromosome and position) +#chromosomes (character vector with chromosome names; standard is C. elegans) +#chromosome_size (numeric vector with chromosome sizes; standard is C. elegans) +#ignore.na (logic, tells if NAs should be incorporated in the analysis) + + +###output +#a dataframe with chromosome, physical position, and genetic position (cM) + +###Description +#ignore.na makes the algorithm find te nearest non-NA marker + + + + +genetic.distance <- function(strain.map,strain.marker,chromosomes,chromosome_size,ignore.na){ + if(missing(strain.map)){ stop("Requires genetic map with markers per row and strains per column")} + if(missing(strain.marker)){ stop("Requires marker file with 3 columns: name, chromosome, position")} + if(missing(chromosomes)){ chromosomes <- c("I","II","III","IV","V","X")} + if(missing(chromosome_size)){ chromosome_size <- c(15072434,15279421,13783801,17493829,20924180,17718942)} + if(length(chromosomes) != length(chromosome_size)){ stop("chromosome names and sizes should have the same length")} + if(missing(ignore.na)){ ignore.na <- FALSE} + + ###rolling mean function + rollingmean <- function(x){out <- NULL; for(i in 2:length(x)){out <- c(out,mean(x[(i-1):i]))};return(out)} + + ###make colnames comply to format + colnames(strain.marker) <- c("name", "chromosome", "position") + + ###output + output <- NULL + + ###per chromosome + for(i in 1:length(chromosomes)){ + currchr <- strain.map[strain.marker[,2]==chromosomes[i],] + currmrk <- strain.marker[strain.marker[,2]==chromosomes[i],] + ###fills in gaps based on the genotype of the closest non-NA marker + if(!ignore.na){ + for(j in 1:ncol(currchr)){ + for(k in which(is.na(currchr[,j]))){ + currori <- currchr[,j] + closest <- currmrk[,3]-currmrk[k,3] + closest[k] <- NA + closest <- order(abs(closest)) + ii <- 1 + while(is.na(currori[closest[ii]])){ + ii <- ii + 1 + } + currchr[k,j] <- currori[closest[ii]] + } + } + } + + ###count recombinations + diffmap <- apply(currchr,2,diff) != 0 + + notrecomb <- apply(!diffmap & !is.na(diffmap),1,sum) + yesrecomb <- apply(diffmap & !is.na(diffmap),1,sum) + + fractions <- cumsum(yesrecomb/(notrecomb+yesrecomb)) + fractions <- c(fractions[1],fractions,rev(fractions)[1]) + + locations <- rollingmean(currmrk[,3]) + locations <- c(1,locations,chromosome_size[i]) + + ###into output + output <- rbind(output,cbind(chromosome=chromosomes[i],position=locations,cM=fractions)) + } + output <- data.frame(output) + output[,1] <- as.character(unlist(output[,1])) + output[,2] <- as.numeric(as.character(unlist(output[,2]))) + output[,3] <- as.numeric(as.character(unlist(output[,3])))*100 + + return(output) + } + diff --git a/Function_scripts/genetic.informative.markers.R b/Function_scripts/genetic.informative.markers.R new file mode 100644 index 0000000..5f646ef --- /dev/null +++ b/Function_scripts/genetic.informative.markers.R @@ -0,0 +1,27 @@ +###Determine whether adjecent markers are informative +#Mark Sterken, 26/7/2020 + +###input +#genotype.matrix (preferrably numeric matrix with genotypes; if not it will force it to be numberic) + +###output +#a vector with whether a marker is informative (1) or not (0) + +###Description +#performes side-by-side correlation analysis to determine if crossovers occured between markers. If so it will deem the markers on either side useful. + +###See also + +genetic.informative.markers <- function(genotype.matrix){ + ###force into numeric matrix for correlation analysis + genotype.matrix.numeric <- matrix(as.numeric(as.factor(unlist(genotype.matrix))),ncol=ncol(genotype.matrix)) + genotype.matrix.numeric[is.na(genotype.matrix.numeric)] <- 0 + + tmp <- round(as.numeric(diag(cor(t(genotype.matrix.numeric))[-1,])),digits=5) + ###each different marker is informative + tmp1 <- c(0,tmp) + tmp2 <- c(tmp,0) + + output <- tmp1 != 1 | tmp2 != 1 + return(output) + } diff --git a/Function_scripts/lm.1.R b/Function_scripts/lm.1.R new file mode 100644 index 0000000..fc95b44 --- /dev/null +++ b/Function_scripts/lm.1.R @@ -0,0 +1,29 @@ +###lm function for NA handling + +###input +#data.input (matrix) +#variable (vector) + +###output +#matrix with -log10 transformed p-values and effects + +###Description +# Slow linear model for NA handling +# ignores heterozygous strains in effect determination + + + + lm.1 <- function(data.input,variable){ + retext <- function(x){return(c(first(x),last(x)))} + + out <- NULL + for(i in 1:nrow(data.input)){ + out <- rbind(out, + cbind(LOD=-log10(cor.test(data.input[i,!is.na(variable)],variable[!is.na(variable)],na.action=na.exclude())$p.value), + effect=diff(retext(tapply(data.input[i,!is.na(variable)],variable[!is.na(variable)],mean,na.rm=T)))) + ) + } + + return(out) + } + diff --git a/Function_scripts/permutate.traits.R b/Function_scripts/permutate.traits.R new file mode 100644 index 0000000..507b541 --- /dev/null +++ b/Function_scripts/permutate.traits.R @@ -0,0 +1,20 @@ +###permutates a trait matrix + +###input +#trait.matrix (matrix strains in columsn, traits in rows) + + +###output +#by-row permutated matrix + +###Description + + + permutate.traits <- function(trait.matrix){ + perm.list <- rep(c(1:nrow(trait.matrix)),times=ncol(trait.matrix)) + runif(length(trait.matrix),0.25,0.75) + perm.list <- order(perm.list) + perm.data <- matrix(as.numeric(trait.matrix)[perm.list],nrow=nrow(trait.matrix),ncol=ncol(trait.matrix),byrow=TRUE) + return(perm.data) + } + + \ No newline at end of file diff --git a/Function_scripts/plot.genotype.strain.R b/Function_scripts/plot.genotype.strain.R new file mode 100644 index 0000000..48f0547 --- /dev/null +++ b/Function_scripts/plot.genotype.strain.R @@ -0,0 +1,108 @@ +################################################################################ +###plot.genotype.strain(genotype.marker,genotype.strain1,genotype.strain2,chr.info,col.lines,both.chr) +### visualizes the genotype of a single strain, can also visualize two strains side by side +### +###v1.1 fixed a bug where the area between the second last and last marker of the chromosome was not plotted +### +### obligatory input +### strain.marker; marker names, and their corresponding chromosome numbers and basepairs +### strain.map1; genotype of the strain of interest, indicated using 1 (N2) and -1 (CB4856) values +### optional input +### chromosomes (character vector with chromosome names; standard is C. elegans) +### chromosome_size (numeric vector with chromosome sizes; standard is C. elegans) +### strain.map2; genotype of the second strain of interest, indicated using 1 (N2) and -1 (CB4856) values +### both.chr; set to TRUE if you input strain.map2; default is FALSE +### col.lines; graphic colours; default is c("blue", "grey", "orange") + +plot.genotype.strain <- function(strain.marker,strain.map1,strain.map2,chromosomes,chromosome_size,col.lines,both.chr){ + if(missing(strain.marker)){ stop("insert marker info, name, chromosome, pos")} + if(missing(strain.map1)){ strain.map1 <- rep(0,nrow(strain.marker))} + if(missing(strain.map2)){ strain.map2 <- strain.map1} + if(missing(chromosomes)){ chromosomes <- c("I","II","III","IV","V","X")} + if(missing(chromosome_size)){ chromosome_size <- c(15072434,15279421,13783801,17493829,20924180,17718942)} + if(length(chromosomes) != length(chromosome_size)){ stop("chromosome names and sizes should have the same length")} + if(missing(col.lines)){ col.lines <- c("#1F78B4","grey","#FF7F00")} ##default is -1 CB: blue, 1 N2: orange + if(missing(both.chr)){ both.chr <- FALSE} + + ###input chromosomes + chr.info <- data.frame(cbind(chromosome=chromosomes, + size=chromosome_size, + cumulative=c(0,cumsum(chromosome_size[-length(chromosome_size)]))), + stringsAsFactors=FALSE) + chr.info[,2] <- as.numeric(as.character(unlist(chr.info[,2]))) + chr.info[,3] <- as.numeric(as.character(unlist(chr.info[,3]))) + + strain.map1[is.na(strain.map1)] <- 0 + strain.map1 <- col.lines[strain.map1+2] + + if(both.chr == FALSE){ + plot(0,0,type="n",ylim=c(0,24),xlim=c(0.5,6.5),cex=2,axes=F,ylab="",xlab="",cex.lab=1.5) + + ###loop over markers + for(j in 1:nrow(strain.marker)){ + xpos <- which(chr.info[,1] == strain.marker[j,2]) + + tmp <- strain.marker[strain.marker[,2] == chr.info[xpos,1],] + m.on.c <- which(tmp[,3] == strain.marker[j,3]) + if(m.on.c == 1){ + lines(c(xpos,xpos),c(0,strain.marker[j,3]/1e6),col=strain.map1[j],lwd=21,lend=1) + } + if(m.on.c > 1 & m.on.c < nrow(tmp)){ + lines(c(xpos,xpos),c(strain.marker[j-1,3]/1e6,strain.marker[j,3]/1e6),col=strain.map1[j],lwd=21,lend=1) + } + if(m.on.c == nrow(tmp)){ + lines(c(xpos,xpos),c(strain.marker[j-1,3]/1e6,strain.marker[j,3]/1e6),col=strain.map1[j],lwd=21,lend=1) + lines(c(xpos,xpos),c(strain.marker[j,3]/1e6,chr.info[xpos,2]/1e6),col=strain.map1[j],lwd=21,lend=1) + } + } + text(1:6,23,chr.info[,1],font=2,cex=1.5) + } + + if(both.chr == TRUE){ + strain.map2[is.na(strain.map2)] <- 0 + strain.map2 <- col.lines[strain.map2+2] + + plot(0,0,type="n",ylim=c(0,23),xlim=c(0.5,6.5),cex=2,axes=F,ylab="",xlab="",cex.lab=1.5) + + ###loop over markers + for(j in 1:nrow(strain.marker)){ + xpos <- which(chr.info[,1] == strain.marker[j,2]) + + tmp <- strain.marker[strain.marker[,2] == chr.info[xpos,1],] + m.on.c <- which(tmp[,3] == strain.marker[j,3]) + if(m.on.c == 1){ + lines(c(xpos-0.2,xpos-0.2),c(0,strain.marker[j,3]/1e6),col=strain.map1[j],lwd=9,lend=1) + lines(c(xpos+0.2,xpos+0.2),c(0,strain.marker[j,3]/1e6),col=strain.map2[j],lwd=9,lend=1) + } + if(m.on.c > 1 & m.on.c < nrow(tmp)){ + lines(c(xpos-0.2,xpos-0.2),c(strain.marker[j-1,3]/1e6,strain.marker[j,3]/1e6),col=strain.map1[j],lwd=9,lend=1) + lines(c(xpos+0.2,xpos+0.2),c(strain.marker[j-1,3]/1e6,strain.marker[j,3]/1e6),col=strain.map2[j],lwd=9,lend=1) + } + if(m.on.c == nrow(tmp)){ + lines(c(xpos-0.2,xpos-0.2),c(strain.marker[j-1,3]/1e6,strain.marker[j,3]/1e6),col=strain.map1[j],lwd=9,lend=1) + lines(c(xpos+0.2,xpos+0.2),c(strain.marker[j-1,3]/1e6,strain.marker[j,3]/1e6),col=strain.map2[j],lwd=9,lend=1) + lines(c(xpos-0.2,xpos-0.2),c(strain.marker[j,3]/1e6,chr.info[xpos,2]/1e6),col=strain.map1[j],lwd=9,lend=1) + lines(c(xpos+0.2,xpos+0.2),c(strain.marker[j,3]/1e6,chr.info[xpos,2]/1e6),col=strain.map2[j],lwd=9,lend=1) + } + } + text(1:6,23,chr.info[,1],font=2,cex=1.5) + } + } + + + + + + + + + + + + + + + + + + diff --git a/Function_scripts/prep.ggplot.QTL.profile.R b/Function_scripts/prep.ggplot.QTL.profile.R new file mode 100644 index 0000000..4fb2b79 --- /dev/null +++ b/Function_scripts/prep.ggplot.QTL.profile.R @@ -0,0 +1,20 @@ + prep.ggplot.QTL.profile <- function(QTL.peak.output,map1.output,trait){ + output <- list() + + output[[1]] <- QTL.peak.output[as.character(unlist(QTL.peak.output$trait))==trait,] + names(output)[1] <- "QTL_profile" + tmp <- output[[1]]$qtl_peak[!is.na(output[[1]]$qtl_peak)] + + if(length(tmp)>0){ + for(i in 1:length(tmp)){ + output[[i+1]] <- cbind(trait=trait, + map1.output$Marker[tmp[i],], + strain=colnames(map1.output$Trait), + genotype=map1.output$Map[tmp[i],], + trait_value=map1.output$Trait[rownames(map1.output$Trait)==trait,], + R_squared=R.sq(x=map1.output$Map[tmp[i],],y=map1.output$Trait[rownames(map1.output$Trait)==trait,])) + names(output)[i+1] <- paste("QTLsplit_",as.character(unlist(map1.output$Marker[tmp[i],1])),sep="_") + } + } + return(output) + } diff --git a/Function_scripts/prep.ggplot.eQTL.table.R b/Function_scripts/prep.ggplot.eQTL.table.R new file mode 100644 index 0000000..e6833b9 --- /dev/null +++ b/Function_scripts/prep.ggplot.eQTL.table.R @@ -0,0 +1,36 @@ + ###Makes a table with eQTL peaks for plotting + ###this function adds points to faithfully indicate the chromosome boundaries (standard is C. elegans)) + prep.ggplot.eQTL.table <- function(eQTL.table.file,condition,chromosomes,chromosome_size){ + if(missing(eQTL.table.file)){ stop("need a eQTL.table.file")} + if(missing(condition)){ condition <- "ct"} + if(missing(chromosomes)){ chromosomes <- c("I","II","III","IV","V","X")} + if(missing(chromosome_size)){ chromosome_size <- c(15072434,15279421,13783801,17493829,20924180,17718942)} + if(length(chromosomes) != length(chromosome_size)){ stop("chromosome names and sizes should have the same length")} + + output <- dplyr::filter(eQTL.table.file, gene_chromosome %in% chromosomes) %>% + dplyr::mutate(gene_chromosome = factor(gene_chromosome,levels=rev(chromosomes))) %>% + dplyr::mutate(qtl_chromosome = factor(qtl_chromosome,levels=chromosomes),condition=condition) %>% + dplyr::mutate(qtl_type = factor(qtl_type,levels=rev(sort(unique(qtl_type))))) + + ###Chromosome sizes cheat + cheatpoints <- output[rep(1,times=length(chromosomes)*2),] + for(i in which(unlist(lapply(output,class)) == "factor")){ + cheatpoints[,i] <- as.character(unlist(cheatpoints[,i])) + } + cheatpoints[1:nrow(cheatpoints),unlist(lapply(output,class)) != "character"] <- 0 + cheatpoints[1:nrow(cheatpoints),unlist(lapply(output,class)) != "numeric"] <- "" + cheatpoints$qtl_chromosome <- as.character(rep(chromosomes,times=2)) + cheatpoints$gene_chromosome <- as.character(rep(chromosomes,times=2)) + cheatpoints$qtl_bp <- as.numeric(as.character(c(chromosome_size,rep(1,times=length(chromosomes))))) + cheatpoints$gene_bp <- as.numeric(as.character(c(chromosome_size,rep(1,times=length(chromosomes))))) + cheatpoints$trait <- paste(as.character(rep(chromosomes,times=2)),rep(c(1,2),each=length(chromosomes))) + + + ###combine + output <- rbind(output,cheatpoints) %>% + dplyr::mutate(qtl_type = factor(qtl_type,levels = c("cis","trans",""))) + + ###return + return(output) + } + diff --git a/Function_scripts/prep.ggplot.genetic.map.R b/Function_scripts/prep.ggplot.genetic.map.R new file mode 100644 index 0000000..2b96c30 --- /dev/null +++ b/Function_scripts/prep.ggplot.genetic.map.R @@ -0,0 +1,52 @@ +###rewrites the genetic map to a dataframe plottable in ggplot + + +###input +#strain.map (of the investigated strains: markers in rows, genotypes in columns) +#strain.marker (of the invesigated strains: markers in rows, with columns name, chromosome and position) + +###output +#dataframe with columns: strain, chromosome, position_start, position_end, and genotype + +###Description + +prep.ggplot.genetic.map <- function(strain.map,strain.marker){ + if(missing(strain.map)){ stop("Requires genetic map with markers per row and strains per column")} + if(missing(strain.marker)){ stop("Requires marker file with 3 columns: name, chromosome, position")} + if(nrow(strain.map) != nrow(strain.marker)){ stop("map does not match markers")} + if(ncol(strain.marker) != 3){ stop("Requires marker file with 3 columns: name, chromosome, position")} + + diffmap <- apply(strain.map,2,diff) != 0 + chrmap <- diff(as.numeric(as.factor(strain.marker[,2]))) != 0 + + tmp <- as.data.frame(cbind(name=as.character(unlist(strain.marker[,1])), + chromosome=as.character(unlist(strain.marker[,2])), + position=as.numeric(as.character(unlist(strain.marker[,3]))), + strain=rep(colnames(strain.map),each=nrow(strain.map)), + genotype=as.numeric(strain.map), + diff_left=as.numeric(rbind(diffmap,FALSE)), + diff_right=as.numeric(rbind(FALSE,diffmap)), + chr_left=as.numeric(c(chrmap,TRUE)), + chr_right=as.numeric(c(TRUE,chrmap)))) + for(i in c(3,5:9)){ + tmp[,i] <- as.numeric(as.character(unlist(tmp[,i]))) + } + + tmp <- cbind(tmp[,1:5],diffs=apply(tmp[,6:9],1,sum)) + tmp <- tmp[tmp[,6] != 0,] + + tmp <- cbind(tmp[-nrow(tmp),-6],tmp[-1,-6]) + tmp <- tmp[tmp[,2] == tmp[,7] & tmp[,4] == tmp[,9] & tmp[,5] == tmp[,10],] + + output <- as.data.frame(cbind(strain=as.character(unlist(tmp[,4])), + chromosome=as.character(unlist(tmp[,2])), + position_start=as.numeric(as.character(unlist(tmp[,3]))), + position_end=as.numeric(as.character(unlist(tmp[,8]))), + genotype=as.numeric(as.character(unlist(tmp[,5]))))) + output[,1] <- as.character(unlist(output[,1])) + output[,2] <- as.character(unlist(output[,2])) + output[,3] <- as.numeric(as.character(unlist(output[,3]))) + output[,4] <- as.numeric(as.character(unlist(output[,4]))) + + return(output) + } \ No newline at end of file diff --git a/Function_scripts/transcriptomics.age.predictor.R b/Function_scripts/transcriptomics.age.predictor.R new file mode 100644 index 0000000..fe9f03b --- /dev/null +++ b/Function_scripts/transcriptomics.age.predictor.R @@ -0,0 +1,49 @@ +###predicts relative age of C. elegans transcriptomics sample +# Snoek, L. Basten, et al. "A rapid and massive gene expression shift marking adolescent transition in C. elegans." Scientific reports 4 (2014): 3912. + +###input +#df_input (dataframe with columns: SampleID,gene_WBID,log2_intensities) +#predict_lm_fit (a fitted linear model object) + +###output +#dataframe with: SampleID, hours_median,hours_sd,age_relative,age_relative_sd + +###Description +# Please note that only the relative age is reliable for the age-range of 44-68 hours. If your samples are outside of that range, the prediction is +# probably not accurate. The predicated age can only be interpreted relative; only if you have samples for calibration the relative age can be converted + +transcriptomics.age.predictor <- function(df_input,predict_lm_fit=age_predict_lm_fit){ + if(missing(df_input)){ stop("give input dataframe, columns: SampleID,gene_WBID,log2_intensities")} + if(missing(predict_lm_fit)){ stop("requires fitted model timepoint~expression:gene")} + + + genes <- unlist(strsplit(names(predict_lm_fit$coefficients),split=":")) + genes <- genes[grepl("WBID",genes)] + genes <- gsub("gene_WBID","",genes) + + out <- NULL + for(i in unique(df_input$SampleID)){ + sample.nu <- dplyr::filter(df_input,SampleID==i) %>% + dplyr::filter(gene_WBID %in% genes) %>% + dplyr::group_by(gene_WBID) %>% + dplyr::summarise(log2_intensities=median(log2_intensities)) %>% + dplyr::mutate(expression=log2(log2_intensities/mean(log2_intensities))) %>% + dplyr::select(gene_WBID,expression) %>% + dplyr::mutate(gene_WBID=as.character(unlist(gene_WBID))) + + out <- rbind(out, + cbind(SampleID=i,hours_median=median(predict(predict_lm_fit,newdata=sample.nu)),hours_sd=sd(predict(predict_lm_fit,newdata=sample.nu))) + ) + } + + out <- data.frame(out) + out[,1] <- as.character(unlist(out[,1])) + out[,2] <- as.numeric(as.character(unlist(out[,2]))) + out[,3] <- as.numeric(as.character(unlist(out[,3]))) + + out <- mutate(out,age_relative=(hours_median-mean(hours_median))/sd(hours_median),age_relative_sd=hours_sd/sd(hours_median)) + + warning("prediction is accurate for RELATIVE age between 44 - 68 hours of age") + return(out) + } + diff --git a/Function_scripts/transcriptomics.df.to.list.R b/Function_scripts/transcriptomics.df.to.list.R new file mode 100644 index 0000000..ab1fdf9 --- /dev/null +++ b/Function_scripts/transcriptomics.df.to.list.R @@ -0,0 +1,48 @@ +################################################################################################################################################################ +### dataframe.to.list(dataframe,variable.test,transformation.test) v1.3 +### returns a list with a data matrix of the selected data column for use in the lm scripts and a numeric vector of the variables +### obligatory input +### dataframe, output from list.to.dataframe +### variable.test, the names of the columns for which the variables should be generated, simply give the names in a character vector +### transformation.test, the name of the column that should be turned into a data matrix +### v1.2 Modified the function so multiple variables can be extracted at once +### v1.3 Fixed naming the output + +transcriptomics.df.to.list <- function(dataframe,variable.test,transformation.test){ + if(missing(dataframe)){ stop("require dataframe")} + if(missing(variable.test)){ stop("require column names of the variables that should be tested")} + if(missing(transformation.test)){ stop("require column name of the data that should be tested")} + + ###Generate output file + output <- list() + + ###Generate the data file + col.dat <- colnames(dataframe) == transformation.test + + + output[[1]] <- matrix(dataframe[,col.dat],nrow=length(unique(dataframe$SpotID))) + colnames(output[[1]]) <- rep("",times=ncol(output[[1]])) + rownames(output[[1]]) <- unique(dataframe$SpotID) + + ###Generate the variable vectors for lm + col.var <- colnames(dataframe) %in% variable.test + + n <- 2 + for(i in which(col.var)){ + variables <- as.numeric(as.factor(dataframe[seq(1,nrow(dataframe),by=length(unique(dataframe$SpotID))),i])) + variables[variables ==1] <- -1 + variables[variables > 1] <- variables[variables > 1] -1 + names(variables) <- dataframe[seq(1,nrow(dataframe),by=length(unique(dataframe$SpotID))),i] + + output[[n]] <- variables + if(n==2){colnames(output[[1]]) <- names(variables)} + if(n> 2){colnames(output[[1]]) <- paste(colnames(output[[1]]),names(variables),sep=":")} + n <- n+1 + } + + ###name the output + names(output) <- c(transformation.test,colnames(dataframe)[col.var]) + + ###return + return(output) + } diff --git a/Function_scripts/transcriptomics.lm.R b/Function_scripts/transcriptomics.lm.R new file mode 100644 index 0000000..db46286 --- /dev/null +++ b/Function_scripts/transcriptomics.lm.R @@ -0,0 +1,88 @@ +###linear model for transcriptomics + +###input +#lm_data (matrix with transcriptome data) +#lm_factors (dataframe with factors over which data should be explained) +#lm_model (either addtive, interaction, or custom) +#lm_model_custom (in case custom is selected as model, provide the model) + +###output +#list with input data, factors, and outcome of the model + +###Description +# custom models can be written as lm_factors[,1] + lm_factors[,2] + lm_factors[,1] : lm_factors[,3] + + +transcriptomics.lm <- function(lm_data,lm_factors,lm_model,lm_model_custom){ + if(missing(lm_data)){ stop("provide data for the linear model")} + if(missing(lm_factors)){ stop("provide dataframe with the explanatory factors")} + if(missing(lm_model)){ lm_model <- "additive"} + if(!lm_model %in% c("additive","interaction","custom")){ stop("the supported model types are: additive, interaction, and custom")} + if(lm_model == "custom"){ + if(missing(lm_model_custom)){ stop("provide a model in the form of: lm_factors[,1] + lm_factors[,2] + lm_factors[,1] : lm_factors[,3]")} + } + + if(length(dim(lm_factors)) == 0){lm_factors <- matrix(lm_factors,ncol=1); colnames(lm_factors) <- "x"} + + ###constructs the names of the interaction model + interaction_names <- function(output){ + name_matrix <- matrix(colnames(output[[2]]),ncol=ncol(output[[2]]),nrow=ncol(output[[2]])) + tmp <- list() + lm_names <- NULL + + for(j in 1:ncol(output[[2]])){ + if(j == 1){ + tmp[[j]] <- name_matrix + } else { + tmp[[j]] <- name_matrix[-c(1:(j-1)),-c(1:(j-1))] + } + if(j < ncol(output[[2]])){ + for(i in 2:ncol(tmp[[j]])){ + tmp[[j]][,i] <- paste(paste(tmp[[j]][1:(i-1),i],collapse=":"),tmp[[j]][,i],sep=":") + } + } + tmp[[j]][upper.tri(tmp[[j]])] <- NA + } + for(j in 1:ncol(output[[2]])){ + if(j == 1){ + lm_names <- c(lm_names,tmp[[j]][,j]) + } else { + for(i in 1:ncol(output[[2]])){ + if(i <= (ncol(output[[2]])+1-j)){ + lm_names <- c(lm_names,tmp[[i]][,j]) + } + } + } + } + lm_names <- lm_names[!is.na(lm_names)] + return(lm_names) + } + + output <- list() + output[[1]] <- lm_data; if(length(rownames(output[[1]])) == 0){rownames(output[[1]]) <- 1:nrow(output[[1]])} + output[[2]] <- lm_factors; if(length(colnames(output[[2]])) == 0){colnames(output[[2]]) <- 1:ncol(output[[2]])} + + + if(lm_model == "additive"){ + lm_model_custom <- paste("lm(t(output[[1]]) ~ ",paste(paste("output[[2]][,",1:ncol(output[[2]]),"]",sep=""),collapse=" + "),")",sep="") + tmp <- fast.summ.transcriptomics(eval(parse(text=lm_model_custom))) + colnames(tmp) <- c(paste("significance_",colnames(output[[2]]),sep=""),paste("effect_",colnames(output[[2]]),sep="")) + } + if(lm_model == "interaction"){ + lm_model_custom <- paste("lm(t(output[[1]]) ~ ",paste(paste("output[[2]][,",1:ncol(output[[2]]),"]",sep=""),collapse=" * "),")",sep="") + tmp <- fast.summ.transcriptomics(eval(parse(text=lm_model_custom))) + colnames(tmp) <- c(paste("significance_",interaction_names(output),sep=""),paste("effect_",interaction_names(output),sep="")) + } + if(lm_model == "custom"){ + lm_model_custom <- paste("lm(t(output[[1]]) ~ ",lm_model_custom,")",sep="") + tmp <- fast.summ.transcriptomics(eval(parse(text=lm_model_custom))) + } + + + + output[[3]] <- tmp + + names(output) <- c("data","factors","model") + + return(output) + } diff --git a/Function_scripts/transgression.calc.R b/Function_scripts/transgression.calc.R new file mode 100644 index 0000000..8cf87ec --- /dev/null +++ b/Function_scripts/transgression.calc.R @@ -0,0 +1,73 @@ +###function for calculating transgression in eQTL sets +#The method is based on Brem, 2005 and assumes single measurements per strain per trait, except for the parental strains. + +###input +#trait.value: a vector with trait values +#strain: a vector with strain names matching the trait values +#trait: a vector with trait names (e.g you can enter multiple traits this way) +#parental.strain: names of parental strains (only for Keurentjes; otherwise parental strains should be removed from the analysis) +#sigmas: how many sigmas should RILs be beyond the parents to be called significant? (standard is 2) +#n.perm: number of permutations to be done (standard 1000) + +###output +#a dataframe with traits per row and columns with the number of transgressive strains for the actual dataset and for FDR0.05 and 0.01 threshold + + + + ###Transgression, calculated and permutated as in Brem, 2005 + transgression.calc <- function(trait.value,strain,trait,parental.strain,sigmas,n.perm){ + if(missing(trait.value)){ stop("provide trait values")} + if(missing(strain)){ stop("provide strain names")} + if(missing(trait)){ trait <- rep("A",length(trait_value))} + if(missing(parental.strain)){ stop("provide names of two parental strains")} + if(missing(sigmas)){ sigmas <- 2} + if(missing(n.perm)){ n.perm <- 1000} + + ###Convenience function for transgression + transg.sapply <- function(input,parental.strain,sigmas){ + selc.parental1 <- tolower(input$strain) %in% tolower(parental.strain[1]) + selc.parental2 <- tolower(input$strain) %in% tolower(parental.strain[2]) + + VAR.PL <- tapply(input$trait.value[(selc.parental1|selc.parental2)],input$strain[(selc.parental1|selc.parental2)],var) + N.PL <- tapply(input$trait.value[(selc.parental1|selc.parental2)],input$strain[(selc.parental1|selc.parental2)],length) + SD.PL <- (sum(VAR.PL*(N.PL-1))/sum((N.PL-1)))^2 + + MEAN.PL1 <- mean(input$trait.value[selc.parental1],na.rm=T) + MEAN.PL2 <- mean(input$trait.value[selc.parental2],na.rm=T) + + tmp <- tapply(input$trait.value[!(selc.parental1|selc.parental2)],input$strain[!(selc.parental1|selc.parental2)],mean,na.rm=T) + + transg <- sum(tmp < (min(c(MEAN.PL1,MEAN.PL2))-sigmas*SD.PL) | + tmp > (max(c(MEAN.PL1,MEAN.PL2))+sigmas*SD.PL)) + + return(transg) + } + + ###Prep the data + input <- data.frame(cbind(trait.value=trait.value,strain=strain)) + input$trait.value <- as.numeric(as.character(unlist(input$trait.value))) + input$strain <- as.character(unlist(input$strain)) + + input <- split(input,trait) + + transg <- sapply(input,transg.sapply,parental.strain = parental.strain, sigmas = sigmas) + + perm <- NULL + for(i in 1:n.perm){ + input.perm <- lapply(input,function(x){x[,1] <- x[order(runif(nrow(x))),1];return(x)}) + transg.perm <- sapply(input.perm,transg.sapply,parental.strain = parental.strain, sigmas = sigmas) + perm <- rbind(perm,transg.perm) + } + FDR0.1 <- apply(perm,2,function(x){sort(x)[ceiling(n.perm*0.90)]}) + FDR0.05 <- apply(perm,2,function(x){sort(x)[ceiling(n.perm*0.95)]}) + FDR0.01 <- apply(perm,2,function(x){sort(x)[ceiling(n.perm*0.99)]}) + + output <- as.data.frame(cbind(trait=names(input),n_strains_transgression=transg,FDR0.1=FDR0.1,FDR0.05=FDR0.05,FDR0.01=FDR0.01)) + output[,1] <- as.character(unlist(output[,1])) + output[,2] <- as.numeric(as.character(unlist(output[,2]))) + output[,3] <- as.numeric(as.character(unlist(output[,3]))) + output[,4] <- as.numeric(as.character(unlist(output[,4]))) + output[,5] <- as.numeric(as.character(unlist(output[,5]))) + return(output) + } + diff --git a/mainscript_aS_eQTL.R b/mainscript_aS_eQTL.R index 9c61bda..5a0743d 100644 --- a/mainscript_aS_eQTL.R +++ b/mainscript_aS_eQTL.R @@ -20,8 +20,8 @@ } setwd(workwd) - source(paste(git_dir,"/NEMA_functions/Loader.R",sep="")) - NEMA.get.functions("mainscript_aS_eQTL.R") + #source(paste(git_dir,"/NEMA_functions/Loader.R",sep="")) + #NEMA.get.functions("mainscript_aS_eQTL.R",NEMA_functions_dir = "C:/Users/sterk009/OneDrive - Wageningen University & Research/Shared_data_Mark/Projects_R_Zone/Git/NEMA_functions") # Packages ---------------------------------------------------------------- -- GitLab