Skip to content
Snippets Groups Projects
Commit adce632a authored by Hartanto, Margi's avatar Hartanto, Margi
Browse files

add functions folder and fix horizontal threshold on eQTL histogram

parent 07c1b041
No related branches found
No related tags found
No related merge requests found
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
.Ruserdata .Ruserdata
figures figures
files files
functions
networks networks
qtl-peaks qtl-peaks
qtl-permutations qtl-permutations
......
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
library(UpSetR) library(UpSetR)
library(forcats) library(forcats)
library(grid) library(grid)
library(heritability) library(heritability)
library(topGO) library(topGO)
library(org.At.tair.db) library(org.At.tair.db)
...@@ -33,9 +32,6 @@ ...@@ -33,9 +32,6 @@
# library(factoextra) # library(factoextra)
# library(stats) # library(stats)
# library(amap) # library(amap)
#
#
#
# library(gridExtra) # library(gridExtra)
# library(RCy3) # library(RCy3)
# library(corrplot) # library(corrplot)
...@@ -344,8 +340,8 @@ ...@@ -344,8 +340,8 @@
ylim(c(0, 100)) + ylim(c(0, 100)) +
geom_hline(data = hist.threshold, aes(yintercept = threshold), geom_hline(data = hist.threshold, aes(yintercept = threshold),
linetype = 'dashed', linetype = 'dashed',
color = 'red', color = 'black',
size = .1) + size = .5) +
scale_x_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40)) scale_x_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40))
tiff(file = paste0("figures/eqtl-hist.tiff"), tiff(file = paste0("figures/eqtl-hist.tiff"),
......
...@@ -92,7 +92,6 @@ ...@@ -92,7 +92,6 @@
geom_histogram(binwidth = 2000000, right = T, origin = 0, alpha = 1) + geom_histogram(binwidth = 2000000, right = T, origin = 0, alpha = 1) +
#geom_density(alpha = 0.5) + #geom_density(alpha = 0.5) +
facet_grid(factor(qtl_level, level = c('phQTL', 'mQTL', 'eQTL')) ~ qtl_chromosome, scales="free") + facet_grid(factor(qtl_level, level = c('phQTL', 'mQTL', 'eQTL')) ~ qtl_chromosome, scales="free") +
presentation + presentation +
scale_fill_manual(values = c('#000000', '#ccbb44', '#228833', '#4477aa', '#cc3311')) + scale_fill_manual(values = c('#000000', '#ccbb44', '#228833', '#4477aa', '#cc3311')) +
theme(legend.position = "top") + theme(legend.position = "top") +
......
###Linear models
###Fast summary function
###This function cannot handle NA's!
fast.summ <- function(input){
p <- input$rank
rdf <- input$df.residual
Qr <- input$qr
n <- nrow(Qr$qr)
p1 <- 1L:p
r <- input$residuals
rss <- colSums(r^2)
resvar <- rss/rdf
R <- chol2inv(Qr$qr[p1, p1, drop = FALSE])
se <- sqrt(rep(diag(R),each=length(resvar)) * rep(resvar,times=length(diag(R))))
est <- input$coefficients[Qr$pivot[p1],]
tval <- t(t(est)/se)
pvals <- round(-log10(2 * pt(abs(tval),rdf, lower.tail = FALSE)),digits=2)
output <- cbind(t(pvals)[,-1],t(round(est,digits=5))[,-1])
return(output)
}
###Fast linear model, 1 variable
###Uses fast.summ, can't handle NA's!
fast.lm.1 <- function(traits,variable){
if(missing(traits)){ stop("specify traits")}
if(missing(variable)){ stop("specify variable")}
output <- fast.summ(lm(t(traits)~variable))
return(output)
}
###Fast linear model, 2 variables, additive
fast.lm.2.add <- function(variable1,traits,variable2){
if(missing(traits)){ stop("specify traits")}
if(missing(variable1)){ stop("specify variable1")}
if(missing(variable2)){ stop("specify variable2")}
output <- fast.summ(lm(t(traits)~variable1+variable2))
if(dim(output)[2]==2){
output <- cbind(output[,1],NA,output[,2],NA)
}
if(dim(output)[2]==4){
colnames(output) <- c("pval1","pval2","eff1","eff2")
return(output)
}
}
###Fast linear model, 2 variables, interaction
fast.lm.2.int <- function(variable1,traits,variable2){
if(missing(traits)){ stop("specify traits")}
if(missing(variable1)){ stop("specify variable1")}
if(missing(variable2)){ stop("specify variable2")}
output <- fast.summ(lm(t(traits)~variable1*variable2))
if(dim(output)[2]==2){
output <- cbind(output[,1],NA,NA,output[,2],NA,NA)
}
if(dim(output)[2]==4){
output <- cbind(output[,1:2],NA,output[,3:4],NA)
}
if(dim(output)[2]==6){
colnames(output) <- c("pval1","pval2","pvali","eff1","eff2","effi")
return(output)
}
}
###Slow linear model for NA handling
lm.1 <- function(data.input,variable){
lm.tapply <- function(y,x){return(c(diff(tapply(y[x!=0 & !is.na(x)],x[x!=0 & !is.na(x)],mean,na.rm=T)),cor.test(y[x!=0 & !is.na(x)],x[x!=0 & !is.na(x)],na.action=na.exclude())$p.value))}
out <- tapply(as.numeric(t(data.input)),rep(rownames(data.input),each=ncol(data.input)),lm.tapply,x=variable)
out <- unlist(out)
out <- cbind(LOD=-log10(out[seq(2,length(out),by=2)]),effect=out[seq(1,length(out),by=2)])
return(out)
}
###Permutation function
permutate.traits <- function(trait.matrix){
perm.list <- rep(c(1:nrow(trait.matrix)),times=ncol(trait.matrix)) + runif(length(trait.matrix),0.25,0.75)
perm.list <- order(perm.list)
perm.data <- matrix(as.numeric(trait.matrix)[perm.list],nrow=nrow(trait.matrix),ncol=ncol(trait.matrix),byrow=TRUE)
return(perm.data)
}
###calculates R-squared as in a linear model
R.sq <- function(x,y){return(cor(x,y,use="na.or.complete")^2)}
###Wrongly labeled sample function, takes eQTL table output
###v1.1 Fixed marker calling, now markers are re-calculated based on the population used (so the known genetic
### information about the population).
eQTL.WLS <- function(eQTL.table.output,mean.centered.data,strain.map,strain.marker){
if(missing(eQTL.table.output)){ stop("need output from eQTL.table")}
if(missing(mean.centered.data)){ stop("need mean centered data: columns: trait, strain, ratio, sample")}
if(missing(strain.map)){ stop("strain.map missing; need the population map")}
if(missing(strain.marker)){ stop("strain.marker missing; need information about the markers: name, chromosome, and basepair")}
###Force strain.marker column names into format
if(ncol(strain.marker) != 3){ stop("need 3 columns in strain.marker")}
colnames(strain.marker) <- c("name","chromosome","basepair")
strain.marker <- mutate(strain.marker,marker=1:nrow(strain.map))
###Force mean.centered.data column names into format
if(ncol(mean.centered.data) != 4){ stop("need 4 columns in mean.centered.data")}
colnames(mean.centered.data) <- c("trait", "strain", "ratio", "sample")
###Convert genotypes to list
geno.list <- mutate(as.data.frame(strain.map),marker=c(1:nrow(strain.map))) %>%
gather(key=strain_genotype,value=genotype,-marker) %>%
filter(!is.na(genotype))
###Take only the cis eQTL
eQTL.table.output <- filter(eQTL.table.output,qtl_type=="cis") %>%
select(trait,qtl_chromosome,qtl_bp,qtl_significance,qtl_effect,gene_chromosome,gene_bp,gene_WBID) %>%
merge(strain.marker,by.x=2,by.y=2) %>%
group_by(trait,qtl_significance,qtl_effect) %>%
mutate(closeness=abs(qtl_bp-basepair)) %>%
mutate(closest=ifelse(closeness==min(closeness),"yep","nope")) %>%
filter(!duplicated(closest),closest=="yep") %>%
as.data.frame() %>%
select(marker,trait,qtl_chromosome,qtl_bp,qtl_significance,qtl_effect,gene_chromosome,gene_bp,gene_WBID) %>%
merge(mean.centered.data)
###Use the cis QTL to calculate the correlation between the predicted (eQTL) effect versus the expression in the strain
out <- NULL
for(i in 1:length(unique(geno.list$strain_genotype))){
out.tmp <- merge(eQTL.table.output,filter(geno.list,strain_genotype==unique(geno.list$strain_genotype)[i])) %>%
group_by(sample,strain,strain_genotype) %>%
summarise(correlation = cor(x=genotype*ratio,y=qtl_effect,use="complete.obs")) %>%
as.data.frame()
out <- rbind(out,out.tmp)
}
out <- group_by(out,sample) %>%
mutate(rank.map=length(correlation)-rank(correlation)+1) %>%
as.data.frame()
###Return the correlations
return(out)
}
###Plotting function for WLS data
plot.eQTL.WLS <- function(eQTL.WLS.output,filename){
if(missing(filename )){ filename <- "WLS"}
###Write a pdf with the correlations for manual inspection
pdf(paste(filename,".pdf",sep=""),width=18,height=8)
for(i in 1:length(unique(eQTL.WLS.output$sample))){
to.plot <- filter(eQTL.WLS.output,sample==unique(eQTL.WLS.output$sample)[i])
plot(y=to.plot$correlation,x=to.plot$rank.map,main=unique(eQTL.WLS.output$sample)[i],axes=F,xlab="Predicted strain",ylab="Correlation",pch=19,col=rgb(0,0,0,alpha=0.5))
axis(1,c(1:nrow(to.plot)),labels=to.plot$strain_genotype[order(to.plot$rank.map)],las=2,cex.axis=0.5)
axis(2,c(-1,-0.5,0,0.5,1),c(-1,-0.5,0,0.5,1),las=2)
text(y=seq(max(to.plot$correlation),mean(to.plot$correlation),length=10),x=0.75*nrow(to.plot),labels=apply(to.plot[order(to.plot$rank.map),][1:10,c(3,4)],1,paste,collapse=" "))
box()
}
dev.off()
}
###Data preparation
QTL.data.prep <- function(trait.matrix,strain.trait,strain.map,strain.marker){
if(missing(trait.matrix)){ stop("requires trait data, traits per row and strains per column")}
if(missing(strain.trait)){ stop("requires the strain names matching the trait data")}
if(missing(strain.map)){ stop("requires the genetic map of the population")}
if(missing(strain.marker)){ stop("requires the marker info (name, chromosome, basepair) of the map")}
if(ncol(trait.matrix) != length(strain.trait)){ stop("the number of strains does not correspond with the trait matrix")}
if(sum(tolower(strain.trait) %in% tolower(colnames(strain.map))) < length(strain.trait)){
warning("not all strains are in the strain.map file")}
if(nrow(strain.map) != nrow(strain.marker)){ stop("the number of markers (strain.map) does not equal the number of marker descriptions (strain.marker)")}
if(ncol(strain.marker) != 3){ stop("make sure the strain.marker file contains a column with the marker name, chromosome, and bp location")}
strain.map <- data.matrix(strain.map)
###Make matrix
map.nu <- matrix(NA,nrow=nrow(strain.map),ncol=length(strain.trait))
for(j in 1:length(strain.trait)){
map.nu[,j] <- strain.map[,tolower(colnames(strain.map)) == tolower(strain.trait[j])]
}
colnames(map.nu) <- strain.trait
rownames(map.nu) <- rownames(strain.map)
###add rownames to the trait matrix
if(length(rownames(trait.matrix)) == 0){
rownames(trait.matrix) <- 1:nrow(trait.matrix)
}
###Make output
output <- NULL
output[[1]] <- trait.matrix
output[[2]] <- map.nu
output[[3]] <- data.frame(strain.marker)
names(output) <- c("Trait","Map","Marker")
###return output
return(output)
}
\ No newline at end of file
###calculates H2 based on REML (part of H2.broad)
H2.REML <- function(input,Vg.factor){
if(missing(Vg.factor)){Vg.factor <- 1}
REML <- lmer(trait.value ~ 1 + (1|strain),data=input)
Variations <- as.data.frame(VarCorr(REML))$vcov
H2 <- c((Variations[1]*Vg.factor)/sum(Variations*c(Vg.factor,1)),(Variations*c(Vg.factor,1)))
names(H2) <- c("H2_REML","Vg","Ve")
return(H2)
}
###Calculates H2 based on ANOVA (part of H2.broad)
H2.ANOVA <- function(input,Vg.factor){
if(missing(Vg.factor)){Vg.factor <- 1}
ANOVA <- anova(lm(trait.value~strain,data=input))
Variations <- ANOVA[[2]]
H2 <- c((Variations[1]*Vg.factor)/sum(Variations*c(Vg.factor,1)),(Variations*c(Vg.factor,1)))
names(H2) <- c("H2_ANOVA","Vg","Ve")
return(H2)
}
###Calculates H2 as described in Keurentjes, 2007, in case error has to be estimated on limited number of strains
###Permutation is based on Brem, 2005
H2.Keurentjes <- function(input,parental.strain,Vg.factor){
if(missing(Vg.factor)){Vg.factor <- 1}
selc.parental <- input$strain %in% parental.strain
###Ve is estimated based on parental replicates
VAR.PL <- tapply(input$trait.value[selc.parental],as.character(unlist(input$strain[selc.parental])),var)
N.PL <- tapply(input$trait.value[selc.parental],as.character(unlist(input$strain[selc.parental])),length)
VAR.E <- sum(VAR.PL*(N.PL-1))/sum((N.PL-1))
##Vt is estimated based on RILs
VAR.T <- var(input$trait.value[!selc.parental])
H2 <- c(((VAR.T-VAR.E)*Vg.factor)/VAR.T,((VAR.T-VAR.E)*Vg.factor),VAR.E)
names(H2) <- c("H2_keurentjes","Vg","Ve")
return(H2)
}
###Wrapper function for broad-sense heritability
H2.broad <- function(trait.value,strain,trait,method,parental.strain,n.perm,Vg.factor){
if(missing(trait.value)){ stop("requires trait value")}
if(missing(strain)){ stop("requires strain values")}
if(missing(trait)){ trait <- rep("A",length(trait.value))}
if(missing(method)){ method <- "ANOVA"}
if(!method %in% c("ANOVA","REML","Keurentjes_2007")){ stop("method should be either ANOVA, REML, or Keurentjes_2007")}
if(method=="Keurentjes_2007"){
if(missing(parental.strain)){ stop("when using Keurentjes_2007, parental strain names should be provided")}
if(sum(parental.strain %in% strain) != length(parental.strain)){
stop("not all parental strains are in the data")}
}
if(missing(n.perm)){ n.perm <- 1000}
if(missing(Vg.factor)){ Vg.factor <- 1}
###Prep the data
input <- data.frame(cbind(trait.value=trait.value,strain=strain))
input$trait.value <- as.numeric(as.character(unlist(input$trait.value)))
input$strain <- as.factor(as.character(unlist(input$strain)))
input <- split(input,trait)
if(method == "ANOVA"){
H2 <- t(sapply(input,H2.ANOVA,Vg.factor))
perm <- NULL
for(i in 1:n.perm){
input.perm <- lapply(input,function(x){x[,1] <- x[order(runif(nrow(x))),1];return(x)})
H2.perm <- t(sapply(input.perm,H2.ANOVA,Vg.factor))
perm <- rbind(perm,H2.perm[,1])
}
H2 <- cbind(H2,FDR0.05_ANOVA=apply(perm,2,function(x){sort(x)[ceiling(n.perm*0.95)]}))
}
if(method == "REML"){
H2 <- t(sapply(input,H2.REML,Vg.factor))
perm <- NULL
for(i in 1:n.perm){
input.perm <- lapply(input,function(x){x[,1] <- x[order(runif(nrow(x))),1];return(x)})
H2.perm <- t(sapply(input.perm,H2.REML,Vg.factor))
perm <- rbind(perm,H2.perm[,1])
}
H2 <- cbind(H2,FDR0.05_REML=apply(perm,2,function(x){sort(x)[ceiling(n.perm*0.95)]}))
}
if(method == "Keurentjes_2007"){
H2 <- t(sapply(input,H2.Keurentjes,parental.strain = parental.strain,Vg.factor))
perm <- NULL
for(i in 1:n.perm){
input.perm <- lapply(input,function(x){x[,1] <- x[order(runif(nrow(x))),1];return(x)})
H2.perm <- t(sapply(input.perm,H2.Keurentjes,parental.strain = parental.strain,Vg.factor))
perm <- rbind(perm,H2.perm[,1])
}
H2 <- cbind(H2,FDR0.05_Keurentjes=apply(perm,2,function(x){sort(x)[ceiling(n.perm*0.95)]}))
}
output <- as.data.frame(cbind(trait=names(input),round(H2,digits=3)))
output[,1] <- as.character(unlist(output[,1]))
output[,2] <- as.numeric(as.character(unlist(output[,2])))
output[,3] <- as.numeric(as.character(unlist(output[,3])))
output[,4] <- as.numeric(as.character(unlist(output[,4])))
output[,5] <- as.numeric(as.character(unlist(output[,5])))
return(output)
}
###Transgression, calculated and permutated as in Brem, 2005
transgression.calc <- function(trait.value,strain,trait,parental.strain,sigmas,n.perm){
if(missing(trait.value)){ stop("provide trait values")}
if(missing(strain)){ stop("provide strain names")}
if(missing(trait)){ trait <- rep("A",length(trait_value))}
if(missing(parental.strain)){ stop("provide names of two parental strains")}
if(missing(sigmas)){ sigmas <- 2}
if(missing(n.perm)){ n.perm <- 1000}
###Convenience function for transgression
transg.sapply <- function(input,parental.strain,sigmas){
selc.parental1 <- tolower(input$strain) %in% tolower(parental.strain[1])
selc.parental2 <- tolower(input$strain) %in% tolower(parental.strain[2])
VAR.PL <- tapply(input$trait.value[(selc.parental1|selc.parental2)],input$strain[(selc.parental1|selc.parental2)],var)
N.PL <- tapply(input$trait.value[(selc.parental1|selc.parental2)],input$strain[(selc.parental1|selc.parental2)],length)
SD.PL <- (sum(VAR.PL*(N.PL-1))/sum((N.PL-1)))^2
MEAN.PL1 <- mean(input$trait.value[selc.parental1],na.rm=T)
MEAN.PL2 <- mean(input$trait.value[selc.parental2],na.rm=T)
tmp <- tapply(input$trait.value[!(selc.parental1|selc.parental2)],input$strain[!(selc.parental1|selc.parental2)],mean,na.rm=T)
transg <- sum(tmp < (min(c(MEAN.PL1,MEAN.PL2))-sigmas*SD.PL) |
tmp > (max(c(MEAN.PL1,MEAN.PL2))+sigmas*SD.PL))
return(transg)
}
###Prep the data
input <- data.frame(cbind(trait.value=trait.value,strain=strain))
input$trait.value <- as.numeric(as.character(unlist(input$trait.value)))
input$strain <- as.character(unlist(input$strain))
input <- split(input,trait)
transg <- sapply(input,transg.sapply,parental.strain = parental.strain, sigmas = sigmas)
perm <- NULL
for(i in 1:n.perm){
input.perm <- lapply(input,function(x){x[,1] <- x[order(runif(nrow(x))),1];return(x)})
transg.perm <- sapply(input.perm,transg.sapply,parental.strain = parental.strain, sigmas = sigmas)
perm <- rbind(perm,transg.perm)
}
FDR0.1 <- apply(perm,2,function(x){sort(x)[ceiling(n.perm*0.90)]})
FDR0.05 <- apply(perm,2,function(x){sort(x)[ceiling(n.perm*0.95)]})
FDR0.01 <- apply(perm,2,function(x){sort(x)[ceiling(n.perm*0.99)]})
output <- as.data.frame(cbind(trait=names(input),n_strains_transgression=transg,FDR0.1=FDR0.1,FDR0.05=FDR0.05,FDR0.01=FDR0.01))
output[,1] <- as.character(unlist(output[,1]))
output[,2] <- as.numeric(as.character(unlist(output[,2])))
output[,3] <- as.numeric(as.character(unlist(output[,3])))
output[,4] <- as.numeric(as.character(unlist(output[,4])))
output[,5] <- as.numeric(as.character(unlist(output[,5])))
return(output)
}
###Narrow sense heritabilities as calculated in Rockman, 2010. (see also Speed et al, 2012, American journal of human genetics).
h2.emma <- function(input,strain.map,kinship.matrix,Vg.factor){
if(missing(input)){ stop("provide a vector with the phenotype, matching the strain.map")}
if(missing(strain.map)){ stop("provide a matrix with genotypes (encoded as -1, 1), columns strains, rows markers")}
if(ncol(strain.map) != length(input)){ stop("phenotype vector does not match strain.map")}
if(missing(kinship.matrix)){
strain.map[strain.map==0 & !is.na(strain.map)] <- 0.5
strain.map[strain.map==-1 & !is.na(strain.map)] <- 0
strain.map[is.na(strain.map)] <- 0.5
kinship.matrix <- emma.kinship(snps=strain.map)
}
if(missing(Vg.factor)){ Vg.factor <- 1}
###Calculate Vg and Ve
emmaREML <- emma.REML.t(ys=input,xs=strain.map,K=kinship.matrix)
###Calculate heritability
Vg <- sum(emmaREML$vgs,na.rm=T)
Ve <- sum(emmaREML$ves,na.rm=T)
h2 <- c((Vg*Vg.factor)/((Vg*Vg.factor)+Ve),Vg,Ve)
names(h2) <- c("h2_emma","Vg","Ve")
return(h2)
}
###narrow sense heritabilities as calculated by heritability package, from Kruijer, Boer & Malosetti, 2015
h2.REML <- function(input,strain.names,kinship.matrix,Vg.factor){
if(missing(input)){ stop("provide a vector with the phenotype, matching the strain.map")}
if(missing(strain.names)){ stop("provide a matrix with strain names matching the phenotype")}
if(length(strain.names) != length(input)){ stop("phenotype vector does not match strain.map")}
if(missing(Vg.factor)){ Vg.factor <- 1}
###Calculate Vg and Ve
markerREML <- tryCatch({marker_h2(data.vector=input,geno.vector=strain.names,K=kinship.matrix,max.iter=40)},
error=function(e){cat("ERROR :",conditionMessage(e), "\n"); markerREML <- data.frame(cbind(va=NA,ve=NA)); return(markerREML)})
###Calculate heritability
Vg <- markerREML$va
Ve <- markerREML$ve
h2 <- c((Vg*Vg.factor)/((Vg*Vg.factor)+Ve),Vg,Ve)
names(h2) <- c("h2_REML","Vg","Ve")
return(h2)
}
h2.narrow <- function(map1.output,method,n.perm,Vg.factor,small){
if(missing(map1.output)){ stop("requires map1 file")}
if(missing(method)){ method <- "REML"}
if(!method %in% c("emma","REML")){ stop("only emma and REML are currently supported")}
if(missing(n.perm)){ n.perm <- 1000}
if(missing(Vg.factor)){ Vg.factor <- 1}
if(missing(small)){ small <- FALSE}
if(class(map1.output[[1]])=="numeric"){
small <- TRUE
}
if(small){
map1.output$Trait[2,] <- map1.output$Trait[1,]
}
###start calculating
if(method == "emma"){
###calculate kinship (handy in case of many traits)
tmp <- map1.output$Map
tmp[tmp==0 & !is.na(tmp)] <- 0.5
tmp[tmp==-1 & !is.na(tmp)] <- 0
tmp[is.na(tmp)] <- 0.5
kinship.matrix <- emma.kinship(snps=tmp)
h2 <- t(apply(map1.output$Trait,1,h2.emma,strain.map=map1.output$Map,kinship.matrix=kinship.matrix,Vg.factor=Vg.factor))
###permutation
perm <- NULL
map1.perm <- as.list(NULL)
map1.perm[[1]] <- map1.output$Trait
map1.perm[[2]] <- map1.output$Map
names(map1.perm) <- c("Trait","Map")
for(i in 1:n.perm){
map1.perm$Trait <- t(apply(map1.perm$Trait,1,function(x){x <- x[order(runif(length(x)))];return(x)}))
h2.perm <- t(apply(map1.perm$Trait,1,h2.emma,strain.map=map1.perm$Map,kinship.matrix=kinship.matrix,Vg.factor=Vg.factor))
perm <- rbind(perm,h2.perm[,1])
}
h2 <- cbind(h2,FDR0.05_emma=apply(perm,2,function(x){sort(x)[ceiling(n.perm*0.95)]}))
}
if(method == "REML"){
###calculate kinship (handy in case of many traits)
tmp <- map1.output$Map
tmp[tmp==0 & !is.na(tmp)] <- 0.5
tmp[tmp==-1 & !is.na(tmp)] <- 0
tmp[is.na(tmp)] <- 0.5
kinship.matrix <- emma.kinship(snps=tmp)
rownames(kinship.matrix) <- colnames(map1.output$Map)
colnames(kinship.matrix) <- colnames(map1.output$Map)
h2 <- t(apply(map1.output$Trait,1,h2.REML,strain.names=colnames(map1.output$Map),kinship.matrix=kinship.matrix,Vg.factor=Vg.factor))
###permutation
perm <- NULL
map1.perm <- as.list(NULL)
map1.perm[[1]] <- map1.output$Trait
map1.perm[[2]] <- map1.output$Map
names(map1.perm) <- c("Trait","Map")
for(i in 1:n.perm){
map1.perm$Trait <- t(apply(map1.perm$Trait,1,function(x){x <- x[order(runif(length(x)))];return(x)}))
h2.perm <- t(apply(map1.perm$Trait,1,h2.REML,strain.names=colnames(map1.perm$Map),kinship.matrix=kinship.matrix,Vg.factor=Vg.factor))
perm <- rbind(perm,h2.perm[,1])
}
h2 <- cbind(h2,FDR0.05_REML=apply(perm,2,function(x){sort(x,na.last = FALSE)[ceiling(n.perm*0.95)]}))
}
output <- as.data.frame(cbind(trait=rownames(h2),round(h2,digits=5)))
output[,1] <- as.character(unlist(output[,1]))
output[,2] <- as.numeric(as.character(unlist(output[,2])))
output[,3] <- as.numeric(as.character(unlist(output[,3])))
output[,4] <- as.numeric(as.character(unlist(output[,4])))
output[,5] <- as.numeric(as.character(unlist(output[,5])))
if(small){
output <- output[1,]
}
return(output)
}
\ No newline at end of file
This diff is collapsed.
###interactions between multiple peaks
map.1.interactions <- function(eQTL.table.file,map1.output){
if(missing(eQTL.table.file)){ stop("needs output from eQTL.table")}
if(missing(map1.output)){ stop("needs the map1 output file")}
traits <- eQTL.table.file$trait[duplicated(eQTL.table.file$trait)]
output <- NULL
for(i in 1:length(traits)){
traitrow <- which(rownames(map1.output$Trait) == traits[i])
maprows <- eQTL.table.file$qtl_peak[eQTL.table.file$trait == traits[i]]
lm_model_additive <- paste("lm(map1.output$Trait[",traitrow,",] ~ ",paste(paste("map1.output$Map[",maprows,",]",sep=""),collapse=" + "),")",sep="")
lm_model_interaction <- paste("lm(map1.output$Trait[",traitrow,",] ~ ",paste(paste("map1.output$Map[",maprows,",]",sep=""),collapse=" * "),")",sep="")
output_additive <- summary(eval(parse(text=lm_model_additive)))
output_interaction <- summary(eval(parse(text=lm_model_interaction)))
output <- rbind(output,cbind(trait=traits[i],model="additive",markers=rownames(output_additive[[4]])[-1],effect=output_additive[[4]][,1][-1],significance=output_additive[[4]][,4][-1],Rsquared=output_additive[[9]]))
output <- rbind(output,cbind(trait=traits[i],model="interaction",markers=rownames(output_interaction[[4]])[-1],effect=output_interaction[[4]][,1][-1],significance=output_interaction[[4]][,4][-1],Rsquared=output_interaction[[9]]))
}
output[,3] <- gsub("map1.output$Map[","",output[,3],fixed=TRUE)
output[,3] <- gsub(", ]","",output[,3],fixed=TRUE)
rownames(output) <- c(1:nrow(output))
output <- data.frame(output)
for(i in c(1:3)){output[,i] <- as.character(unlist(output[,i]))}
for(i in c(4:6)){output[,i] <- as.numeric(as.character(unlist(output[,i])))}
output <- cbind(output,p_val=-log10(output[,5]))
return(output)
}
\ No newline at end of file
###Permutation 1 marker mapping, genotypes per column, traits per row
map.1.perm <- function(trait.matrix,strain.map,strain.marker,n.perm){
if(missing(trait.matrix)){ stop("specify trait.matrix, matrix with traits in rows and genotypes in columns")}
if(missing(strain.map)){ stop("specify strain.map (genetic map)")}
if(ncol(strain.map) != dim(as.matrix(trait.matrix))[2] &
ncol(strain.map) != prod(dim(as.matrix(trait.matrix)))){ stop("number of strains is not equal to number of genotypes in map")}
if(missing(strain.marker)){ strain.marker <- data.frame(cbind(name=1:dim(strain.map)[1],chromosome=NA,bp=1:dim(strain.map)[1]))
rownames(strain.marker) <- 1:dim(strain.map)[1]}
if(missing(n.perm)){ n.perm <- 1000}
if(n.perm == 1 & dim(as.matrix(trait.matrix))[2] ==1){ stop("cannot conduct 1 permutation on 1 trait, set n.perm > 1")}
NAs <- sum(is.na(trait.matrix))>0 | sum(is.na(strain.map))>0
small <- FALSE
if(dim(as.matrix(trait.matrix))[1] ==1){
trait.matrix <- rbind(trait.matrix,1)
small <- TRUE
}
trait.matrix <- trait.matrix[rep(1:nrow(trait.matrix),each=n.perm),]
perm.data <- permutate.traits(trait.matrix)
rownames(perm.data) <- paste(rownames(trait.matrix),1:n.perm,sep="_")
if(small){
perm.data <- perm.data[1:n.perm,]
}
eff.out <- matrix(NA,nrow(trait.matrix),nrow(strain.map))
pval.out <- matrix(NA,nrow(trait.matrix),nrow(strain.map))
for(i in 1:nrow(strain.map)){
noseg <- length(unique(strain.map[i,])) ==1
if(noseg){
output.tmp <- matrix(c(0,0),byrow=TRUE,ncol=2,nrow=nrow(perm.data))
}
if( i == 1 & !noseg){
if(!NAs){output.tmp <- fast.lm.1(perm.data,strain.map[i,])}
if(NAs){output.tmp <- lm.1(perm.data,strain.map[i,])}
}
if( i != 1 & sum(abs(strain.map[i-1,]-strain.map[i,]),na.rm=T) != 0 & !noseg){
if(!NAs){output.tmp <- fast.lm.1(perm.data,strain.map[i,])}
if(NAs){output.tmp <- lm.1(perm.data,strain.map[i,])}
}
if( i != 1 & sum(abs(strain.map[i-1,]-strain.map[i,]),na.rm=T) == 0 & !noseg){
output.tmp <- output.tmp
}
eff.out[,i] <- output.tmp[,2]
pval.out[,i] <- output.tmp[,1]
}
colnames(eff.out) <- rownames(strain.marker)
rownames(eff.out) <- rownames(trait.matrix)
colnames(pval.out) <- rownames(strain.marker)
rownames(pval.out) <- rownames(trait.matrix)
output <- NULL; output <- as.list(output)
output[[1]] <- round(pval.out,digits=2)
output[[2]] <- round(eff.out,digits=3)
output[[3]] <- perm.data
output[[4]] <- strain.map
output[[5]] <- strain.marker
names(output) <- c("LOD","Effect","Trait_perm","Map","Marker")
return(output)
}
###Based on Benjamini and Yakuteili multiple testing under dependency
map.perm.fdr <- function(map1.output,filenames.perm,FDR_dir,q.value,small){
if(missing(map1.output)){ stop("Missing map1.output ([[LOD]],[[Effect]],[[Trait]],[[Map]],[[Marker]])")}
if(missing(filenames.perm)){ stop("Need a vector with filenames for permutation")}
if(missing(FDR_dir)){ FDR_dir <- getwd()}
if(missing(q.value)){ q.value <- 0.025}
if(missing(small)){ small <- FALSE}
output <- as.list(NULL)
output[[2]] <- matrix(0,ncol=length(filenames.perm),nrow=5000)
rownames(output[[2]]) <- seq(0.1,500,by=0.1)
colnames(output[[2]]) <- filenames.perm
###in case of one trait
if(small){
if(length(filenames.perm) > 1){
stop("needs one permutatation file, with n permutations")
}
tmp <- load(file=paste(FDR_dir,filenames.perm,sep=""))
###No NA's
tmp <- get(tmp)[[1]]
tmp[is.na(tmp)] <- 0.1
###no Non-traits
tmp <- tmp[rownames(tmp) != "",]
output[[2]] <- apply(tmp,1,max,na.rm=T)
if((length(output[[2]]) * q.value) < 10){
warning(paste("inadvisable to calculate q=",q.value,"based on ",length(output[[2]])," permutations. Increase to at least ", ceiling(10/q.value)))
}
###take the q-cutoff, *10 is to be consistent with eQTL methodology (is per 0.1)
output[[1]] <- rev(sort(output[[2]]))[ceiling(q.value*length(output[[2]]))]*10
###again, to be consistent;
output[[3]] <- mean(output[[2]],na.rm = TRUE)
###real data
output[[4]] <- max(map1.output$LOD,na.rm=TRUE)
names(output) <- c("Significance_threshold","False_discoveries","False_discoveries_average","Real_discoveries")
}
if(!small){
for(i in 1:length(filenames.perm)){
tmp <- load(file=paste(FDR_dir,filenames.perm[i],sep=""))
###No NA's
tmp <- get(tmp)[[1]]
tmp[is.na(tmp)] <- 0.1
tmp <- table(apply(round(tmp,digits=1),1,max,na.rm=T))
output[[2]][(as.numeric(names(tmp))*10),i] <- tmp
output[[2]][,i] <- rev(cumsum(rev(output[[2]][,i])))
}
###Take the average over permutations
output[[3]] <- apply(output[[2]],1,mean)
###Calculate the real discoveries, no NA's
tmp <- map1.output$LOD
tmp[is.na(tmp)] <- 0.1
RDS.tmp <- table(round(apply(tmp,1,max,na.rm=T),digits=1))
RDS <- rep(0,times=5000); RDS[(as.numeric(names(RDS.tmp))*10)] <- RDS.tmp
RDS <- rev(cumsum(rev(RDS)))
output[[4]] <- RDS
output[[1]] <- which(round(((dim(map1.output$LOD)[1]-RDS)/dim(map1.output$LOD)[1])*q.value*log10(dim(map1.output$LOD)[1]),digits=3)-round(output[[3]]/output[[4]],digits=3)>0)[1]
names(output) <- c("Significance_threshold","False_discoveries","False_discoveries_average","Real_discoveries")
}
###Return
return(output)
}
###auxiliary function for data simulation
simulate.data.per.marker <- function(strain.map,strain.marker,n_per_marker,sigma_peak,sigma_error){
sim.map.file <- list()
sim.map.file[[2]] <- data.matrix(strain.map)
types <- unique(as.numeric(as.character(unlist(sim.map.file[[2]]))))
sim.map.file[[2]] <- sim.map.file[[2]][rep(1:nrow(sim.map.file[[2]]),each=n_per_marker),]
sim.map.file[[1]] <- sim.map.file[[2]]
sim.map.file[[1]][sim.map.file[[2]]==types[1]] <- sigma_peak
sim.map.file[[1]][sim.map.file[[2]]==types[2]] <- 0
sim.map.file[[1]] <- sim.map.file[[1]] + rnorm(n=length(sim.map.file[[1]]),mean=0,sd=sigma_error)
rownames(sim.map.file[[1]]) <- paste(rownames(sim.map.file[[1]]),rep(1:n_per_marker,times=nrow(sim.map.file[[2]])/n_per_marker),sep="_")
sim.map.file[[2]] <- sim.map.file[[2]][seq(1,nrow(sim.map.file[[2]]),by=n_per_marker),]
sim.map.file[[3]] <- strain.marker
return(sim.map.file)
}
###Conducts mapping simulations to investigate power
map.1.simulation <- function(strain.map,strain.marker,n_per_marker,sigma_peak,sigma_error,threshold){
if(missing(strain.map)){ stop("specify strain.map (genetic map)")}
if(missing(strain.marker)){ strain.marker <- data.frame(cbind(name=1:dim(strain.map)[1],chromosome=NA,bp=1:dim(strain.map)[1]))
rownames(strain.marker) <- 1:dim(strain.map)[1]}
if(missing(n_per_marker)){ n_per_marker <-10}
if(missing(sigma_peak)){ sigma_peak <- c(1,1.15,1.3,1.47,1.64,1.82,2,2.2,2.47,2.73,3.05,3.45,4)}
if(missing(sigma_error)){ sigma_error <- 1}
if(missing(threshold)){ threshold <- c(3.5)}
###1 sigma difference on 1 sigma noise is 20% of variance
###2 sigma difference on 1 sigma noise is 50% of variance
###3 sigma difference on 1 sigma noise is 69% of variance
###4 sigma difference on 1 sigma noise is 80% of variance; summary(lm(c(rnorm(10000,0,1),rnorm(10000,4,1))~rep(c(-1,1),each=10000)))
###cbind(var_expl=seq(5,80,by=5),sigma=c(0.46,0.67,0.84,1,1.15,1.3,1.47,1.64,1.82,2,2.2,2.47,2.73,3.05,3.45,4))
out <- NULL
for(i in sigma_peak){
###Generate peaked data with random error
sim.map.file <- simulate.data.per.marker(strain.map,strain.marker,n_per_marker,sigma_peak=i,sigma_error)
sim.emp.file <- simulate.data.per.marker(strain.map,strain.marker,n_per_marker,sigma_peak=0,sigma_error)
###Map it
sim.map.file <- map.1(sim.map.file[[1]],sim.map.file[[2]],sim.map.file[[3]])
loc_peak <- as.numeric(as.character(unlist(apply(sim.map.file$LOD,1,which.max))))
lod_peak <- as.numeric(as.character(unlist(apply(sim.map.file$LOD,1,max,na.rm=TRUE))))
eff_peak <- t(sim.map.file$Effect)[(ncol(sim.map.file$Effect)*(0:(nrow(sim.map.file$Effect)-1)))+loc_peak]
loc_sim <- rep(1:nrow(sim.map.file$Map),each=n_per_marker)
lod_sim <- t(sim.map.file$LOD)[(ncol(sim.map.file$LOD)*(0:(nrow(sim.map.file$LOD)-1)))+loc_sim]
eff_sim <- t(sim.map.file$Effect)[(ncol(sim.map.file$Effect)*(0:(nrow(sim.map.file$Effect)-1)))+loc_sim]
out.sim <- NULL
out.sim <- c(out.sim,detected_true=sum(sim.map.file$Marker[loc_peak,2] == sim.map.file$Marker[loc_sim,2] & lod_peak-1.5 < lod_sim & lod_peak >= threshold))
out.sim <- c(out.sim,detected_false=(sum(lod_peak >= threshold) - sum(sim.map.file$Marker[loc_peak,2] == sim.map.file$Marker[loc_sim,2] & lod_peak-1.5 < lod_sim & lod_peak >= threshold)))
out.sim <- c(out.sim,detected_not = sum(lod_peak < threshold))
selc <- sim.map.file$Marker[loc_peak,2] == sim.map.file$Marker[loc_sim,2] & lod_peak-1.5 < lod_sim & lod_peak >= threshold
out.sim <- c(out.sim,detected_true_eff_est=quantile(abs(eff_peak[selc])/(i/2)))
out.sim <- c(out.sim,detected_true_loc_est=quantile(abs(sim.map.file$Marker[loc_peak,3]-sim.map.file$Marker[loc_sim,3])[selc]))
###Map False file
sim.map.file <- map.1(sim.emp.file[[1]],sim.emp.file[[2]],sim.emp.file[[3]])
loc_peak <- as.numeric(as.character(unlist(apply(sim.map.file$LOD,1,which.max))))
lod_peak <- as.numeric(as.character(unlist(apply(sim.map.file$LOD,1,max,na.rm=TRUE))))
eff_peak <- t(sim.map.file$Effect)[(ncol(sim.map.file$Effect)*(0:(nrow(sim.map.file$Effect)-1)))+loc_peak]
loc_sim <- rep(1:nrow(sim.map.file$Map),each=n_per_marker)
lod_sim <- t(sim.map.file$LOD)[(ncol(sim.map.file$LOD)*(0:(nrow(sim.map.file$LOD)-1)))+loc_sim]
eff_sim <- t(sim.map.file$Effect)[(ncol(sim.map.file$Effect)*(0:(nrow(sim.map.file$Effect)-1)))+loc_sim]
out.emp <- NULL
out.emp <- c(out.emp,detected_true=0)
out.emp <- c(out.emp,detected_false=sum(lod_peak >= threshold))
out.emp <- c(out.emp,detected_not = sum(lod_peak < threshold))
out.emp <- c(out.emp,detected_true_eff_est=quantile(rep(NA,times=100),na.rm=T))
out.emp <- c(out.emp,detected_true_loc_est=quantile(rep(NA,times=100),na.rm=T))
###Combine and summarize
out.sim <- data.frame(cbind(name=c(paste("peak_sim_",i,sep=""),paste("noise_sim_",sigma_error,sep="")),
var_explained=c(round(summary(lm(c(rnorm(100000,0,sigma_error),rnorm(100000,i,sigma_error))~rep(c(-1,1),each=100000)))$adj.r.squared,digits=2),round(summary(lm(c(rnorm(100000,0,sigma_error),rnorm(100000,0,sigma_error))~rep(c(-1,1),each=100000)))$adj.r.squared,digits=2)),
rbind(out.sim,out.emp)))
out <- rbind(out,out.sim)
}
###Make sure numbers are numeric and characters are characteristic
for(i in 2:ncol(out)){
out[,i] <- as.numeric(as.character(unlist(out[,i])))
}
out[,1] <- as.character(unlist(out[,1]))
###summarize all noise simulations
out[max(grep("noise_sim",out[,1])),-1] <- apply(out[grep("noise_sim",out[,1]),-1],2,sum)
out <- out[c(grep("peak_sim",out[,1]),max(grep("noise_sim",out[,1]))),]
##Add percentage detected & percentage false
output <- cbind(out,
percentage_detected=round(out$detected_true/(out$detected_true+out$detected_false+out$detected_not),digits=3)*100,
percentage_false=round(out$detected_false/(out$detected_true+out$detected_false+out$detected_not),digits=3)*100,
type=unlist(strsplit(out$name,split="_"))[seq(1,nrow(out)*3,by=3)],
sigma=unlist(strsplit(out$name,split="_"))[seq(3,nrow(out)*3,by=3)])
output <- output[,c(18,19,2,3:5,16,17,6:15)]
colnames(output)[grepl("_est.",colnames(output))] <- paste("quantile_",unlist(strsplit(colnames(output)[grepl("_est.",colnames(output))],split="_est."))[seq(2,2*sum(grepl("_est.",colnames(output))),by=2)],sep="")
colnames(output) <- gsub("\\.","",colnames(output))
colnames(output) <- paste(c(rep("Simulation",times=3),rep("QTL_detection",times=5),rep("Effect_size_estimation_(quantiles)",times=5),rep("QTL-true_peak_location_distance_(quantiles)",times=5)),colnames(output),sep=".")
return(output)
}
################################################################################################################################################################
###eQTL_correlator(trait.dataframe,strain.map,strain.marker,eQTL.table,ref.genotype,method,n.genes,interval)
### predicts the genotype based on eQTL information, it returns a list with correlations.
### obligatory input
### eQTL.table.output, output from the eQTL.table function
### trait.dataframe, preferrably data centered to the parental mean, otherwise z.scores or ratio with the mean of the entire experiment could be used
### optional input
### ref.genotype, standard is "1"
### n.genes, standard is 20
### method, standard is "per.genes", other option is "interval"
### interval, standard is 1e6
eQTL.correlator <- function(trait.dataframe,eQTL.table,ref.genotype,method,n.genes,interval){
if(missing(trait.dataframe)){ stop("specify trait.dataframe, columns: trait, strain, ratio, sample")}
if(sum(c("trait","strain","ratio","sample") %in% colnames(trait.dataframe)) != 4){
stop("specify trait.dataframe, columns: trait, strain, ratio, sample")}
if(missing(eQTL.table)){ stop("specify eQTL.table")}
###Parameters
if(missing(ref.genotype)){ ref.genotype <- 1}
if(missing(method)){ method <- "per.genes"}
if(missing(n.genes) & method=="per.genes"){ n.genes <- 20}
if(missing(interval)& method=="interval"){ interval <- 1e6}
if(method %in% c("per.genes","interval") ==FALSE){ stop("choose a method from 'per.genes' and 'interval'")}
###Take only one type of eQTL
eQTL.table <- eQTL.table[eQTL.table$qtl_type == unique(eQTL.table$qtl_type)[1],
colnames(eQTL.table) %in% c("trait","qtl_chromosome","qtl_bp","qtl_marker","qtl_significance","qtl_effect","gene_chromosome","gene_bp","gene_WBID")]
eQTL.table <- merge(eQTL.table,trait.dataframe)
###calculate correlation per n.genes, 20 works pretty well
if(method =="per.genes"){
out <- group_by(eQTL.table,strain,gene_chromosome,sample) %>%
mutate(order_gene=ceiling(rank(gene_bp)/n.genes)) %>%
as.data.frame() %>%
group_by(strain,gene_chromosome,order_gene,sample) %>%
mutate(n.cor=length(qtl_effect)) %>%
as.data.frame() %>%
mutate(order_gene = ifelse(n.cor<n.genes/2,order_gene-1,order_gene)) %>%
group_by(strain,gene_chromosome,order_gene,sample) %>%
summarise(correlation = cor(x=ratio,y=qtl_effect*ref.genotype,use="complete.obs"),
correlation_significance=cor.test(x=ratio,y=qtl_effect*ref.genotype,na.action=na.omit())$p.value,n.cor=length(qtl_effect),
location_left=min(gene_bp)/1e6,location_median=median(gene_bp)/1e6,location_right=max(gene_bp)/1e6) %>%
as.data.frame()
}
if(method =="interval"){
out <- mutate(eQTL.table.output,order_gene=ceiling(gene_bp/interval)) %>%
group_by(strain,gene_chromosome,order_gene,sample) %>%
mutate(n.cor=length(qtl_effect)) %>%
as.data.frame() %>%
mutate(order_gene = ifelse(n.cor<10,order_gene-1,order_gene)) %>%
group_by(strain,gene_chromosome,order_gene,sample) %>%
mutate(n.cor=length(qtl_effect)) %>%
as.data.frame() %>%
mutate(order_gene = ifelse(n.cor<10,order_gene-1,order_gene)) %>%
group_by(strain,gene_chromosome,order_gene,sample) %>%
summarise(correlation = cor(x=ratio,y=qtl_effect*ref.genotype,use="complete.obs"),
correlation_significance=cor.test(x=ratio,y=qtl_effect*ref.genotype,na.action=na.omit())$p.value,n.cor=length(qtl_effect),
location_left=min(gene_bp)/1e6,location_median=median(gene_bp)/1e6,location_right=max(gene_bp)/1e6) %>%
as.data.frame()
}
return(out)
}
### strain.map, map of the investigated strains
### strain.marker, marker file of the investigated strains
plot.eQTL.correlation <- function(eQTL.correlator.output,strain.map,strain.marker,filename){
if(missing(eQTL.correlator.output)){ stop("provide output from eQTL.correlator function")}
if(missing(strain.map)){ stop("specify strain.map (genetic map)")}
if(missing(strain.marker)){ strain.marker <- data.frame(cbind(name=1:dim(strain.map)[1],chromosome=NA,bp=1:dim(strain.map)[1]))
rownames(strain.marker) <- 1:dim(strain.map)[1]}
if(missing(filename )){ filename <- "eQTL_genotype"}
pdf(paste(filename,".pdf",sep="_"),width=18,height=8)
for(i in 1:length(unique(eQTL.correlator.output$sample))){
plot.data <- eQTL.correlator.output[eQTL.correlator.output$sample==unique(eQTL.correlator.output$sample)[i],]
plot.eQTL <- ggplot(plot.data,aes(x=location_median,y=correlation,colour=n.cor)) +
geom_point(aes(alpha=0.5)) + geom_smooth(method="loess",span=0.5,se=TRUE) + geom_segment(aes(x=location_left,y=correlation,xend=location_right,yend=correlation,alpha=0.5)) +
facet_grid(.~gene_chromosome, scales="free") + geom_hline(yintercept=0) + presentation +
labs(x="Marker location (Mb)",y=paste("Correlation N2 and",tolower(unlist(unique(plot.data$strain))[1]))) + ylim(-2,2) +
ggtitle(unique(plot.data$sample))
plot.data <- data.frame(cbind(name=strain.marker[,1],
chromosome=strain.marker[,2],
bp=strain.marker[,3],
current=strain.map[,tolower(colnames(strain.map))==tolower(unlist(unique(plot.data$strain))[1])]))
plot.data[,1] <- as.character(unlist(plot.data[,1]))
plot.data[,2] <- as.character(unlist(plot.data[,2]))
plot.data[,3] <- as.numeric(as.character(unlist(plot.data[,3])))
plot.data[,4] <- as.numeric(as.character(unlist(plot.data[,4])))
plot.mapz <- ggplot(plot.data,aes(x=bp/1e6,y=current,colour=current)) +
geom_point() + geom_line() + facet_grid(.~chromosome, scales="free") +
geom_hline(yintercept=0) + presentation +
labs(x="Marker location (Mb)",y=paste("Map",tolower(unlist(unique(plot.data$strain))[1]))) + ylim(-2,2)
grid.arrange(plot.eQTL , plot.mapz,
ncol=1,heights=c(4,2))
}
dev.off()
}
emma.REML.t <- function (ys, xs, K, Z = NULL, X0 = NULL, ngrids = 100, llim = -10,ulim = 10, esp = 1e-10, ponly = FALSE){
if (is.null(dim(ys)) || ncol(ys) == 1) {
ys <- matrix(ys, 1, length(ys))
}
if (is.null(dim(xs)) || ncol(xs) == 1) {
xs <- matrix(xs, 1, length(xs))
}
if (is.null(X0)) {
X0 <- matrix(1, ncol(ys), 1)
}
g <- nrow(ys)
n <- ncol(ys)
m <- nrow(xs)
t <- ncol(xs)
q0 <- ncol(X0)
q1 <- q0 + 1
stopifnot(nrow(K) == t)
stopifnot(ncol(K) == t)
stopifnot(nrow(X0) == n)
if (!ponly) {
REMLs <- matrix(nrow = m, ncol = g)
vgs <- matrix(nrow = m, ncol = g)
ves <- matrix(nrow = m, ncol = g)
}
dfs <- matrix(nrow = m, ncol = g)
stats <- matrix(nrow = m, ncol = g)
ps <- matrix(nrow = m, ncol = g)
if (sum(is.na(ys)) == 0) {
eig.L <- emma.eigen.L(Z, K)
x.prev <- vector(length = 0)
for (i in 1:m) {
vids <- !is.na(xs[i, ])
nv <- sum(vids)
xv <- xs[i, vids]
if ((mean(xv) <= 0) || (mean(xv) >= 1)) {
if (!ponly) {
vgs[i, ] <- rep(NA, g)
ves[i, ] <- rep(NA, g)
dfs[i, ] <- rep(NA, g)
REMLs[i, ] <- rep(NA, g)
stats[i, ] <- rep(NA, g)
}
ps[i, ] = rep(1, g)
}
else if (identical(x.prev, xv)) {
if (!ponly) {
vgs[i, ] <- vgs[i - 1, ]
ves[i, ] <- ves[i - 1, ]
dfs[i, ] <- dfs[i - 1, ]
REMLs[i, ] <- REMLs[i - 1, ]
stats[i, ] <- stats[i - 1, ]
}
ps[i, ] <- ps[i - 1, ]
}
else {
if (is.null(Z)) {
X <- cbind(X0[vids, , drop = FALSE], xs[i,
vids])
eig.R1 = emma.eigen.R.wo.Z(K[vids, vids], X)
}
else {
vrows <- as.logical(rowSums(Z[, vids]))
X <- cbind(X0[vrows, , drop = FALSE], Z[vrows,
vids, drop = FALSE] %*% t(xs[i, vids, drop = FALSE]))
eig.R1 = emma.eigen.R.w.Z(Z[vrows, vids], K[vids,
vids], X)
}
for (j in 1:g) {
if (nv == t) {
REMLE <- emma.REMLE(ys[j, ], X, K, Z, ngrids,
llim, ulim, esp, eig.R1)
if (is.null(Z)) {
U <- eig.L$vectors * matrix(sqrt(1/(eig.L$values +
REMLE$delta)), t, t, byrow = TRUE)
dfs[i, j] <- nv - q1
}
else {
U <- eig.L$vectors * matrix(c(sqrt(1/(eig.L$values +
REMLE$delta)), rep(sqrt(1/REMLE$delta),
n - t)), n, n, byrow = TRUE)
dfs[i, j] <- n - q1
}
yt <- crossprod(U, ys[j, ])
Xt <- crossprod(U, X)
iXX <- solve(crossprod(Xt, Xt))
beta <- iXX %*% crossprod(Xt, yt)
if (!ponly) {
vgs[i, j] <- REMLE$vg
ves[i, j] <- REMLE$ve
REMLs[i, j] <- REMLE$REML
}
stats[i, j] <- beta[q1]/sqrt(iXX[q1, q1] *
REMLE$vg)
}
else {
if (is.null(Z)) {
eig.L0 <- emma.eigen.L.wo.Z(K[vids, vids])
nr <- sum(vids)
yv <- ys[j, vids]
REMLE <- emma.REMLE(yv, X, K[vids, vids,
drop = FALSE], NULL, ngrids, llim, ulim,
esp, eig.R1)
U <- eig.L0$vectors * matrix(sqrt(1/(eig.L0$values +
REMLE$delta)), nr, nr, byrow = TRUE)
dfs[i, j] <- nr - q1
}
else {
eig.L0 <- emma.eigen.L.w.Z(Z[vrows, vids,
drop = FALSE], K[vids, vids])
yv <- ys[j, vrows]
nr <- sum(vrows)
tv <- sum(vids)
REMLE <- emma.REMLE(yv, X, K[vids, vids,
drop = FALSE], Z[vrows, vids, drop = FALSE],
ngrids, llim, ulim, esp, eig.R1)
U <- eig.L0$vectors * matrix(c(sqrt(1/(eig.L0$values +
REMLE$delta)), rep(sqrt(1/REMLE$delta),
nr - tv)), nr, nr, byrow = TRUE)
dfs[i, j] <- nr - q1
}
yt <- crossprod(U, yv)
Xt <- crossprod(U, X)
iXX <- solve(crossprod(Xt, Xt))
beta <- iXX %*% crossprod(Xt, yt)
if (!ponly) {
vgs[i, j] <- REMLE$vg
ves[i, j] <- REMLE$ve
REMLs[i, j] <- REMLE$REML
}
stats[i, j] <- beta[q1]/sqrt(iXX[q1, q1] *
REMLE$vg)
}
}
ps[i, ] <- 2 * pt(abs(stats[i, ]), dfs[i, ],
lower.tail = FALSE)
}
}
}
else {
eig.L <- emma.eigen.L(Z, K)
eig.R0 <- emma.eigen.R(Z, K, X0)
x.prev <- vector(length = 0)
for (i in 1:m) {
vids <- !is.na(xs[i, ])
nv <- sum(vids)
xv <- xs[i, vids]
if ((mean(xv) <= 0) || (mean(xv) >= 1)) {
if (!ponly) {
vgs[i, ] <- rep(NA, g)
ves[i, ] <- rep(NA, g)
REMLs[i, ] <- rep(NA, g)
dfs[i, ] <- rep(NA, g)
}
ps[i, ] = rep(1, g)
}
else if (identical(x.prev, xv)) {
if (!ponly) {
stats[i, ] <- stats[i - 1, ]
vgs[i, ] <- vgs[i - 1, ]
ves[i, ] <- ves[i - 1, ]
REMLs[i, ] <- REMLs[i - 1, ]
dfs[i, ] <- dfs[i - 1, ]
}
ps[i, ] = ps[i - 1, ]
}
else {
if (is.null(Z)) {
X <- cbind(X0, xs[i, ])
if (nv == t) {
eig.R1 = emma.eigen.R.wo.Z(K, X)
}
}
else {
vrows <- as.logical(rowSums(Z[, vids, drop = FALSE]))
X <- cbind(X0, Z[, vids, drop = FALSE] %*%
t(xs[i, vids, drop = FALSE]))
if (nv == t) {
eig.R1 = emma.eigen.R.w.Z(Z, K, X)
}
}
for (j in 1:g) {
vrows <- !is.na(ys[j, ])
if (nv == t) {
yv <- ys[j, vrows]
nr <- sum(vrows)
if (is.null(Z)) {
if (nr == n) {
REMLE <- emma.REMLE(yv, X, K, NULL, ngrids,
llim, ulim, esp, eig.R1)
U <- eig.L$vectors * matrix(sqrt(1/(eig.L$values +
REMLE$delta)), n, n, byrow = TRUE)
}
else {
eig.L0 <- emma.eigen.L.wo.Z(K[vrows,
vrows, drop = FALSE])
REMLE <- emma.REMLE(yv, X[vrows, , drop = FALSE],
K[vrows, vrows, drop = FALSE], NULL,
ngrids, llim, ulim, esp)
U <- eig.L0$vectors * matrix(sqrt(1/(eig.L0$values +
REMLE$delta)), nr, nr, byrow = TRUE)
}
dfs[i, j] <- nr - q1
}
else {
if (nr == n) {
REMLE <- emma.REMLE(yv, X, K, Z, ngrids,
llim, ulim, esp, eig.R1)
U <- eig.L$vectors * matrix(c(sqrt(1/(eig.L$values +
REMLE$delta)), rep(sqrt(1/REMLE$delta),
n - t)), n, n, byrow = TRUE)
}
else {
vtids <- as.logical(colSums(Z[vrows,
, drop = FALSE]))
eig.L0 <- emma.eigen.L.w.Z(Z[vrows, vtids,
drop = FALSE], K[vtids, vtids, drop = FALSE])
REMLE <- emma.REMLE(yv, X[vrows, , drop = FALSE],
K[vtids, vtids, drop = FALSE], Z[vrows,
vtids, drop = FALSE], ngrids, llim,
ulim, esp)
U <- eig.L0$vectors * matrix(c(sqrt(1/(eig.L0$values +
REMLE$delta)), rep(sqrt(1/REMLE$delta),
nr - sum(vtids))), nr, nr, byrow = TRUE)
}
dfs[i, j] <- nr - q1
}
yt <- crossprod(U, yv)
Xt <- crossprod(U, X[vrows, , drop = FALSE])
iXX <- solve(crossprod(Xt, Xt))
beta <- iXX %*% crossprod(Xt, yt)
if (!ponly) {
vgs[i, j] <- REMLE$vg
ves[i, j] <- REMLE$ve
REMLs[i, j] <- REMLE$REML
}
stats[i, j] <- beta[q1]/sqrt(iXX[q1, q1] *
REMLE$vg)
}
else {
if (is.null(Z)) {
vtids <- vrows & vids
eig.L0 <- emma.eigen.L.wo.Z(K[vtids, vtids,
drop = FALSE])
yv <- ys[j, vtids]
nr <- sum(vtids)
REMLE <- emma.REMLE(yv, X[vtids, , drop = FALSE],
K[vtids, vtids, drop = FALSE], NULL,
ngrids, llim, ulim, esp)
U <- eig.L0$vectors * matrix(sqrt(1/(eig.L0$values +
REMLE$delta)), nr, nr, byrow = TRUE)
Xt <- crossprod(U, X[vtids, , drop = FALSE])
dfs[i, j] <- nr - q1
}
else {
vtids <- as.logical(colSums(Z[vrows, ,
drop = FALSE])) & vids
vtrows <- vrows & as.logical(rowSums(Z[,
vids, drop = FALSE]))
eig.L0 <- emma.eigen.L.w.Z(Z[vtrows, vtids,
drop = FALSE], K[vtids, vtids, drop = FALSE])
yv <- ys[j, vtrows]
nr <- sum(vtrows)
REMLE <- emma.REMLE(yv, X[vtrows, , drop = FALSE],
K[vtids, vtids, drop = FALSE], Z[vtrows,
vtids, drop = FALSE], ngrids, llim,
ulim, esp)
U <- eig.L0$vectors * matrix(c(sqrt(1/(eig.L0$values +
REMLE$delta)), rep(sqrt(1/REMLE$delta),
nr - sum(vtids))), nr, nr, byrow = TRUE)
Xt <- crossprod(U, X[vtrows, , drop = FALSE])
dfs[i, j] <- nr - q1
}
yt <- crossprod(U, yv)
iXX <- solve(crossprod(Xt, Xt))
beta <- iXX %*% crossprod(Xt, yt)
if (!ponly) {
vgs[i, j] <- REMLE$vg
ves[i, j] <- REMLE$ve
REMLs[i, j] <- REMLE$REML
}
stats[i, j] <- beta[q1]/sqrt(iXX[q1, q1] *
REMLE$vg)
}
}
ps[i, ] <- 2 * pt(abs(stats[i, ]), dfs[i, ],
lower.tail = FALSE)
}
}
}
if (ponly) {
return(ps)
}
else {
return(list(ps = ps, REMLs = REMLs, stats = stats, dfs = dfs,
vgs = vgs, ves = ves))
}
}
emma.kinship <- function (snps, method = "additive", use = "all")
{
n0 <- sum(snps == 0, na.rm = TRUE)
nh <- sum(snps == 0.5, na.rm = TRUE)
n1 <- sum(snps == 1, na.rm = TRUE)
nNA <- sum(is.na(snps))
stopifnot(n0 + nh + n1 + nNA == length(snps))
if (method == "dominant") {
flags <- matrix(as.double(rowMeans(snps, na.rm = TRUE) >
0.5), nrow(snps), ncol(snps))
snps[!is.na(snps) & (snps == 0.5)] <- flags[!is.na(snps) &
(snps == 0.5)]
}
else if (method == "recessive") {
flags <- matrix(as.double(rowMeans(snps, na.rm = TRUE) <
0.5), nrow(snps), ncol(snps))
snps[!is.na(snps) & (snps == 0.5)] <- flags[!is.na(snps) &
(snps == 0.5)]
}
else if ((method == "additive") && (nh > 0)) {
dsnps <- snps
rsnps <- snps
flags <- matrix(as.double(rowMeans(snps, na.rm = TRUE) >
0.5), nrow(snps), ncol(snps))
dsnps[!is.na(snps) & (snps == 0.5)] <- flags[!is.na(snps) &
(snps == 0.5)]
flags <- matrix(as.double(rowMeans(snps, na.rm = TRUE) <
0.5), nrow(snps), ncol(snps))
rsnps[!is.na(snps) & (snps == 0.5)] <- flags[!is.na(snps) &
(snps == 0.5)]
snps <- rbind(dsnps, rsnps)
}
if (use == "all") {
mafs <- matrix(rowMeans(snps, na.rm = TRUE), nrow(snps),
ncol(snps))
snps[is.na(snps)] <- mafs[is.na(snps)]
}
else if (use == "complete.obs") {
snps <- snps[rowSums(is.na(snps)) == 0, ]
}
n <- ncol(snps)
K <- matrix(nrow = n, ncol = n)
diag(K) <- 1
for (i in 2:n) {
for (j in 1:(i - 1)) {
x <- snps[, i] * snps[, j] + (1 - snps[, i]) * (1 -
snps[, j])
K[i, j] <- sum(x, na.rm = TRUE)/sum(!is.na(x))
K[j, i] <- K[i, j]
}
}
return(K)
}
emma.eigen.L <- function (Z, K, complete = TRUE)
{
if (is.null(Z)) {
return(emma.eigen.L.wo.Z(K))
}
else {
return(emma.eigen.L.w.Z(Z, K, complete))
}
}
emma.eigen.L.wo.Z <- function (K)
{
eig <- eigen(K, symmetric = TRUE)
return(list(values = eig$values, vectors = eig$vectors))
}
emma.eigen.R.wo.Z <- function (K, X)
{
n <- nrow(X)
q <- ncol(X)
S <- diag(n) - X %*% solve(crossprod(X, X)) %*% t(X)
eig <- eigen(S %*% (K + diag(1, n)) %*% S, symmetric = TRUE)
stopifnot(!is.complex(eig$values))
return(list(values = eig$values[1:(n - q)] - 1, vectors = eig$vectors[,
1:(n - q)]))
}
emma.eigen.R <- function(Z,K,X,complete=TRUE) {
if ( is.null(Z) ) {
return(emma.eigen.R.wo.Z(K,X))
}
else {
return(emma.eigen.R.w.Z(Z,K,X,complete))
}
}
emma.REMLE <- function (y, X, K, Z = NULL, ngrids = 100, llim = -10, ulim = 10,
esp = 1e-10, eig.L = NULL, eig.R = NULL)
{
n <- length(y)
t <- nrow(K)
q <- ncol(X)
stopifnot(ncol(K) == t)
stopifnot(nrow(X) == n)
if (det(crossprod(X, X)) == 0) {
warning("X is singular")
return(list(REML = 0, delta = 0, ve = 0, vg = 0))
}
if (is.null(Z)) {
if (is.null(eig.R)) {
eig.R <- emma.eigen.R.wo.Z(K, X)
}
etas <- crossprod(eig.R$vectors, y)
logdelta <- (0:ngrids)/ngrids * (ulim - llim) + llim
m <- length(logdelta)
delta <- exp(logdelta)
Lambdas <- matrix(eig.R$values, n - q, m) + matrix(delta,
n - q, m, byrow = TRUE)
Etasq <- matrix(etas * etas, n - q, m)
LL <- 0.5 * ((n - q) * (log((n - q)/(2 * pi)) - 1 - log(colSums(Etasq/Lambdas))) -
colSums(log(Lambdas)))
dLL <- 0.5 * delta * ((n - q) * colSums(Etasq/(Lambdas *
Lambdas))/colSums(Etasq/Lambdas) - colSums(1/Lambdas))
optlogdelta <- vector(length = 0)
optLL <- vector(length = 0)
if (dLL[1] < esp) {
optlogdelta <- append(optlogdelta, llim)
optLL <- append(optLL, emma.delta.REML.LL.wo.Z(llim,
eig.R$values, etas))
}
if (dLL[m - 1] > 0 - esp) {
optlogdelta <- append(optlogdelta, ulim)
optLL <- append(optLL, emma.delta.REML.LL.wo.Z(ulim,
eig.R$values, etas))
}
for (i in 1:(m - 1)) {
if ((dLL[i] * dLL[i + 1] < 0 - esp * esp) && (dLL[i] >
0) && (dLL[i + 1] < 0)) {
r <- uniroot(emma.delta.REML.dLL.wo.Z, lower = logdelta[i],
upper = logdelta[i + 1], lambda = eig.R$values,
etas = etas)
optlogdelta <- append(optlogdelta, r$root)
optLL <- append(optLL, emma.delta.REML.LL.wo.Z(r$root,
eig.R$values, etas))
}
}
}
else {
if (is.null(eig.R)) {
eig.R <- emma.eigen.R.w.Z(Z, K, X)
}
etas <- crossprod(eig.R$vectors, y)
etas.1 <- etas[1:(t - q)]
etas.2 <- etas[(t - q + 1):(n - q)]
etas.2.sq <- sum(etas.2 * etas.2)
logdelta <- (0:ngrids)/ngrids * (ulim - llim) + llim
m <- length(logdelta)
delta <- exp(logdelta)
Lambdas <- matrix(eig.R$values, t - q, m) + matrix(delta,
t - q, m, byrow = TRUE)
Etasq <- matrix(etas.1 * etas.1, t - q, m)
dLL <- 0.5 * delta * ((n - q) * (colSums(Etasq/(Lambdas *
Lambdas)) + etas.2.sq/(delta * delta))/(colSums(Etasq/Lambdas) +
etas.2.sq/delta) - (colSums(1/Lambdas) + (n - t)/delta))
optlogdelta <- vector(length = 0)
optLL <- vector(length = 0)
if (dLL[1] < esp) {
optlogdelta <- append(optlogdelta, llim)
optLL <- append(optLL, emma.delta.REML.LL.w.Z(llim,
eig.R$values, etas.1, n, t, etas.2.sq))
}
if (dLL[m - 1] > 0 - esp) {
optlogdelta <- append(optlogdelta, ulim)
optLL <- append(optLL, emma.delta.REML.LL.w.Z(ulim,
eig.R$values, etas.1, n, t, etas.2.sq))
}
for (i in 1:(m - 1)) {
if ((dLL[i] * dLL[i + 1] < 0 - esp * esp) && (dLL[i] >
0) && (dLL[i + 1] < 0)) {
r <- uniroot(emma.delta.REML.dLL.w.Z, lower = logdelta[i],
upper = logdelta[i + 1], lambda = eig.R$values,
etas.1 = etas.1, n = n, t1 = t, etas.2.sq = etas.2.sq)
optlogdelta <- append(optlogdelta, r$root)
optLL <- append(optLL, emma.delta.REML.LL.w.Z(r$root,
eig.R$values, etas.1, n, t, etas.2.sq))
}
}
}
maxdelta <- exp(optlogdelta[which.max(optLL)])
maxLL <- max(optLL)
if (is.null(Z)) {
maxva <- sum(etas * etas/(eig.R$values + maxdelta))/(n -
q)
}
else {
maxva <- (sum(etas.1 * etas.1/(eig.R$values + maxdelta)) +
etas.2.sq/maxdelta)/(n - q)
}
maxve <- maxva * maxdelta
return(list(REML = maxLL, delta = maxdelta, ve = maxve, vg = maxva))
}
emma.delta.REML.LL.wo.Z <- function (logdelta, lambda, etas)
{
nq <- length(etas)
delta <- exp(logdelta)
return(0.5 * (nq * (log(nq/(2 * pi)) - 1 - log(sum(etas *
etas/(lambda + delta)))) - sum(log(lambda + delta))))
}
emma.delta.REML.dLL.wo.Z <- function (logdelta, lambda, etas)
{
nq <- length(etas)
delta <- exp(logdelta)
etasq <- etas * etas
ldelta <- lambda + delta
return(0.5 * (nq * sum(etasq/(ldelta * ldelta))/sum(etasq/ldelta) -
sum(1/ldelta)))
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment