From f2034b8f253355088fe220ae802970e6a0210998 Mon Sep 17 00:00:00 2001 From: harta003 <margi.hartanto@wur.nl> Date: Thu, 6 Feb 2020 17:45:32 +0100 Subject: [PATCH] Initial commit --- .gitignore | 26 +- 1-clustering.R | 229 ++++++++++++++++ 2-eqtl-analysis.R | 419 +++++++++++++++++++++++++++++ 3-eqtl-visualization.R | 469 +++++++++++++++++++++++++++++++++ 4-qtl-integration.R | 148 +++++++++++ 5-create-community-network.R | 302 +++++++++++++++++++++ combined-stage-eqtl-analysis.R | 355 +++++++++++++++++++++++++ goe-app.R | 110 ++++++++ seed-germination-qtl.Rproj | 13 + ssh-key | 7 + ssh-key.pub | 1 + 11 files changed, 2066 insertions(+), 13 deletions(-) create mode 100644 1-clustering.R create mode 100644 2-eqtl-analysis.R create mode 100644 3-eqtl-visualization.R create mode 100644 4-qtl-integration.R create mode 100644 5-create-community-network.R create mode 100644 combined-stage-eqtl-analysis.R create mode 100644 goe-app.R create mode 100644 seed-germination-qtl.Rproj create mode 100644 ssh-key create mode 100644 ssh-key.pub diff --git a/.gitignore b/.gitignore index a4cd415..16132e0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,13 @@ -.Rproj.user -.Rhistory -.RData -.Ruserdata -figures -files -functions -networks -qtl-peaks -qtl-permutations -qtl-tables -trans-bands -qtl-profiles +.Rproj.user +.Rhistory +.RData +.Ruserdata +figures +files +functions +networks +qtl-peaks +qtl-permutations +qtl-tables +trans-bands +qtl-profiles diff --git a/1-clustering.R b/1-clustering.R new file mode 100644 index 0000000..8ed33ea --- /dev/null +++ b/1-clustering.R @@ -0,0 +1,229 @@ +############################################### +##### seed transcript clustering analysis ##### +############################################### + + +#### prepare the script working environment #### + remove(list = ls()) + gc() + set.seed(1000) + + # set working directory #### + work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl" + setwd(work.dir) + + # dependencies #### + library(dplyr) + library(ggplot2) + library(limma) + library(Biobase) + library(stats) + library(gplots) + library(RColorBrewer) + library(Rfast) + library(amap) + library(topGO) + library(org.At.tair.db) + # unused libraries #### + # library(Mfuzz) + # library(factoextra) + # library(doParallel) + # library(forcats) + # library(tidyr) + # library(gridExtra) + # library(Hmisc) + + + +# load the data #### + trait.matrix <- read.csv(file = 'files/trait-matrix.csv', row.names = 1) + sample.list <- read.csv(file = 'files/sample-list.csv') + sample.stage <- sample.list$stage + +# data presentation #### + presentation <- theme(axis.text.x = element_text(size=6, face="bold", color="black"), + axis.text.y = element_text(size=6, face="bold", color="black"), + axis.title.x = element_text(size=7, face="bold", color="black"), + axis.title.y = element_text(size=7, face="bold", color="black"), + strip.text.x = element_text(size=7, face="bold", color="black"), + strip.text.y = element_text(size=7, face="bold", color="black"), + strip.text = element_text(size =7, face="bold", color="black"), + #plot.title = element_text(size=15, face="bold"), + panel.background = element_rect(fill = "white",color="black"), + panel.grid.major = element_line(colour = "grey80"), + panel.grid.minor = element_blank()) + +# create the expression object #### + x <- trait.matrix # the expression matrix + genes <- data.frame(trait = rownames(x)) # the gene list + f <- data.frame(trait = rownames(x)) # the gene feature + rownames(f) <- rownames(x) + p <- data.frame(id = colnames(trait.matrix), stage = sample.stage) + rownames(p) <- colnames(x) + eset <- ExpressionSet(assayData = as.matrix(x), + phenoData = AnnotatedDataFrame(p), + featureData = AnnotatedDataFrame(f)) + +# PCA #### + pr.out <- prcomp(x = (t(trait.matrix)), center = T, scale. = F) + pc.df <- data.frame(pc1 = pr.out$x[, 1], + pc2 = pr.out$x[, 2], + stage = sample.stage, + population = c(rep('parent', 16), rep('RIL', 164))) + + pca.all <- ggplot(pc.df, aes(x = pc1, y = pc2, color = factor(stage, level = c('pd', 'ar', 'im', 'rp')))) + + geom_point(aes(shape = population)) + + scale_colour_manual(values = c('#ccbb44', '#228833', '#4477aa', '#cc3311')) + + scale_shape_manual(values = c(17, 19)) + + labs(x = "PC1 (55.56%)", y = "PC2 (14.82%)") + + labs(colour = 'stage') + + theme(text = element_text(size = 10), + panel.background = element_blank(), + panel.border=element_rect(fill=NA), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + strip.background=element_blank(), + axis.text.x=element_text(colour="black"), + axis.text.y=element_text(colour="black"), + axis.ticks=element_line(colour="black"), + plot.margin=unit(c(1,1,1,1),"line")) + + #presentation + + geom_hline(aes(yintercept = 0), linetype = 'dashed', size = .1) + + geom_vline(aes(xintercept = 0), linetype = 'dashed', size = .1) + tiff(file = paste0("figures/pca-all.tiff"), + width = 2250, + height = 1200, + units = 'px', + res = 300, + compression = 'lzw') + pca.all + dev.off() + +# differential expresed gene analysis and hierarchiecal clustering #### + group <- with (pData(eset), stage) + group <- factor(group) + design <- model.matrix(~ 0 + group) + colnames(design) <- levels(group) + colSums(design) + + # create pairwise contrast matrix among four different stages + cm <- makeContrasts(pd_ar = pd - ar, + ar_im = ar - im, + im_rp = im - rp, + levels = design) + coef <- colnames(cm) + + # fit the linear model to determine significant differentially expressed genes + # for each pairwise contrast + fit <- lmFit(eset, design) + fit2 <- contrasts.fit(fit, contrasts = cm) %>% + eBayes() + result <- decideTests(fit2) + summary(result) # 1 = upregulate + write.csv(result, 'files/differentally-expressed-genes.csv') + + # determine the differentially expressed genes for each pairwise contrast + # based on p value (>0.05) and log fold change more than 1 sorted by fold change + de.genes <- NA + coef2 <- + for (i in 1:length(coef)) { + print(i) + deg.tmp <- topTable(fit2, lfc = 1, coef = coef[i], + p.value = 0.05, + sort.by = 'logFC', + number = 100000) + de.genes <- c(de.genes, as.character(deg.tmp$trait)) + } + + de.genes <- unique(de.genes) # as much as 990 genes are differentially expressed between one of the contrasts + cluster.data <- trait.matrix[which(rownames(trait.matrix) %in% de.genes), ] + colnames(cluster.data) <- sample.stage + +# hierarchiecal tree clustering done for stage and genes #### + + # hc for stage + dist.matrix.stage <- Dist(t(cluster.data), + method = 'pearson', + upper = T, + diag = T) + hc.stage <- hclust(dist.matrix.stage, method = 'ward.D2') + #hc.stage$order <- c(hc.stage$order[46:180], hc.stage$order[1:45]) + reorder(x = as.dendrogram(hc.stage), c(46:180, 1:45)) + plot(reorder(x = as.dendrogram(hc.stage), 180:1, agglo.FUN = mean)) + + # hc for gene + dist.matrix.gene <- Dist(cluster.data, + method = 'pearson', # to group the genes based on patterns similarity across stages + upper = T, + diag = T) + hc.gene <- hclust(dist.matrix.gene, method = 'ward.D2') # similar to average linkage- + # it calculates the distance that is minimizing the variance within cluster + # and maximazing the variance between clusters + plot(hc.gene) + hc.gene <- cutree(hc.gene, k =6) # if k > 5, the cluster will be diproporsional i.e. a cluster only have few member + table(hc.gene) + hc.table <- as.data.frame(table(true = rownames(cluster.data), cluster = hc.gene)) + hc.table <- hc.table[which(hc.table$Freq != 0), ] + hc.table$cluster <- as.numeric(hc.table$cluster) + hc.table$true <- as.character(hc.table$true) + hc.table$Freq <- NULL + table(hc.table$cluster) + write.csv(hc.table, 'files/gene-cluster.csv') + + # heatmap and hierarchiecal clustering #### + sample.color <- ifelse(grepl('pd', colnames(trait.matrix)), '#4daf4a', + ifelse(grepl('ar', colnames(trait.matrix)), '#377eb8', + ifelse(grepl('im', colnames(trait.matrix)), '#984ea3', + ifelse(grepl('rp', colnames(trait.matrix)), '#e41a1c', NA)))) + + tiff(file = paste0("figures/heatmap-genes.tiff"), + width = 2250, + height = 2000, + units = 'px', + res = 300, + compression = 'lzw') + heatmap.2(x = as.matrix(cluster.data), cexRow = 0.8, cexCol = 1.2, + distfun = function(x) Dist(x, method = 'pearson'), + hclust = function(x) hclust(x, method = 'ward.D2'), + scale = 'row', + density.info = "density", + trace = 'none', + col = brewer.pal(9, 'YlGnBu'), + keysize = 1, + key.title = 'Z-score of\nlog-intensities', + key.xlab = NA, + Colv = reorder(x = as.dendrogram(hc.stage), 180:1, agglo.FUN = mean), + ColSideColors = sample.color) + dev.off() + +# GOE analysis for genes in each cluster #### + x <- org.At.tairCHR + all.genes <- as.list(rownames(trait.matrix)) + hc.go.list <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 8)) + colnames(hc.go.list) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher', + 'FDR', 'cluster') + ontology <- 'BP' + + for (i in unique(hc.table$cluster)) { + gene.set <- hc.table$true[which(hc.table$cluster == i)] + gene.set <- factor(as.integer(all.genes %in% gene.set)) + names(gene.set) <- all.genes + GOdata <- new("topGOdata", + description = "Analyzing clustering results", ontology = ontology, + allGenes = gene.set, + annot = annFUN.org,mapping= "org.At.tair.db") + resultFisher <- runTest(GOdata, algorithm = 'weight', statistic = "fisher") + # GOE from topgo consider the general terms on the hierarchy + # the weight algortihm maintain the balance between type I and II errors + result.df <- GenTable(GOdata, Fisher = resultFisher, + orderBy = "Fisher", ranksOf = "Fisher", topNodes = length(resultFisher@score)) + result.df$FDR <- p.adjust(p = result.df$Fisher, method = 'fdr') + result.df$cluster <- i + result.df <- result.df[order(result.df$FDR), ] + result.df <- filter(result.df, FDR <= 0.001) + hc.go.list <- rbind(hc.go.list, result.df) + } + + write.csv(hc.go.list, paste0('files/goe-tables', ontology, '.csv')) + + \ No newline at end of file diff --git a/2-eqtl-analysis.R b/2-eqtl-analysis.R new file mode 100644 index 0000000..086cd5e --- /dev/null +++ b/2-eqtl-analysis.R @@ -0,0 +1,419 @@ +############################## +##### seed eQTL analysis ##### +############################## + +#### prepare the script working environment #### + remove(list = ls()) + gc() + set.seed(1000) + + # Set working directory #### + work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl" + setwd(work.dir) + + # dependencies #### + if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + # BiocManager::install("Mfuzz") + # BiocManager::install("topGO") + # BiocManager::install("org.At.tair.db") + # BiocManager::install("Rgraphviz") + # BiocManager::install("org.At.eg.db") + library(doParallel) + library(dplyr) + library(ggplot2) + library(heritability) + library(topGO) + library(org.At.tair.db) + # unused libraries #### + # library(Mfuzz) + # library(Biobase) + # library(gplots) + # library(RColorBrewer) + # library(limma) + # library(factoextra) + # library(stats) + # library(amap) + # library(forcats) + # library(tidyr) + # library(VennDiagram) + # library(gridExtra) + # library(RCy3) + # library(corrplot) + # library(reshape2) + # library(Hmisc) + # library(igraph) + # library(threejs) + # library(MASS) + # library(UpSetR) + + + + + +# load required function #### + setwd('functions/') + for(i in 1:length(dir())){ + source(dir()[i]) + } # read function from Mark + setwd(work.dir) + + write.EleQTL <- function(map1.output,filename){ + + selector <- cbind(trait = rownames(map1.output$LOD), pval = apply(map1.output$LOD,1,max,na.rm=T)) %>% + data.frame() + + rownames(selector) <- NULL + + lod <- map1.output$LOD + lod <- lod[rownames(lod) %in% selector[,1],] + rownames(lod) <- selector$trait + colnames(lod) <- map1.output$Marker[,1] + + eff <- map1.output$Effect + eff <- eff[rownames(eff) %in% selector[,1],] + rownames(eff) <- selector$trait + colnames(eff) <- map1.output$Marker[,1] + + lod.eff <- lod*sign(eff) + + dat <- map1.output$Trait + dat <- dat[rownames(dat) %in% selector[,1],] + rownames(dat) <- selector$trait + colnames(dat) <- colnames(map1.output$Map) + + map <- map1.output$Map + rownames(map) <- map1.output$Marker[,1] + + marker <- map1.output$Marker + + write.table(lod,file=paste(filename,"_lod.txt",sep=""),sep="\t",quote=F) + write.table(eff,file=paste(filename,"_eff.txt",sep=""),sep="\t",quote=F) + write.table(lod.eff,file=paste(filename,"_lodxeff.txt",sep=""),sep="\t",quote=F) + write.table(marker,file=paste(filename,"_marker.txt",sep=""),sep="\t",quote=F) + write.table(dat,file=paste(filename,"_data.txt",sep=""),sep="\t",quote=F) + write.table(map,file=paste(filename,"_map.txt",sep=""),sep="\t",quote=F) + } # function to convert mapping result to tables + map.per.marker <- function(trait, marker) { + model <- lm(terms(trait ~ marker, keep.order = FALSE)) + summ <- summary(model) + pval <- summ$coefficients[2, 4] + lod <- -log10(pval) + eff <- summ$coefficients[2, 1] + output <- c(lod, eff) + return(output) + } # map the QTL at a marker location + map.all.marker <- function(trait, markers) { + eff.out <- rep(NA, nrow(markers)) + pval.out <- rep(NA, nrow(markers)) + for (i in 1:nrow(markers)) { + if(i == 1) { + out.tmp <- map.per.marker(trait, markers[i, ]) + } + if( i != 1 & sum(abs(as.numeric(markers[i-1,]) - as.numeric(markers[i,])), na.rm = T) != 0 ) { + out.tmp <- map.per.marker(trait, markers[i, ]) + } + if( i != 1 & sum(abs(as.numeric(markers[i-1,]) - as.numeric(markers[i,])), na.rm = T) == 0 ) { + out.tmp <- out.tmp + } + pval.out[i] <- out.tmp[1] + eff.out[i] <- out.tmp[2] + output.lod <- cbind(pval.out, eff.out) + colnames(output.lod) <- c('LOD', 'Eff') + } + return(output.lod) + } # map the QTL using genome wide markers + threshold.determination <- function(trait, strain.map, n.perm){ + + traits <- t(replicate(n.perm, trait)) + perm.trait <- permutate.traits(traits) + + ###Check for NAs + pval.out <- matrix(NA,nrow(perm.trait),nrow(strain.map)) + + for (i in 1:nrow(perm.trait)) { + pval.out[i, ] <- fast.lod.all.marker(perm.trait[i, ], strain.map) + } + + pval.distribution <- apply(pval.out, 1, max) + threshold <- quantile(x = pval.distribution, probs = 0.95) + return(threshold) + } # determine threshold for a QTL + +# load required dataset #### + trait.matrix <- as.matrix(read.csv(file = 'files/trait-matrix.csv', row.names = 1)) + genetic.map <- as.matrix(read.csv(file = 'files/genetic-map.csv')) + trait.matrix <- trait.matrix[, colnames(trait.matrix) %in% colnames(genetic.map)] # remove sample without genetic map + trait.matrix <- trait.matrix[, 17:176] #remove parent sample + genetic.map <- genetic.map[, 17:176] #remove parent sample + marker <- read.csv('files/marker.csv', row.names = 1) + sample.list <- read.csv(file = 'files/sample-list.csv') + sample.stage <- sample.list$stage + gene.info <- read.csv('files/gene.info.csv', row.names = 1) + ril.stage <- substr(x = colnames(trait.matrix), start = 8, stop = 10) + stage <- 'pd' + n.cores <- detectCores() - 1 + +# QTL mapping #### + for (stage in seed.stage) { + map <- genetic.map[, which(ril.stage == stage)] + map <- apply(map, 2, as.numeric) # make sure the alleles are treated as numeric + trait <- trait.matrix[, which(ril.stage == stage)] + + qtl.data <- QTL.data.prep(trait.matrix = trait, + strain.trait = colnames(trait), + strain.map = map, + strain.marker = marker) + cluster <- makeCluster(n.cores, type = "PSOCK") + registerDoParallel(cluster) + output <- foreach(i = 1:nrow(trait), .combine = 'cbind') %dopar% { + map.all.marker(trait = trait[i, ], markers = map) + } + stopCluster(cluster) + + pval.out <- t(output[, which(colnames(output) == 'LOD')]) + eff.out <- t(output[, which(colnames(output) == 'Eff')]) + + colnames(pval.out) <- rownames(marker); rownames(pval.out) <- rownames(trait) + colnames(eff.out) <- rownames(marker); rownames(eff.out) <- rownames(trait) + + qtl.profile <- NULL; qtl.profile <- as.list(qtl.profile) + qtl.profile[[1]] <- round(pval.out,digits=2) + qtl.profile[[2]] <- round(eff.out,digits=3) + qtl.profile[[3]] <- trait + qtl.profile[[4]] <- map + qtl.profile[[5]] <- marker + names(qtl.profile) <- c("LOD","Effect","Trait","Map","Marker") + + write.EleQTL(map1.output = qtl.profile, filename = paste0("qtl-tables/table_single-stage-eqtl_", stage)) + #saveRDS(object = qtl.profile, + #file = paste0("qtl-profiles/profile_single-stage-eqtl_", stage,".rds")) + } + +# permutation and FDR determination + ## based on Benjamini-Yekutieli + # setwd(dir = "qtl-permutations/") + # cluster <- makeCluster(n.cores, type = "PSOCK") + # registerDoParallel(cluster) + # foreach(i = 1:100) %dopar% { + # qtl.perm <- map.1.perm(trait.matrix = trait, + # strain.map = map, + # strain.marker = marker, + # n.perm = 1) + # save(qtl.perm, file = paste0("perm_single-stage-eqtl_", stage, i, ".RData")) + # } + # stopCluster(cluster) + # + # filenames.perm <- dir() + # filenames.perm <- filenames.perm[grep(paste("perm_single-stage-eqtl", stage, sep = "."), filenames.perm)] + # + # FDR <- map.perm.fdr(map1.output = qtl.profile, + # filenames.perm = filenames.perm, + # FDR_dir = paste0(getwd(), "/"), + # q.value = 0.05) + # saveRDS(FDR, file = paste0('fdr_single-stage-eqtl_', stage, '.RDS')) + # + # setwd(work.dir) + # + +# eQTL peak finder and table #### + + for (stage in seed.stage) { + # threshold + threshold <- ifelse(stage == 'pd' | stage == 'ar', 4.2, ifelse(stage == 'im', 4.1, ifelse(stage =='rp', 4.3, NA))) + # the thresholds are based on multiple-testing correction using 100 permuted datasets + qtl.profile <- readRDS(paste0('qtl-profiles/profile_single-stage-eqtl_', stage, '.rds')) + qtl.peak <- mapping.to.list(map1.output = qtl.profile) %>% + peak.finder(threshold = threshold) + qtl.peak <- na.omit(qtl.peak) + saveRDS(object = qtl.peak, + file = paste0("qtl-peaks/peak_single-stage-eqtl_", stage, ".rds")) + +# eQTL table #### + qtl.profile <- readRDS(paste0('qtl-profiles/profile_single-stage-eqtl_', stage, '.rds')) + qtl.peak <- readRDS(paste0('qtl-peaks/peak_single-stage-eqtl_', stage, '.rds')) + eqtl.table <- eQTL.table(peak.list.file = qtl.peak, trait.annotation = gene.info) %>% + eQTL.table.addR2(QTL.prep.file = qtl.profile) + eqtl.table$qtl_chromosome <- as.factor(eqtl.table$qtl_chromosome) + eqtl.table$gene_chromosome <- as.factor(eqtl.table$gene_chromosome) + + # add heritability - single stage #### + + h2.result <- as.list(NULL) + n.perm <- 100 + + qtl.genes <- eqtl.table$trait + + map <- genetic.map[, which(ril.stage == stage)] + map <- apply(map, 2, as.numeric) # make sure the alleles are treated as numeric + trait <- trait.matrix[, which(ril.stage == stage)] + + map2 <- map + #map2[map2 == 0] <- 0.5 + map2[map2 == -1] <- round(0, 0) + #map2[is.na(map2)] <- 0.5 + kinship.matrix <- emma.kinship(map2) + colnames(kinship.matrix) <- colnames(map2); rownames(kinship.matrix) <- colnames(map2) + + cluster <- makeCluster(n.cores, type = 'PSOCK') + clusterExport(cl = cluster, c('trait', 'map2', 'marker_h2', 'kinship.matrix', 'h2.REML')) + h2 <- t(parApply(cl = cluster, X = trait, MARGIN = 1, FUN = h2.REML, strain.names = colnames(map2), kinship.matrix = kinship.matrix, Vg.factor = 1)) + stopCluster(cluster) + + h2 <- cbind.data.frame(trait = rownames(h2), h2) + eqtl.table <- merge(x = eqtl.table, y = h2[, 1:2], by = 'trait', all.x = T) + #eqtl.table <- rename(.data = eqtl.table, h2_REML = h2) + + ###permutation + # print(paste0('permuting', " ", stage)) + # + # cluster <- makeCluster(n.cores, type = "PSOCK") + # registerDoParallel(cluster) + # perm.output <- foreach(i = 1:n.perm, .combine = 'cbind', + # .export = c('trait.tmp', 'map.tmp', 'marker_h2', 'kinship.matrix', 'h2.REML')) %dopar% { + # perm.trait <- t(apply(trait.tmp, 1, function(x){x <- x[order(runif(length(x)))];return(x)})) + # t(apply(perm.trait, 1, h2.REML, strain.names = colnames(map.tmp2), kinship.matrix = kinship.matrix, Vg.factor = 1))[, 1] + # } + # stopCluster(cluster) + # perm.result <- cbind(h2, FDR0.05_REML = apply(perm.output, 1, quantile, 0.95)) + # + # h2.result[[stage]] <- perm.result + # + # + # for (stage in development.stage) { + # h2.result[[stage]] <- cbind.data.frame(h2.result[[stage]], trait = rownames(h2.result[[stage]])) + # } + +# trans-bands identification #### + window.nu <- 2e6 + maxsize <- 100e6 + chr.num <- 5 + + transband.id <- mutate(eqtl.table, interval = findInterval(qtl_bp, seq(1, maxsize, by = window.nu))) %>% + group_by(qtl_chromosome, interval, qtl_type) %>% + summarise(n.ct = length(unique(trait))) %>% + data.frame() %>% + group_by(qtl_type) %>% + mutate(exp.ct = mean(as.numeric(unlist(n.ct)))) %>% + data.frame() %>% + mutate(transband_significance = ppois(n.ct, lambda = exp.ct, lower.tail = F)) %>% + filter(transband_significance < 0.0001, qtl_type == "trans") + + transband.id$transband_id <- with(transband.id, paste0("ch", qtl_chromosome, ":", + (interval - 1) * 2, "-", + interval * 2, "Mb")) + transband.id$stage <- stage + + saveRDS(object = transband.id, + file = paste0("trans-bands/trans.band_", stage, ".rds")) + + # for pd + if(stage == 'pd') { + eqtl.table <- mutate(eqtl.table, + trans_band = ifelse(qtl_type == "trans" & + qtl_chromosome == 1 & + qtl_bp > 6e6 & + qtl_bp <= 10e6, "ch1:6-10Mb", + ifelse(qtl_type == "trans" & + qtl_chromosome == 3 & + qtl_bp > 8e6 & + qtl_bp <= 12e6, "ch3:8-12Mb", "none"))) + } + + # for ar + if( stage == 'ar') { + eqtl.table <- mutate(eqtl.table, + trans_band = ifelse(qtl_type == "trans" & + qtl_chromosome == 2 & + qtl_bp > 12e6 & + qtl_bp <= 14e6, "ch2:12-14Mb", + ifelse(qtl_type == "trans" & + qtl_chromosome == 3 & + qtl_bp > 2e6 & + qtl_bp <= 4e6, "ch3:2-4Mb", "none"))) + } + + # for im + if( stage == 'im' ) { + eqtl.table <- mutate(eqtl.table, + trans_band = ifelse(qtl_type == "trans" & + qtl_chromosome == 5 & + qtl_bp > 6e6 & + qtl_bp <= 8e6, "ch5:6-8Mb", + ifelse(qtl_type == "trans" & + qtl_chromosome == 5 & + qtl_bp > 22e6 & + qtl_bp <= 26e6, "ch5:22-26Mb","none"))) + } + + + # for rp + if(stage == 'rp') { + eqtl.table <- mutate(eqtl.table, + trans_band = ifelse(qtl_type == "trans" & + qtl_chromosome == 1 & + qtl_bp > 0e6 & + qtl_bp <= 2e6, "ch1:0-2Mb ", + ifelse(qtl_type == "trans" & + qtl_chromosome == 1 & + qtl_bp > 6e6 & + qtl_bp <= 8e6, "ch1:6-8Mb", + ifelse(qtl_type == "trans" & + qtl_chromosome == 5 & + qtl_bp > 14e6 & + qtl_bp <= 16e6, "ch5:14-16Mb", + ifelse(qtl_type == "trans" & + qtl_chromosome == 5 & + qtl_bp > 24e6 & + qtl_bp <=26e6, "ch5:24-26Mb", "none"))))) + } + + write.csv(x = eqtl.table, + file = paste0("qtl-tables/table_single-stage-eqtl_", stage, ".csv"), + row.names = T) + saveRDS(object = eqtl.table, + file = paste0("qtl-tables/table_single-stage-eqtl_", stage, ".rds")) + table(eqtl.table$qtl_type, eqtl.table$trans_band!="none") + } + # GO for trans bands - single stage #### + ontology <- 'BP' #c('BP', 'CC', 'MF') + + x <- org.At.tairCHR + all.genes <- as.list(rownames(trait.matrix)) + transband.go <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 9)) + colnames(transband.go) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher', + 'FDR', 'stage', 'transband') + transband.go.perstage <- transband.go + + for (stage in seed.stage) { + eqtl.table <- readRDS(paste0('qtl-tables/table_single-stage-eqtl_', stage, '.rds')) + transband.id <- unique(eqtl.table$trans_band) + transband.id <- transband.id[!transband.id %in% 'none'] + transband.go.perstage <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 9)) + colnames(transband.go.perstage) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher', + 'FDR', 'stage', 'transband') + + for (i in 1:length(transband.id)) { + gene.set <- eqtl.table[which(eqtl.table$trans_band == transband.id[i]), 'trait'] + gene.set <- factor(as.integer(all.genes %in% gene.set)) + names(gene.set) <- all.genes + GOdata <- new("topGOdata", + description = "GOE for genes in regulated by trans bands", ontology = ontology, + allGenes = gene.set, + annot = annFUN.org,mapping= "org.At.tair.db") + resultFisher <- runTest(GOdata, algorithm = 'weight', statistic = "fisher") + result.df <- GenTable(GOdata, Fisher = resultFisher, + orderBy = "Fisher", ranksOf = "Fisher", topNodes = length(resultFisher@score)) + result.df$stage <- stage + result.df$transband <- transband.id[i] + result.df$Fisher <- as.numeric(result.df$Fisher) + result.df$FDR <- p.adjust(p = result.df$Fisher, method = 'fdr') + result.df <- result.df[order(result.df$FDR), ] + result.df <- dplyr::filter(result.df, Fisher <= 0.01) + transband.go.perstage <- rbind(transband.go.perstage, result.df) + } + transband.go <- rbind(transband.go, transband.go.perstage) + } + + write.csv(transband.go, paste0('files/trans-bands-go-', ontology, '.csv')) + \ No newline at end of file diff --git a/3-eqtl-visualization.R b/3-eqtl-visualization.R new file mode 100644 index 0000000..3760182 --- /dev/null +++ b/3-eqtl-visualization.R @@ -0,0 +1,469 @@ +################################################## +##### visualisation of eQTL analysis results ##### +################################################## + +#### prepare the script working environment #### + remove(list = ls()) + gc() + set.seed(1000) + + # Set working directory #### + work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl" + setwd(work.dir) + + # dependencies #### + library(dplyr) + library(ggplot2) + library(tidyr) + library(doParallel) + library(VennDiagram) + library(UpSetR) + library(forcats) + library(grid) + + library(heritability) + library(topGO) + library(org.At.tair.db) + # unused libraries #### + # library(Mfuzz) + # library(Biobase) + # library(gplots) + # library(RColorBrewer) + # library(limma) + # library(factoextra) + # library(stats) + # library(amap) + # + # + # + # library(gridExtra) + # library(RCy3) + # library(corrplot) + # library(reshape2) + # library(Hmisc) + # library(igraph) + # library(threejs) + # library(MASS) + # + + + # load required function #### + setwd('functions/') + for(i in 1:length(dir())){ + source(dir()[i]) + } # read function from Mark + setwd(work.dir) + + overlap <- function(table, stage) { + for (i in 1:length(stage)) { + if (nrow(table) != 0) { + table <- subset(table, table[stage[i]] == T) + } else { + break + } + } + nrow(table) + } # function to determine the overlapping qtl between 2 stages + + # load the data #### + seed.stage <- c('pd', 'ar', 'im', 'rp') + + + # presentation + presentation <- theme(axis.text.x = element_text(size=10, face="bold", color="black"), + axis.text.y = element_text(size=10, face="bold", color="black"), + axis.title.x = element_text(size=12, face="bold", color="black"), + axis.title.y = element_text(size=12, face="bold", color="black"), + strip.text.x = element_text(size=12, face="bold", color="black"), + strip.text.y = element_text(size=12, face="bold", color="black"), + strip.text = element_text(size =12, face="bold", color="black"), + #plot.title = element_text(size=15, face="bold"), + panel.background = element_rect(fill = "white",color="black"), + panel.grid.major = element_line(colour = "grey80"), + panel.grid.minor = element_blank()) + +# combine the eqtl table and plotting - single stage #### + + all.eqtl.table <- data.frame(matrix(ncol = 20, nrow = 0)) + #colnames(all.eqtl.table) <- colnames(eqtl.table) + + for (stage in seed.stage) { + print(stage) + eqtl.table <- readRDS(paste0('qtl-tables/table_single-stage-eqtl_', stage, '.rds')) %>% + mutate(stage = stage) + all.eqtl.table <- rbind(all.eqtl.table, eqtl.table) + } + + all.eqtl.table <- all.eqtl.table %>% + mutate(qtl_type = ifelse(qtl_type == 'cis', 'local', 'distant')) + + saveRDS(all.eqtl.table, 'qtl-tables/table_single-stage-eqtl_all.rds') + +# comparing LOD, effect, and R2 of shared local and distant eQTL - single stage #### + + eqtl.stat <- group_by(all.eqtl.table, stage, qtl_type) %>% + summarise(median_sig = round(median(qtl_significance), 2), + #sd_sig = sd(qtl_significance), + median_eff = round(median(abs(qtl_effect)), 2), + #sd_eff = sd(qtl_effect), + median_r2 = round(median(qtl_R2_sm), 2)) %>% + #sd_r2 = sd(qtl_R2_sm),) %>% + mutate(pval_sig = NA, pval_eff = NA, pval_r2 = NA) %>% + as.data.frame() + + for (i in 1:nrow(eqtl.stat)) { + stage.row <- which(all.eqtl.table$stage == eqtl.stat[i, 1]) + eqtl.stat[i, 6] <- wilcox.test(qtl_significance ~ qtl_type, data = all.eqtl.table[stage.row, ])$p.value + eqtl.stat[i, 7] <- wilcox.test(abs(qtl_effect) ~ qtl_type, data = all.eqtl.table[stage.row, ])$p.value + eqtl.stat[i, 8] <- wilcox.test(qtl_R2_sm ~ qtl_type, data = all.eqtl.table[stage.row, ])$p.value + } + +# calculating shared local and distant eQTL #### + + qtl.table.summary <- data.frame(matrix(ncol = 18, nrow = 0)) + window.nu <- 4e6 + maxsize <- 100e6 + chr.num <- 5 + + all.eqtl.table2 <- mutate(all.eqtl.table, interval = findInterval(qtl_bp, seq(1, maxsize, by = window.nu))) + all.eqtl.table2 <- subset(x = all.eqtl.table2, select = c(trait, qtl_chromosome, interval, qtl_type, + stage)) + + local.table <- all.eqtl.table2[all.eqtl.table2$qtl_type == 'local', ] %>% + count(trait, qtl_chromosome, interval, qtl_type, stage) %>% + spread(stage, n) %>% + dplyr::select(-qtl_type) %>% + arrange(trait) + duplicated.local <- duplicated(local.table$trait) + + for (i in 1:nrow(local.table)) { + if (i == 1) { + next + } + if (duplicated.local[i] && + local.table$qtl_chromosome[i] == local.table$qtl_chromosome[i-1] && + abs(local.table$interval[i] - local.table$interval[i-1]) == 1 && + local.table$trait[i] == local.table$trait[i-1]) { + print(i) + table.tmp <- rbind(local.table[i, ], local.table[i-1, ]) + table.tmp <- aggregate(table.tmp[4:7], by = list(trait = table.tmp$trait), sum, na.rm = TRUE) + table.tmp$qtl_chromosome <- local.table$qtl_chromosome[i] + table.tmp$interval <- local.table$interval[i] + table.tmp <- table.tmp[, c(colnames(local.table))] + local.table[(i-1), 1] <- 'removed' + local.table[i, ] <- table.tmp + } + } + + local.table <- local.table[which(local.table$trait != 'removed'), ] + local.table[is.na(local.table)] <- 0 + write.csv(local.table, 'files/shared-local.csv') + local.table <- local.table[, 4:7] + local.table <- apply(local.table, 2, as.integer) + + distant.table <- all.eqtl.table2[all.eqtl.table2$qtl_type == 'distant', ] %>% + count(trait, qtl_chromosome, interval, qtl_type, stage) %>% + spread(stage, n) %>% + dplyr::select(-qtl_type) %>% + arrange(trait) + duplicated.distant <- duplicated(distant.table$trait) + + for (i in 1:nrow(distant.table)) { + if (i == 1) { + next + } + if (duplicated.local[i] && + distant.table$qtl_chromosome[i] == distant.table$qtl_chromosome[i-1] && + abs(distant.table$interval[i] - distant.table$interval[i-1]) == 1 && + distant.table$trait[i] == distant.table$trait[i-1]) { + print(i) + table.tmp <- rbind(distant.table[i, ], distant.table[i-1, ]) + table.tmp <- aggregate(table.tmp[4:7], by = list(trait = table.tmp$trait), sum, na.rm = TRUE) + table.tmp$qtl_chromosome <- distant.table$qtl_chromosome[i] + table.tmp$interval <- distant.table$interval[i] + table.tmp <- table.tmp[, c(colnames(distant.table))] + distant.table[(i-1), 1] <- 'removed' + distant.table[i, ] <- table.tmp + } + } + + distant.table <- distant.table[which(distant.table$trait != 'removed'), ] + distant.table[is.na(distant.table)] <- 0 + write.csv(distant.table, 'files/shared-distant.csv') + distant.table <- distant.table[, 4:7] + distant.table <- apply(distant.table, 2, as.integer) + + #subset(qtl.table.cis, qtl.table.cis[development.stage[1]] == T) + + # venn diagram #### + + draw.quad.venn(area1 = overlap(local.table, 'pd'), + area3 = overlap(local.table, 'ar'), + area4 = overlap(local.table, 'im'), + area2 = overlap(local.table, 'rp'), + n13 = overlap(local.table, c('pd', 'ar')), + n14 = overlap(local.table, c('pd', 'im')), + n12 = overlap(local.table, c('pd', 'rp')), + n34 = overlap(local.table, c('ar', 'im')), + n23 = overlap(local.table, c('ar', 'rp')), + n24 = overlap(local.table, c('im', 'rp')), + n134 = overlap(local.table, c('pd', 'ar', 'im')), + n123 = overlap(local.table, c('pd', 'ar', 'rp')), + n124 = overlap(local.table, c('pd', 'im', 'rp')), + n234 = overlap(local.table, c('ar', 'im', 'rp')), + n1234 = overlap(local.table, c('pd', 'ar', 'im', 'rp')), + category = c('primary\ndormant', 'radicle\nprotrusion', 'after-\nripenned', '6 hours after\nimbibition'), + lty = 'blank', + fill = c('#4daf4a', '#e41a1c', '#377eb8', '#984ea3'), + alpha = 0.5) + + draw.quad.venn(area1 = overlap(distant.table, 'pd'), + area3 = overlap(distant.table, 'ar'), + area4 = overlap(distant.table, 'im'), + area2 = overlap(distant.table, 'rp'), + n13 = overlap(distant.table, c('pd', 'ar')), + n14 = overlap(distant.table, c('pd', 'im')), + n12 = overlap(distant.table, c('pd', 'rp')), + n34 = overlap(distant.table, c('ar', 'im')), + n23 = overlap(distant.table, c('ar', 'rp')), + n24 = overlap(distant.table, c('im', 'rp')), + n134 = overlap(distant.table, c('pd', 'ar', 'im')), + n123 = overlap(distant.table, c('pd', 'ar', 'rp')), + n124 = overlap(distant.table, c('pd', 'im', 'rp')), + n234 = overlap(distant.table, c('ar', 'im', 'rp')), + n1234 = overlap(distant.table, c('pd', 'ar', 'im', 'rp')), + category = c('primary\ndormant', 'radicle\nprotrusion', 'after-\nripenned', '6 hours after\nimbibition'), + lty = 'blank', + fill = c('#4daf4a', '#e41a1c', '#377eb8', '#984ea3'), + alpha = 0.5, ) + + pdf(file = 'figures/distant-local-venn-diagram.pdf', width = 10, height = 5) + grid.arrange(local.plot, distant.plot, ncol = 2, heights = c(5, 5)) + dev.off() + + # upset graph #### + + upset.local <- upset(data = as.data.frame(local.table), + mainbar.y.max = 750, + sets = c('rp', 'im', 'ar', 'pd'), + empty.intersections = 'on', + mainbar.y.label = 'shared eQTL', + sets.x.label = 'eQTL per stage', + text.scale = 0.8, + point.size = 1, + line.size = 1, + keep.order = T, + set_size.scale_max = 1400) + + tiff(file = paste0("figures/upset-local.tiff"), + width = 1000, + height = 850, + units = 'px', + res = 300, + compression = 'lzw') + upset.local + dev.off() + + upset.distant <- upset(data = as.data.frame(distant.table), + mainbar.y.max = 750, + sets = c('rp', 'im', 'ar', 'pd'), + empty.intersections = 'on', + mainbar.y.label = 'shared eQTL', + sets.x.label = 'eQTL per stage', + text.scale = 0.8, + point.size = 1, + line.size = 1, + keep.order = T, + set_size.scale_max = 1400) + + tiff(file = paste0("figures/upset-distant.tiff"), + width = 2250, + height = 850, + units = 'px', + res = 300, + compression = 'lzw') + grid.arrange(upset.local, upset.distant, ncol = 2) + dev.off() + +# cis-trans plot - single stage ##### + + all.eqtl.table3 <- all.eqtl.table %>% + mutate(stage = replace(stage, qtl_type == 'local', 'local\neQTL')) + #all.eqtl.table3$stage <- ordered(all.eqtl.table$stage, seed.stage) + + all.eqtl.table3$qtl_type <- factor(all.eqtl.table3$qtl_type) + all.eqtl.table3 <- all.eqtl.table3 %>% + mutate(allele = ifelse(sign(qtl_effect) <= 0, 'sha', + ifelse(sign(qtl_effect) >=0, 'bay', 'null'))) + + eqtl.plot <- ggplot(all.eqtl.table3, aes(x = qtl_bp, y = gene_bp)) + + geom_segment(aes(x = qtl_bp_left, y = gene_bp, xend = qtl_bp_right, yend = gene_bp), + alpha = 0.75, colour = "grey") + + geom_point(aes(colour = ordered(stage, levels = c("local\neQTL", "pd", "ar", "im", "rp")), + shape = allele, + size = abs(qtl_effect), + alpha = log10(qtl_significance)) + ) + + scale_alpha(range = c(0.3, 1)) + + facet_grid(fct_rev(gene_chromosome) ~ qtl_chromosome, space = "free", scales = "free") + + presentation + + scale_colour_manual('stage', values = c('black','#ccbb44', '#228833', '#4477aa', '#cc3311')) + + labs(x = "eQTL peak position (Mb)", y = "Gene position (Mb)") + + scale_x_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40)) + + scale_y_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40)) + + tiff(file = paste0("figures/eqtl-plot.tiff"), + width = 2250, + height = 1600, + units = 'px', + res = 300, + compression = 'lzw') + eqtl.plot + dev.off() + +# histogram - single stage #### + + hist.threshold <- data.frame(stage = seed.stage, exp = c(7.45098, 5.464286, 7, 6.470588), threshold = rep(NA, 4)) + for (i in 1:nrow(hist.threshold)) { + hist.threshold$threshold[i] <- qpois(p = 0.0001, lambda = hist.threshold$exp[i], lower.tail = F) + } # create horizontal line as a threshold + + + eqtl.hist <- ggplot(all.eqtl.table3 %>% filter(qtl_type != 'local'), + aes(x=qtl_bp, fill=ordered(stage, levels = c("pd", "ar", "im", "rp")))) + # 750 x 400 + geom_histogram(binwidth = 2000000, right = T, origin = 0, alpha = 1) + + facet_grid(factor(stage, levels = c("pd", "ar", "im", "rp")) ~ qtl_chromosome, space = "free",scales="free") + + presentation + + scale_fill_manual('stage', values = c('#ccbb44', '#228833', '#4477aa', '#cc3311')) + + theme(legend.position = "right", legend.text=element_text(size=10)) + + labs(x="eQTL peak position (Mb)",y="eQTL counts") + + ylim(c(0, 100)) + + geom_hline(data = hist.threshold, aes(yintercept = threshold), + linetype = 'dashed', + color = 'red', + size = .1) + + scale_x_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40)) + + tiff(file = paste0("figures/eqtl-hist.tiff"), + width = 2250, + height = 1200, + units = 'px', + res = 300, + compression = 'lzw') + eqtl.hist + dev.off() + +# compare the R2, effect, andsignificance of local and distant eQTL #### + r2.plot <- ggplot(all.eqtl.table, aes(x = qtl_R2_sm, + fill = factor(qtl_type, levels = c('local', 'distant')), + color = factor(qtl_type, levels = c('local', 'distant')))) + + #geom_histogram(aes(y = ..density..), binwidth = 0.001, position = 'identity', alpha = 0.3) + + geom_density(alpha = 0.5, size = 0) + + facet_wrap( ~ factor(stage, levels = c('pd', 'ar', 'im', 'rp')), ncol = 2) + + geom_vline(data = group_by(all.eqtl.table, stage, qtl_type) %>% + summarise(median = median(qtl_R2_sm)), aes(xintercept = median, + color = factor(qtl_type, levels = c('local', 'distant'))), + linetype = 'dashed', size = .5) + + presentation + + scale_color_brewer(palette = 'Set1') + + scale_fill_brewer(palette = 'Set1') + + xlab('explained phenotypic variance (R2)') + + ylab('density') + + labs(fill = 'eQTL type', color = 'eQTL type') + + #ylim(0, 1) + + xlim(0, 1) + + theme(strip.text.x = element_text(size = 12)) + + tiff(file = paste0("figures/r2-plot.tiff"), + width = 2250, + height = 1200, + units = 'px', + res = 300, + compression = 'lzw') + r2.plot + dev.off() + + effect.plot <- ggplot(all.eqtl.table, aes(x = log(abs(qtl_effect)), + fill = factor(qtl_type, levels = c('local', 'distant')), + color = factor(qtl_type, levels = c('local', 'distant')))) + + #geom_histogram(aes(y = ..density..), binwidth = 0.001, position = 'identity', alpha = 0.3) + + geom_density(alpha = 0.5, size = 0) + + facet_wrap( ~ factor(stage, levels = c('pd', 'ar', 'im', 'rp')), ncol = 2) + + geom_vline(data = group_by(all.eqtl.table, stage, qtl_type) %>% + summarise(median = median(log(abs(qtl_effect)))), aes(xintercept = median, + color = factor(qtl_type, levels = c('local', 'distant'))), + linetype = 'dashed', size = .5) + + presentation + + scale_color_brewer(palette = 'Set1') + + scale_fill_brewer(palette = 'Set1') + + xlab('absolute eQTL effect') + + ylab('density') + + labs(fill = 'eQTL type', color = 'eQTL type') + + #ylim(0, 1) + + #xlim(0, 1.2) + + theme(strip.text.x = element_text(size = 12)) + + tiff(file = paste0("figures/eff-plot.tiff"), + width = 2250, + height = 1200, + units = 'px', + res = 300, + compression = 'lzw') + effect.plot + dev.off() + + + sig.plot <- ggplot(all.eqtl.table, aes(x = qtl_significance, + fill = factor(qtl_type, levels = c('local', 'distant')), + color = factor(qtl_type, levels = c('local', 'distant')))) + + #geom_histogram(aes(y = ..density..), binwidth = 0.001, position = 'identity', alpha = 0.3) + + geom_density(alpha = 0.5, size = 0) + + facet_wrap( ~ factor(stage, levels = c('pd', 'ar', 'im', 'rp')), ncol = 2) + + geom_vline(data = group_by(all.eqtl.table, stage, qtl_type) %>% + summarise(median = median(qtl_significance)), aes(xintercept = median, + color = factor(qtl_type, levels = c('local', 'distant'))), + linetype = 'dashed', size = .5) + + presentation + + scale_color_brewer(palette = 'Set1') + + scale_fill_brewer(palette = 'Set1') + + xlab('-log10(p)') + + ylab('density') + + labs(fill = 'eQTL type', color = 'eQTL type') + + #ylim(0, 1) + + xlim(0, 36) + + theme(strip.text.x = element_text(size = 12)) + + tiff(file = paste0("figures/sig-plot.tiff"), + width = 2250, + height = 1200, + units = 'px', + res = 300, + compression = 'lzw') + sig.plot + dev.off() + + # ggplot(all.eqtl.table, aes(x = h2_REML, y = qtl_R2_sm, + # fill = factor(qtl_type, levels = c('local', 'distant')), + # color = factor(qtl_type, levels = c('local', 'distant')))) + + # geom_point(size = 1, alpha = 0.5) + + # facet_wrap( ~ factor(stage, levels = c('pd', 'ar', 'im', 'rp')), ncol = 2) + + # geom_vline(data = group_by(all.eqtl.table, stage, qtl_type) %>% + # summarise(med = median(h2_REML, na.rm = T)), aes(xintercept = med, + # color = factor(qtl_type, levels = c('local', 'distant'))), + # linetype = 'dashed', size = .5) + + # geom_hline(data = group_by(all.eqtl.table, stage, qtl_type) %>% + # summarise(med = median(qtl_R2_sm, na.rm = T)), aes(yintercept = med, + # color = factor(qtl_type, levels = c('local', 'distant'))), + # linetype = 'dashed', size = .5) + + # geom_smooth(model = lm) + + # geom_abline(aes(slope = 1, intercept = 0), size = .75, linetype = 'dashed') + + # presentation + + # scale_color_brewer(palette = 'Set1') + + # scale_fill_brewer(palette = 'Set1') + + # xlab('narrow-sense heritabllity (h2)') + + # ylab('explained phenotypic variance (R2)') + + # labs(fill = 'eQTL type', color = 'eQTL type') + + # ylim(0, 1) + + # xlim(0, 1) + + # theme(strip.text.x = element_text(size = 8)) + \ No newline at end of file diff --git a/4-qtl-integration.R b/4-qtl-integration.R new file mode 100644 index 0000000..e391455 --- /dev/null +++ b/4-qtl-integration.R @@ -0,0 +1,148 @@ +################################################## +##### integration of phQTL, mQTL, and eQTL ####### +################################################## + +# Set working directory and load libraries #### + remove(list = ls()) + gc() + +# load library + library(dplyr) + library(grid) + library(scales) + library(RColorBrewer) + library(devtools) + library(ggplot2) + +# set directory + work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl" + setwd(work.dir) + +# gathering phQTL profile + phqtl.peak <- readRDS('qtl-peaks/peak_phqtl_without-outliers-not-transformed.RDS') %>% + mutate(qtl_level = 'phQTL') %>% + mutate(stage = 'phQTL') + +# gathering mQTL profile + seed.stage <- c('pd', 'ar', 'im', 'rp') + mqtl.peak <- data.frame(matrix(nrow = 0, ncol = 12)) + colnames(mqtl.peak) <- colnames(phqtl.peak) + for (i in seed.stage) { + table.tmp <- readRDS(paste0('qtl-peaks/peak_mqtl_', i, '-without-outliers-log-transformed.RDS')) %>% + mutate(qtl_level = 'mQTL', stage = i) + + mqtl.peak <- rbind(mqtl.peak, table.tmp) + } + +# gathering eQTL profiles + + eqtl.peak <- data.frame(matrix(nrow = 0, ncol = 13)) + colnames(eqtl.peak) <- colnames(phqtl.peak) + for (i in seed.stage) { + table.tmp <- readRDS(paste0('qtl-tables/table_single-stage-eqtl_', i, '.rds')) %>% + filter(qtl_type == 'trans') %>% + mutate(qtl_level = 'eQTL', stage = i) %>% + dplyr::select(colnames(phqtl.peak)) + eqtl.peak <- rbind(eqtl.peak, table.tmp) + } + +# creating the histogram plot #### + + # plot style #### + presentation <- theme(axis.text.x = element_text(size=12, face="bold", color="black"), + axis.text.y = element_text(size=12, face="bold", color="black"), + axis.title.x = element_text(size=15, face="bold", color="black"), + axis.title.y = element_text(size=15, face="bold", color="black"), + strip.text.x = element_text(size=15, face="bold", color="black"), + strip.text.y = element_text(size=15, face="bold", color="black"), + plot.title = element_text(size=15, face="bold"), + panel.background = element_rect(fill = "white",color="black"), + panel.grid.major = element_line(colour = "grey80"), + panel.grid.minor = element_blank(), + legend.position = "right") + + myColors <- brewer.pal(9,"Set1")[c(3,5,9,6,7)] + myColors <- c("black",brewer.pal(9,"Set1")[c(2,9)]) + plot.colour <- c("#000000", "#E69F00", "#009E73", "#0072B2", "#D55E00") + names(plot.colour) <- c("cis", "DryFresh", "DryAR", "6H", "RP") + hist.colour <- c("#E69F00", "#009E73", "#0072B2", "#D55E00") + names(hist.colour) <- c("DryFresh", "DryAR", "6H", "RP") + names(myColors) <- c("cis","trans","none") + Snoekcols <- scale_colour_manual(name = "eQTL type",values = myColors) + Snoekfill <- scale_fill_manual(name = "eQTL type",values = myColors) + +# plotting #### + + overlap <- function(table, stage) { + for (i in 1:length(stage)) { + if (nrow(table) != 0) { + table <- subset(table, table[stage[i]] == T) + } else { + break + } + } + nrow(table) + } + + all.peak <- rbind(phqtl.peak, mqtl.peak, eqtl.peak) + all.peak$stage <- factor(all.peak$stage, levels = c('phQTL', 'pd', 'ar', 'im', 'rp')) + qtl.hist <- ggplot(all.peak, aes(x=qtl_bp, fill = stage)) + + geom_histogram(binwidth = 2000000, right = T, origin = 0, alpha = 1) + + #geom_density(alpha = 0.5) + + facet_grid(factor(qtl_level, level = c('phQTL', 'mQTL', 'eQTL')) ~ qtl_chromosome, scales="free_y") + + presentation + + scale_fill_manual(values = c('#964B00', '#ccbb44', '#228833', '#4477aa', '#cc3311')) + + theme(legend.position = "top", legend.text=element_text(size=5)) + + labs(x="QTL peak position (Mb)",y="QTL counts") + + scale_x_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40)) + + tiff(file = paste0("figures/all-qtl-hist.tiff"), + width = 2250, + height = 1600, + units = 'px', + res = 300, + compression = 'lzw') + qtl.hist + dev.off() + + # venn diagram #### + + qtl.table.summary <- data.frame(matrix(ncol = 18, nrow = 0)) + window.nu <- 2e6 + maxsize <- 100e6 + chr.num <- 5 + + all.peak2 <- all.peak %>% + mutate(interval = findInterval(qtl_bp, seq(1, maxsize, by = window.nu))) %>% + select(trait, qtl_chromosome, interval, qtl_significance, stage, qtl_level) %>% + #group_by(trait, qtl_chromosome, interval, qtl_significance, stage, qtl_level) %>% + count(qtl_chromosome, interval, qtl_level) %>% + #ungroup() %>% + spread(qtl_level, n) %>% + #select(phQTL, mQTL, eQTL) %>% + replace(is.na(.), 0) + + overlap <- function(table, stage) { + for (i in 1:length(stage)) { + if (nrow(table) != 0) { + table <- subset(table, table[stage[i]] > 0) + } else { + break + } + } + nrow(table) + } + overlap(all.peak2, c('eQTL', 'phQTL', 'mQTL')) + draw.triple.venn(area1 = overlap(all.peak2, 'phQTL'), + area3 = overlap(all.peak2, 'mQTL'), + area2 = overlap(all.peak2, 'eQTL'), + n13 = overlap(all.peak2, c('phQTL', 'mQTL')), + n12 = overlap(all.peak2, c('phQTL', 'eQTL')), + n23 = overlap(all.peak2, c('mQTL', 'eQTL')), + n123 = overlap(all.peak2, c('phQTL', 'mQTL', 'eQTL')), + category = c('phQTL', 'eQTL', 'mQTL'), + lty = 'blank', + fill = c('#0d98ba', '#c71585', '#f8d568')) + #alpha = 0.9) + + \ No newline at end of file diff --git a/5-create-community-network.R b/5-create-community-network.R new file mode 100644 index 0000000..cc387f6 --- /dev/null +++ b/5-create-community-network.R @@ -0,0 +1,302 @@ +####################################################### +##### seed eQTL network analysis using ARACNe ######### +####################################################### + +# prepare the script working environment #### + remove(list = ls()) + gc() + set.seed(1000) + + work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl" + setwd(work.dir) + +# library + library(RCy3) + library(dplyr) + library(minet) + library(GENIE3) + library(doParallel) + library(reshape2) + library(devtools) + library(ggplot2) + library(Hmisc) + +# determine the stage and transband #### + dev.stage <- 'im' + chr <- 5 + start <- 6000000 + end <- 8000000 + n.perm <- 1000 + +# load required data #### + + # gene and genetic infp + genetic.map <- as.matrix(read.csv(file = 'files/genetic-map.csv')) + gene.info <- read.csv('files/gene.info.csv', row.names = 1) + gene.feature <- read.csv('files/gene-feature.csv') + gene.pol <- read.csv('files/baysha-polymorphism.csv', row.names = 1) + gene.data <- merge(gene.info, gene.pol, all = TRUE) + gene.feature <- merge(gene.info, gene.feature, all = TRUE) + + # expression matrix + expression <- as.matrix(read.csv('files/trait-matrix.csv', row.names = 1)) + expression <- expression[, colnames(expression) %in% colnames(genetic.map)] # remove sample without genetic map + expression <- expression[, 17:176] # remove parents + expression <- expression[, grep(pattern = 'pd', x = colnames(expression))] # only take ril for specific stage + + # ril stage + ril.stage <- substr(x = colnames(expression), start = 8, stop = 10) # write a vector of ril stage + + # load the qtl table + eqtl.table <- readRDS('qtl-tables/table_single-stage-eqtl_all.rds') + + # select the target trait + target.table <- filter(eqtl.table, qtl_bp >= start, + qtl_type == 'distant', + qtl_bp <= end, + qtl_chromosome == chr, + stage == dev.stage) + targets <- as.character(target.table$trait) + target.expression <- expression[which(rownames(expression) %in% targets), ] + + # select the candidate genes + candidate.table <- filter(eqtl.table, qtl_bp >= start, + qtl_bp <= end, + qtl_type == 'local', + qtl_chromosome == chr, + gene_bp >= start, + gene_bp <= end, + stage == dev.stage) + candidates <- as.character(candidate.table$trait) + candidate.expression <- expression[which(rownames(expression) %in% rownames(expression)), ] + + # combine expression matrix for targets, candidates, and genes underlying qtl + expression.m <- expression[which(rownames(expression) %in% c(targets, candidates)), ] + + # create gene table and determine the gene function + nodes <- gene.feature[which(gene.feature$trait %in% c(targets, candidates)), ] + nodes <- data.frame(id = c(targets, candidates), + role = c(rep('target', length(targets)), rep('candidates', length(candidates))), + is_tf = NA) + nodes[, 'is_tf'] <- gene.feature[match(nodes$id, gene.feature$trait), 'is_TF'] + regulators <- as.character(nodes$trait[which(nodes$role == 'candidates' | nodes$is_tf == 1)]) + + # network inference using GENIE3 #### + weight <- GENIE3(exprMatrix = expression.m, verbose = TRUE) + weight <- melt(weight) + weight$Var1 <- as.character(weight$Var1) + weight$Var2 <- as.character(weight$Var2) + names(weight) <- c('nodes1', 'nodes2', 'weight') + + # remove duplicates + + edges.genie3 <- as.data.frame(matrix(data = NA, nrow = 6670, ncol = 3)) + cur.row <- 1 + for(i in row.names(expression.m)) { + for(j in row.names(expression.m)) { + if(i == j) { + next + } + set1 <- weight[which(weight$nodes1 == i & weight$nodes2 == j), ] + set2 <- weight[which(weight$nodes2 == i & weight$nodes1 == j), ] + if(set1[, 3] >= set2[, 3]) { + edges.genie3[cur.row, ] <- set1 + cur.row <- cur.row + 1 + print(cur.row) + } + if(set1[, 3] <= set2[, 3]) { + edges.genie3[cur.row, ] <- set2 + cur.row <- cur.row + 1 + print(cur.row) + } + } + } + edges.genie3 <- distinct(edges.genie3) + names(edges.genie3) <- c('nodes1', 'nodes2', 'weight') + + # ranking + rank.genie3 <- edges.genie3 + rank.genie3$rank <- rank(-rank.genie3$weight) + rank.genie3 <- rank.genie3[, c(1, 2, 4)] + write.csv(x = rank.genie3, file = paste0('networks/', dev.stage, chr, '-genie.csv')) + + # network inference using pearson correlation #### + + edges.pearson <- rcorr(t(expression.m), type = 'pearson') + edges.pearson <- melt(edges.pearson$r) + names(edges.pearson) <- c('nodes1', 'nodes2', 'weight') + + # remove duplicates and determine direction + rank.pearson <- rank.genie3[, 1:2] + rank.pearson <- merge(rank.pearson, edges.pearson, by = c("nodes1", "nodes2")) + rank.pearson <- rank.pearson[, 1:3] + rank.pearson$weight <- abs(rank.pearson$weight) + + # ranking + rank.pearson$rank <- rank(-rank.pearson$weight) + rank.pearson <- rank.pearson[, c(1, 2, 4)] + names(rank.pearson) <- c('nodes1', 'nodes2', 'rank') + write.csv(x = rank.pearson, file = paste0('networks/', dev.stage, chr, '-pearson.csv')) + + + # network inference using spearman correlation #### + edges.spearman <- rcorr(t(expression.m), type = 'spearman') + edges.spearman <- melt(edges.spearman$r) + names(edges.spearman) <- c('nodes1', 'nodes2', 'weight') + + # remove duplicates and determine direction + rank.spearman <- rank.genie3[, 1:2] + rank.spearman <- merge(rank.spearman, edges.spearman, by = c("nodes1", "nodes2")) + rank.spearman <- rank.spearman[, 1:3] + rank.spearman$weight <- abs(rank.spearman$weight) + + # ranking + rank.spearman$rank <- rank(-rank.spearman$weight) + rank.spearman <- rank.spearman[, c(1, 2, 4)] + names(rank.spearman) <- c('nodes1', 'nodes2', 'rank') + write.csv(x = rank.spearman, file = paste0('networks/', dev.stage, chr, '-spearman.csv')) + + # network inference using CLR #### + edges.clr <- minet(dataset = t(expression.m), method = 'clr', estimator = 'mi.empirical', disc = 'equalfreq') + edges.clr <- melt(edges.clr) + names(edges.clr) <- c('nodes1', 'nodes2', 'weight') + + # ranking + rank.clr <- rank.genie3[, 1:2] + rank.clr <- merge(rank.clr, edges.clr, by = c("nodes1", "nodes2")) + rank.clr <- rank.clr[, 1:3] + + rank.clr$rank <- rank(-rank.clr$weight) + rank.clr <- rank.clr[, c(1, 2, 4)] + names(rank.clr) <- c('nodes1', 'nodes2', 'rank') + write.csv(x = rank.clr, file = paste0('networks/', dev.stage, chr, '-clr.csv')) + + # network inference using aracne #### + edges.aracne <- minet(dataset = t(expression.m), method = 'aracne', estimator = 'mi.empirical', disc = 'equalfreq') + edges.aracne <- melt(edges.aracne) + names(edges.aracne) <- c('nodes1', 'nodes2', 'weight') + + # ranking + rank.aracne <- rank.genie3[, 1:2] + rank.aracne <- merge(rank.aracne, edges.aracne, by = c("nodes1", "nodes2")) + rank.aracne <- rank.aracne[, 1:3] + + rank.aracne$rank <- rank(-rank.aracne$weight) + rank.aracne <- rank.aracne[, c(1, 2, 4)] + names(rank.aracne) <- c('nodes1', 'nodes2', 'rank') + write.csv(x = rank.aracne, file = paste0('networks/', dev.stage, chr, '-aracne.csv')) + + # network inference using mrnet #### + edges.mrnet <- minet(dataset = t(expression.m), method = 'mrnet', estimator = 'mi.empirical', disc = 'equalfreq') + edges.mrnet <- melt(edges.mrnet) + names(edges.mrnet) <- c('nodes1', 'nodes2', 'weight') + + # ranking + rank.mrnet <- rank.genie3[, 1:2] + rank.mrnet <- merge(rank.mrnet, edges.mrnet, by = c("nodes1", "nodes2")) + rank.mrnet <- rank.mrnet[, 1:3] + + rank.mrnet$rank <- rank(-rank.mrnet$weight) + rank.mrnet <- rank.mrnet[, c(1, 2, 4)] + names(rank.mrnet) <- c('nodes1', 'nodes2', 'rank') + write.csv(x = rank.mrnet, file = paste0('networks/', dev.stage, chr, '-mrnet.csv')) + + # network inference using tigress + weight.tigress <- tigress::tigress(t(expression.m), + nsplit = 1000, + nstepsLARS = 5, + allsteps = F) + weight.tigress <- melt(weight.tigress) + names(weight.tigress) <- c('nodes1', 'nodes2', 'weight') + weight.tigress$nodes1 <- as.character(weight.tigress$nodes1) + weight.tigress$nodes2 <- as.character(weight.tigress$nodes2) + + # remove duplicates + # combination <- nrow(expression.m) * (nrow(expression.m) - 1) /2 + # edges.tigress <- as.data.frame(matrix(data = NA, nrow = combination, ncol = 3)) + # cur.row <- 1 + # for(i in row.names(expression.m)) { + # for(j in row.names(expression.m)) { + # if(i == j) { + # next + # } + # set1 <- weight.tigress[which(weight.tigress$nodes1 == i & weight.tigress$nodes2 == j), ] + # set2 <- weight.tigress[which(weight.tigress$nodes2 == i & weight.tigress$nodes1 == j), ] + # if(set1[, 3] >= set2[, 3]) { + # edges.tigress[cur.row, ] <- set1 + # cur.row <- cur.row + 1 + # print(cur.row) + # } + # if(set1[, 3] <= set2[, 3]) { + # edges.genie3[cur.row, ] <- set2 + # cur.row <- cur.row + 1 + # print(cur.row) + # } + # } + # } + # edges.tigress <- distinct(edges.tigress) + # names(edges.tigress) <- c('nodes1', 'nodes2', 'weight') + + # ranking + rank.tigress <- rank.genie3[, 1:2] + rank.tigress <- merge(rank.tigress, weight.tigress, by = c("nodes1", "nodes2")) + rank.tigress <- rank.tigress[, 1:3] + + rank.tigress$rank <- rank(-rank.tigress$weight) + rank.tigress <- rank.tigress[, c(1, 2, 4)] + names(rank.tigress) <- c('nodes1', 'nodes2', 'rank') + write.csv(x = rank.tigress, file = paste0('networks/', dev.stage, chr, '-tigress.csv')) + + # merge the tables + + rank.list <- list(rank.genie3, rank.spearman, rank.clr, rank.aracne, rank.tigress) + #rank.list <- list(rank.genie3, rank.spearman, rank.pearson, rank.mrnet, rank.clr, rank.aracne, rank.tigress) + networks <- Reduce(function(x, y) full_join(x, y, by = c('nodes1', 'nodes2')), rank.list) + + names(networks) <- c('nodes1', 'nodes2', 'genie3', 'spearman', 'clr', 'aracne', 'tigress') + #names(networks) <- c('nodes1', 'nodes2', 'genie3', 'spearman', 'pearson', 'mrnet', 'clr', 'aracne', 'tigress') + networks$avg_rank <- NA + for(i in 1:nrow(networks)) { + networks$avg_rank[i] <- + mean(as.numeric(networks[i, 3:(ncol(networks) - 1)])) + } + + # PCA + pr.out <- prcomp(x = (t(networks[3:10])), center = T, scale. = F) + pc.df <- data.frame(pc1 = pr.out$x[, 1], # 0.37 + pc2 = pr.out$x[, 2]) # 0.32 + pc.df$method <- rownames(pc.df) + ggplot(pc.df, aes(x = pc1, y = pc2, color = method)) + geom_point(size = 3.5) + pc.sum <- summary(pr.out) + pc1 <- pc.sum$importance[2, 1] + pc2 <- pc.sum$importance[2, 2] + + # determine threshold + edges <- networks[, c('nodes1', 'nodes2', 'avg_rank')] + colnames(edges) <- c('source', 'target', 'interaction') + + edges <- na.omit(edges) + threshold.m <- as.data.frame(matrix(data = NA, ncol = 3, nrow = 0)) + names(threshold.m) <- c('threshold', "edge", "node") + cur.row <- 1 + for(i in max(round(edges$interaction, 0)):0){ + edges.tmp <- filter(edges, interaction <= i) + total.edge <- nrow(edges.tmp) + total.node <- length(unique(c(edges.tmp$source, edges.tmp$target))) + threshold.m[cur.row, ] <- c(i, total.edge, total.node) + cur.row <- cur.row + 1 + } + + # visualize the threshold matrix + plot(threshold.m$threshold, + threshold.m$node, + xlab = "rank threshold", + ylab = 'number of node') # 1552 for RP5 and 1248 for PD3 + + # visualize the network + edges.th <- filter(edges, interaction <= 288) + #edges$interaction <- log2(edges$interaction) + write.csv(edges, paste0('networks/', dev.stage, chr, '-edge.csv')) + write.csv(nodes, paste0('networks/', dev.stage, chr, '-node.csv')) + createNetworkFromDataFrames(nodes = nodes, edges = edges.th) + diff --git a/combined-stage-eqtl-analysis.R b/combined-stage-eqtl-analysis.R new file mode 100644 index 0000000..46c35ed --- /dev/null +++ b/combined-stage-eqtl-analysis.R @@ -0,0 +1,355 @@ +######################################## +##### combined-stage eQTL analysis ##### +######################################## + +#### prepare the script working environment #### + remove(list = ls()) + gc() + set.seed(1000) + + # Set working directory #### + work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl" + setwd(work.dir) + + # dependencies #### + library(dplyr) + library(ggplot2) + library(tidyr) + library(doParallel) + library(VennDiagram) + library(UpSetR) + library(forcats) + library(grid) + + library(heritability) + library(topGO) + library(org.At.tair.db) + # unused libraries #### + # library(Mfuzz) + # library(Biobase) + # library(gplots) + # library(RColorBrewer) + # library(limma) + # library(factoextra) + # library(stats) + # library(amap) + # + # + # + # library(gridExtra) + # library(RCy3) + # library(corrplot) + # library(reshape2) + # library(Hmisc) + # library(igraph) + # library(threejs) + # library(MASS) + # + + # load required function #### + setwd('functions/') + for(i in 1:length(dir())){ + source(dir()[i]) + } # read function from Mark + setwd(work.dir) + + overlap <- function(table, stage) { + for (i in 1:length(stage)) { + if (nrow(table) != 0) { + table <- subset(table, table[stage[i]] == T) + } else { + break + } + } + nrow(table) + } # function to determine the overlapping qtl between 2 stages + + # load the data #### + seed.stage <- c('pd', 'ar', 'im', 'rp') + + + # presentation + presentation <- theme(axis.text.x = element_text(size=10, face="bold", color="black"), + axis.text.y = element_text(size=10, face="bold", color="black"), + axis.title.x = element_text(size=12, face="bold", color="black"), + axis.title.y = element_text(size=12, face="bold", color="black"), + strip.text.x = element_text(size=12, face="bold", color="black"), + strip.text.y = element_text(size=12, face="bold", color="black"), + strip.text = element_text(size =12, face="bold", color="black"), + #plot.title = element_text(size=15, face="bold"), + panel.background = element_rect(fill = "white",color="black"), + panel.grid.major = element_line(colour = "grey80"), + panel.grid.minor = element_blank()) + +#### combined-stage eQTL mapping - reduced model #### + + stage <- 'rm' + qtl.data <- QTL.data.prep(trait.matrix = trait.matrix, + strain.trait = colnames(trait.matrix), + strain.map = genetic.map, + strain.marker = marker) + + cluster <- makeCluster(n.cores, type = "PSOCK") + registerDoParallel(cluster) + output <- foreach(i = 1:nrow(trait.matrix), .combine = 'cbind') %dopar% { + map.all.marker(trait = trait.matrix[i, ], markers = genetic.map) + } + stopCluster(cluster) + + pval.out <- t(output[, which(colnames(output) == 'LOD')]) + eff.out <- t(output[, which(colnames(output) == 'Eff')]) + + colnames(pval.out) <- rownames(marker); rownames(pval.out) <- rownames(trait.matrix) + colnames(eff.out) <- rownames(marker); rownames(eff.out) <- rownames(trait.matrix) + + qtl.profile <- NULL; qtl.profile <- as.list(qtl.profile) + qtl.profile[[1]] <- round(pval.out,digits=2) + qtl.profile[[2]] <- round(eff.out,digits=3) + qtl.profile[[3]] <- trait.matrix + qtl.profile[[4]] <- genetic.map + qtl.profile[[5]] <- marker + names(qtl.profile) <- c("LOD","Effect","Trait","Map","Marker") + + write.EleQTL(map1.output = qtl.profile, filename = paste0("qtl-tables/table_single-stage-eqtl_", 'rm')) + saveRDS(object = qtl.profile, + file = paste0("qtl-profiles/profile_combined-stage-eqtl_", stage, ".rds")) + + # eQTL peak finder - combined-stage rm #### + + stage <- 'rm' + + # peak finder + qtl.profile <- readRDS(paste0('qtl-profiles/profile_single-stage-eqtl_', stage, '.rds')) + qtl.peak <- mapping.to.list(map1.output = qtl.profile) %>% + peak.finder(threshold = 4.3) + qtl.peak <- na.omit(qtl.peak) + saveRDS(object = qtl.peak, + file = paste0("qtl-peaks/peak_combined-stage-eqtl_", stage, ".rds")) + + + # eQTL table- combined-stage rm #### + stage <- 'rm' + qtl.profile <- readRDS(paste0('qtl-profiles/profile_combined-stage-eqtl_', stage, '.rds')) + qtl.peak <- readRDS(paste0('qtl-peaks/peak_combined-stage-eqtl_', stage, '.rds')) + eqtl.table <- eQTL.table(peak.list.file = qtl.peak, trait.annotation = gene.info) %>% + eQTL.table.addR2(QTL.prep.file = qtl.data) + eqtl.table$qtl_chromosome <- as.factor(eqtl.table$qtl_chromosome) + eqtl.table$gene_chromosome <- as.factor(eqtl.table$gene_chromosome) + + # add heritability - combined-stage rm ##### + + h2.result <- as.list(NULL) + n.perm <- 100 + qtl.genes <- eqtl.table$trait.matrix + + map <- genetic.map + map <- apply(map, 2, as.numeric) # make sure the alleles are treated as numeric + trait <- trait.matrix + + map2 <- map + #map2[map2 == 0] <- 0.5 + map2[map2 == -1] <- round(0, 0) + #map2[is.na(map2)] <- 0.5 + kinship.matrix <- emma.kinship(map2) + colnames(kinship.matrix) <- colnames(map2); rownames(kinship.matrix) <- colnames(map2) + + cluster <- makeCluster(n.cores, type = 'PSOCK') + clusterExport(cl = cluster, c('trait', 'map2', 'marker_h2', 'kinship.matrix', 'h2.REML')) + h2 <- t(parApply(cl = cluster, X = trait, MARGIN = 1, FUN = h2.REML, strain.names = colnames(map2), kinship.matrix = kinship.matrix, Vg.factor = 1)) + stopCluster(cluster) + + h2 <- cbind.data.frame(trait = rownames(h2), h2) + eqtl.table <- merge(x = eqtl.table, y = h2[, 1:2], by = 'trait', all.x = T) + eqtl.table <- rename(eqtl.table, h2 = h2_REML) + + # trans bands identification - combined-stage rm ##### + window.nu <- 2e6 + maxsize <- 100e6 + chr.num <- 5 + + transband.id <- mutate(eqtl.table, interval = findInterval(qtl_bp, seq(1, maxsize, by = window.nu))) %>% + group_by(qtl_chromosome, interval, qtl_type) %>% + summarise(n.ct = length(unique(trait))) %>% + data.frame() %>% + group_by(qtl_type) %>% + mutate(exp.ct = mean(as.numeric(unlist(n.ct)))) %>% + data.frame() %>% + mutate(transband_significance = ppois(n.ct, lambda = exp.ct, lower.tail = F)) %>% + filter(transband_significance < 0.0001, qtl_type == "trans") + + transband.id$transband_id <- with(transband.id, paste0("ch", qtl_chromosome, ":", + (interval - 1) * 2, "-", + interval * 2, "Mb")) + transband.id$stage <- stage + + saveRDS(object = transband.id, + file = paste0("trans-bands/trans.bands_", stage, ".rds")) + + # for rm + if( stage == 'rm' ) { + eqtl.table <- mutate(eqtl.table, + trans_band = ifelse(qtl_type == "trans" & + qtl_chromosome == 5 & + qtl_bp > 6e6 & + qtl_bp <= 8e6, "ch5:6-8Mb", + ifelse(qtl_type == "trans" & + qtl_chromosome == 1 & + qtl_bp > 26e6 & + qtl_bp <= 28e6, "ch1:26-28Mb", + ifelse(qtl_type == "trans" & + qtl_chromosome == 2 & + qtl_bp > 10e6 & + qtl_bp <= 14e6, "ch2:10-14Mb", + ifelse(qtl_type == "trans" & + qtl_chromosome == 4 & + qtl_bp > 0e6 & + qtl_bp <= 2e6, "ch4:0-2Mb", + ifelse(qtl_type == "trans" & + qtl_chromosome == 5 & + qtl_bp > 14e6 & + qtl_bp <= 16e6, "ch5:14-16Mb","none")))))) + } + + print("finish") + + write.csv(x = eqtl.table, + file = paste0("qtl-tables/table_combined-stage-eqtl_", stage, ".csv"), + row.names = T) + saveRDS(object = eqtl.table, + file = paste0("qtl-tables/table_combined-stage-eqtl_", stage, ".rds")) + table(eqtl.table$qtl_type, eqtl.table$trans_band != "none") + + # GO for trans bands - combined-stage rm #### + + ontology <- 'BP' + + x <- org.At.tairCHR + all.genes <- as.list(rownames(trait.matrix)) + transband.go <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 9)) + colnames(transband.go) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher', + 'FDR', 'stage', 'transband') + + eqtl.table <- readRDS(paste0('qtl-tables/table_combined-stage-eqtl_', stage, '.rds')) + transband.id <- unique(eqtl.table$trans_band) + transband.id <- transband.id[!transband.id %in% 'none'] + transband.go.perstage <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 9)) + colnames(transband.go.perstage) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher', + 'FDR', 'stage', 'transband') + + for (i in 1:length(transband.id)) { + gene.set <- eqtl.table[which(eqtl.table$trans_band == transband.id[i]), 'trait'] + gene.set <- factor(as.integer(all.genes %in% gene.set)) + names(gene.set) <- all.genes + GOdata <- new("topGOdata", + description = "GOE for genes in regulated by trans bands", ontology = ontology, + allGenes = gene.set, + annot = annFUN.org,mapping= "org.At.tair.db") + resultFisher <- runTest(GOdata, algorithm = 'weight', statistic = "fisher") + result.df <- GenTable(GOdata, Fisher = resultFisher, + orderBy = "Fisher", ranksOf = "Fisher", topNodes = length(resultFisher@score)) + result.df$stage <- stage + result.df$transband <- transband.id[i] + result.df$Fisher <- as.numeric(result.df$Fisher) + result.df$FDR <- p.adjust(p = result.df$Fisher, method = 'fdr') + result.df <- result.df[order(result.df$FDR), ] + result.df <- dplyr::filter(result.df, Fisher <= 0.01) + transband.go <- rbind(transband.go, result.df) + } + + write.csv(transband.go, paste0('files/trans-bands-rm-go', ontology, '.csv')) + + # prepare eqtl table and plotting - combined-stage rm #### + + eqtl.table <- eqtl.table %>% + mutate(qtl_type = ifelse(qtl_type == 'cis', 'local', 'distant')) + + # cis-trans plot - combined-stage rm #### + + eqtl.plot <- ggplot(eqtl.table, aes(x = qtl_bp, y = gene_bp)) + + geom_segment(aes(x = qtl_bp_left, y = gene_bp, xend = qtl_bp_right, yend = gene_bp), + alpha = 0.25, colour = "grey") + + geom_point(aes(colour = ordered(qtl_type, levels = c('local', 'distant'))), alpha = .75) + + facet_grid(fct_rev(gene_chromosome) ~ qtl_chromosome, space = "free", scales = "free") + + presentation + + scale_colour_manual('eQTL type', values = c('black', '#377eb8')) + + labs(x = "eQTL peak position (Mb)", y = "Gene position (Mb)") + + scale_x_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40)) + + scale_y_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40)) + + pdf(file = paste0("figures/cis-trans-plot-rm.pdf"), width = 10, height = 8) + eqtl.plot + dev.off() + + # histogram - combined-stage rm #### + + eqtl.hist <- ggplot(eqtl.table %>% filter(qtl_type != 'local'), + aes(x=qtl_bp)) + # 750 x 400 + geom_histogram(binwidth = 2000000, alpha = 0.8, fill = '#377EB8') + + facet_grid(. ~ qtl_chromosome, space = "free",scales="free") + + presentation + + scale_fill_manual(values = 'blue') + + theme(legend.position = "right", legend.text=element_text(size=10)) + + labs(x="eQTL peak position (Mb)",y="eQTL counts") + + ylim(c(0, 80)) + + geom_hline(aes(yintercept = 11.72), + linetype = 'dashed', + color = 'red', + size = .1) + + scale_x_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40)) + + pdf(file = "figures/trans-eqtl-histogram-rm.pdf", width = 10, height = 5) + eqtl.hist + dev.off() + + # R2 vs heritability - combined-stage rm #### + ggplot(eqtl.table, aes(x = qtl_R2_sm, + fill = factor(qtl_type, levels = c('local', 'distant')), + color = factor(qtl_type, levels = c('local', 'distant')))) + + geom_histogram(aes(y = ..density..), binwidth = 0.01, position = 'identity', alpha = 0) + + geom_density(alpha = 0.5, size = 0) + + geom_vline(aes(xintercept = median(qtl_R2_sm), + color = factor(qtl_type, levels = c('local', 'distant'))), + linetype = 'dashed', size = .5) + + presentation + + scale_color_brewer(palette = 'Set1') + + scale_fill_brewer(palette = 'Set1') + + xlab('explained phenotypic variance (R2)') + + ylab('count of eQTL') + + labs(fill = 'eQTL type', color = 'eQTL type') + + #ylim(0, 1) + + xlim(0, 1) + + theme(strip.text.x = element_text(size = 8)) + + ggplot(eqtl.table, aes(x = h2, y = qtl_R2_sm, + fill = factor(qtl_type, levels = c('local', 'distant')), + color = factor(qtl_type, levels = c('local', 'distant')))) + + geom_point(size = 1, alpha = 0.5) + + geom_vline(data = group_by(eqtl.table, qtl_type) %>% + summarise(med = median(h2, na.rm = T)), + aes(xintercept = med, color = factor(qtl_type, levels = c('local', 'distant'))), + linetype = 'dashed', size = .5) + + geom_hline(data = group_by(eqtl.table, qtl_type) %>% + summarise(med = median(qtl_R2_sm, na.rm = T)), + aes(yintercept = med, color = factor(qtl_type, levels = c('local', 'distant'))), + linetype = 'dashed', size = .5) + + geom_smooth(model = lm) + + geom_abline(aes(slope = 1, intercept = 0), size = .75, linetype = 'dashed') + + presentation + + scale_color_brewer(palette = 'Set1') + + scale_fill_brewer(palette = 'Set1') + + xlab('narrow-sense heritabllity (h2)') + + ylab('explained phenotypic variance (R2)') + + labs(fill = 'eQTL type', color = 'eQTL type') + + ylim(0, 1) + + xlim(0, 1) + + theme(strip.text.x = element_text(size = 8)) + + + + + + + + + + \ No newline at end of file diff --git a/goe-app.R b/goe-app.R new file mode 100644 index 0000000..f97bd04 --- /dev/null +++ b/goe-app.R @@ -0,0 +1,110 @@ +# +# This is a Shiny web application. You can run the application by clicking +# the 'Run App' button above. +# +# Find out more about building applications with Shiny here: +# +# http://shiny.rstudio.com/ +# + +remove(list = ls()) +gc() + +# setting directory +work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl/" +setwd(work.dir) + +# load library +library(shiny) +library(topGO) +library(org.At.tair.db) +library(ggplot2) + +# load variables for goe test +x <- org.At.tairCHR +all.genes <- as.list(as.character(read.csv('files/gene-list.csv', header = F)[, 1])) +hc.go.list <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 8)) +colnames(hc.go.list) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher', + 'FDR') + + +# Define UI for application that draws a histogram +ui <- fluidPage( + + # Application title + titlePanel("Gene ontology enrichment test for Joosen data"), + + # Sidebar with a slider input for number of bins + sidebarLayout( + sidebarPanel( + textAreaInput(inputId = 'gene.list', + label = 'Put your list of gene here!', + width = '100%', + height = '400px', + value = '', + placeholder = "AT4G22890\nAT2G34630\nAT2G03270\nAT2G25590\nAT5G25130", + actionButton('button', label = 'Submit!') + ), + selectInput(inputId = 'go.term', + label = 'Select the GO term:', + choices = c('biological process', 'molecular function', 'cellular component'), + selected = 'biological process' + ), + actionButton(inputId = 'submit', label = 'Submit') + ), + + + # Show a plot of the generated distribution + mainPanel( + downloadButton(outputId = 'download', 'Download'), + DT::dataTableOutput("go.result") + ) + ) +) + +# Define server logic required to draw a histogram +server <- function(input, output) { + + reactive.gene.list <- eventReactive(input$submit, { + input$gene.list + }) + + # creating data table + data_table <- reactive({ + gene.list <- reactive.gene.list() + gene.set <- unlist(strsplit(gene.list, '\n')) + gene.set <- factor(as.integer(all.genes %in% gene.set)) + ontology <- ifelse(input$go.term == 'biological process', 'BP', + ifelse(input$go.term == 'molecular function', 'MF', 'CC')) + + names(gene.set) <- all.genes + GOdata <- new("topGOdata", + description = "Analyzing clustering results", + ontology = ontology, + allGenes = gene.set, + annot = annFUN.org,mapping= "org.At.tair.db") + resultFisher <- runTest(GOdata, algorithm = 'weight', statistic = "fisher") + result.df <- GenTable(GOdata, pval = resultFisher, + orderBy = "Fisher", ranksOf = "Fisher", topNodes = length(resultFisher@score)) + result.df$FDR <- round(p.adjust(p = result.df$pval, method = 'fdr'), 4) + result.df$pval <- round(as.numeric(result.df$pval), 4) + result.df <- result.df[order(result.df$FDR), ] + #result.df <- filter(result.df, FDR <= 0.001) + #hc.go.list <- rbind(hc.go.list, result.df) + result.df + }) + + output$go.result <- DT::renderDataTable({ + data_table() + }) + + output$download <- downloadHandler( + filename = paste0(input$go.term, ".csv"), + content = function(file) { + write.csv(data_table(), file, row.names = FALSE) + } + ) +} + +# Run the application +shinyApp(ui = ui, server = server) diff --git a/seed-germination-qtl.Rproj b/seed-germination-qtl.Rproj new file mode 100644 index 0000000..3af27f6 --- /dev/null +++ b/seed-germination-qtl.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/ssh-key b/ssh-key new file mode 100644 index 0000000..b142cbb --- /dev/null +++ b/ssh-key @@ -0,0 +1,7 @@ +-----BEGIN OPENSSH PRIVATE KEY----- +b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW +QyNTUxOQAAACBUWrInKzWUdNlKTr6empD/zBf3+RzF8PLM5MoR+wlqpQAAAJhXXXI6V11y +OgAAAAtzc2gtZWQyNTUxOQAAACBUWrInKzWUdNlKTr6empD/zBf3+RzF8PLM5MoR+wlqpQ +AAAEBqhGRCpBqPriAmbs/KkgZ53oSR9HHbjc4wOZC2NvAkrlRasicrNZR02UpOvp6akP/M +F/f5HMXw8szkyhH7CWqlAAAAFW1hcmdpLmhhcnRhbnRvQHd1ci5ubA== +-----END OPENSSH PRIVATE KEY----- diff --git a/ssh-key.pub b/ssh-key.pub new file mode 100644 index 0000000..fd6cec6 --- /dev/null +++ b/ssh-key.pub @@ -0,0 +1 @@ +ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFRasicrNZR02UpOvp6akP/MF/f5HMXw8szkyhH7CWql margi.hartanto@wur.nl -- GitLab