Skip to content
Snippets Groups Projects
Commit f2034b8f authored by Hartanto, Margi's avatar Hartanto, Margi
Browse files

Initial commit

parent 19b6e645
No related branches found
No related tags found
No related merge requests found
.Rproj.user
.Rhistory
.RData
.Ruserdata
figures
files
functions
networks
qtl-peaks
qtl-permutations
qtl-tables
trans-bands
qtl-profiles
.Rproj.user
.Rhistory
.RData
.Ruserdata
figures
files
functions
networks
qtl-peaks
qtl-permutations
qtl-tables
trans-bands
qtl-profiles
###############################################
##### seed transcript clustering analysis #####
###############################################
#### prepare the script working environment ####
remove(list = ls())
gc()
set.seed(1000)
# set working directory ####
work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl"
setwd(work.dir)
# dependencies ####
library(dplyr)
library(ggplot2)
library(limma)
library(Biobase)
library(stats)
library(gplots)
library(RColorBrewer)
library(Rfast)
library(amap)
library(topGO)
library(org.At.tair.db)
# unused libraries ####
# library(Mfuzz)
# library(factoextra)
# library(doParallel)
# library(forcats)
# library(tidyr)
# library(gridExtra)
# library(Hmisc)
# load the data ####
trait.matrix <- read.csv(file = 'files/trait-matrix.csv', row.names = 1)
sample.list <- read.csv(file = 'files/sample-list.csv')
sample.stage <- sample.list$stage
# data presentation ####
presentation <- theme(axis.text.x = element_text(size=6, face="bold", color="black"),
axis.text.y = element_text(size=6, face="bold", color="black"),
axis.title.x = element_text(size=7, face="bold", color="black"),
axis.title.y = element_text(size=7, face="bold", color="black"),
strip.text.x = element_text(size=7, face="bold", color="black"),
strip.text.y = element_text(size=7, face="bold", color="black"),
strip.text = element_text(size =7, face="bold", color="black"),
#plot.title = element_text(size=15, face="bold"),
panel.background = element_rect(fill = "white",color="black"),
panel.grid.major = element_line(colour = "grey80"),
panel.grid.minor = element_blank())
# create the expression object ####
x <- trait.matrix # the expression matrix
genes <- data.frame(trait = rownames(x)) # the gene list
f <- data.frame(trait = rownames(x)) # the gene feature
rownames(f) <- rownames(x)
p <- data.frame(id = colnames(trait.matrix), stage = sample.stage)
rownames(p) <- colnames(x)
eset <- ExpressionSet(assayData = as.matrix(x),
phenoData = AnnotatedDataFrame(p),
featureData = AnnotatedDataFrame(f))
# PCA ####
pr.out <- prcomp(x = (t(trait.matrix)), center = T, scale. = F)
pc.df <- data.frame(pc1 = pr.out$x[, 1],
pc2 = pr.out$x[, 2],
stage = sample.stage,
population = c(rep('parent', 16), rep('RIL', 164)))
pca.all <- ggplot(pc.df, aes(x = pc1, y = pc2, color = factor(stage, level = c('pd', 'ar', 'im', 'rp')))) +
geom_point(aes(shape = population)) +
scale_colour_manual(values = c('#ccbb44', '#228833', '#4477aa', '#cc3311')) +
scale_shape_manual(values = c(17, 19)) +
labs(x = "PC1 (55.56%)", y = "PC2 (14.82%)") +
labs(colour = 'stage') +
theme(text = element_text(size = 10),
panel.background = element_blank(),
panel.border=element_rect(fill=NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background=element_blank(),
axis.text.x=element_text(colour="black"),
axis.text.y=element_text(colour="black"),
axis.ticks=element_line(colour="black"),
plot.margin=unit(c(1,1,1,1),"line")) +
#presentation +
geom_hline(aes(yintercept = 0), linetype = 'dashed', size = .1) +
geom_vline(aes(xintercept = 0), linetype = 'dashed', size = .1)
tiff(file = paste0("figures/pca-all.tiff"),
width = 2250,
height = 1200,
units = 'px',
res = 300,
compression = 'lzw')
pca.all
dev.off()
# differential expresed gene analysis and hierarchiecal clustering ####
group <- with (pData(eset), stage)
group <- factor(group)
design <- model.matrix(~ 0 + group)
colnames(design) <- levels(group)
colSums(design)
# create pairwise contrast matrix among four different stages
cm <- makeContrasts(pd_ar = pd - ar,
ar_im = ar - im,
im_rp = im - rp,
levels = design)
coef <- colnames(cm)
# fit the linear model to determine significant differentially expressed genes
# for each pairwise contrast
fit <- lmFit(eset, design)
fit2 <- contrasts.fit(fit, contrasts = cm) %>%
eBayes()
result <- decideTests(fit2)
summary(result) # 1 = upregulate
write.csv(result, 'files/differentally-expressed-genes.csv')
# determine the differentially expressed genes for each pairwise contrast
# based on p value (>0.05) and log fold change more than 1 sorted by fold change
de.genes <- NA
coef2 <-
for (i in 1:length(coef)) {
print(i)
deg.tmp <- topTable(fit2, lfc = 1, coef = coef[i],
p.value = 0.05,
sort.by = 'logFC',
number = 100000)
de.genes <- c(de.genes, as.character(deg.tmp$trait))
}
de.genes <- unique(de.genes) # as much as 990 genes are differentially expressed between one of the contrasts
cluster.data <- trait.matrix[which(rownames(trait.matrix) %in% de.genes), ]
colnames(cluster.data) <- sample.stage
# hierarchiecal tree clustering done for stage and genes ####
# hc for stage
dist.matrix.stage <- Dist(t(cluster.data),
method = 'pearson',
upper = T,
diag = T)
hc.stage <- hclust(dist.matrix.stage, method = 'ward.D2')
#hc.stage$order <- c(hc.stage$order[46:180], hc.stage$order[1:45])
reorder(x = as.dendrogram(hc.stage), c(46:180, 1:45))
plot(reorder(x = as.dendrogram(hc.stage), 180:1, agglo.FUN = mean))
# hc for gene
dist.matrix.gene <- Dist(cluster.data,
method = 'pearson', # to group the genes based on patterns similarity across stages
upper = T,
diag = T)
hc.gene <- hclust(dist.matrix.gene, method = 'ward.D2') # similar to average linkage-
# it calculates the distance that is minimizing the variance within cluster
# and maximazing the variance between clusters
plot(hc.gene)
hc.gene <- cutree(hc.gene, k =6) # if k > 5, the cluster will be diproporsional i.e. a cluster only have few member
table(hc.gene)
hc.table <- as.data.frame(table(true = rownames(cluster.data), cluster = hc.gene))
hc.table <- hc.table[which(hc.table$Freq != 0), ]
hc.table$cluster <- as.numeric(hc.table$cluster)
hc.table$true <- as.character(hc.table$true)
hc.table$Freq <- NULL
table(hc.table$cluster)
write.csv(hc.table, 'files/gene-cluster.csv')
# heatmap and hierarchiecal clustering ####
sample.color <- ifelse(grepl('pd', colnames(trait.matrix)), '#4daf4a',
ifelse(grepl('ar', colnames(trait.matrix)), '#377eb8',
ifelse(grepl('im', colnames(trait.matrix)), '#984ea3',
ifelse(grepl('rp', colnames(trait.matrix)), '#e41a1c', NA))))
tiff(file = paste0("figures/heatmap-genes.tiff"),
width = 2250,
height = 2000,
units = 'px',
res = 300,
compression = 'lzw')
heatmap.2(x = as.matrix(cluster.data), cexRow = 0.8, cexCol = 1.2,
distfun = function(x) Dist(x, method = 'pearson'),
hclust = function(x) hclust(x, method = 'ward.D2'),
scale = 'row',
density.info = "density",
trace = 'none',
col = brewer.pal(9, 'YlGnBu'),
keysize = 1,
key.title = 'Z-score of\nlog-intensities',
key.xlab = NA,
Colv = reorder(x = as.dendrogram(hc.stage), 180:1, agglo.FUN = mean),
ColSideColors = sample.color)
dev.off()
# GOE analysis for genes in each cluster ####
x <- org.At.tairCHR
all.genes <- as.list(rownames(trait.matrix))
hc.go.list <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 8))
colnames(hc.go.list) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher',
'FDR', 'cluster')
ontology <- 'BP'
for (i in unique(hc.table$cluster)) {
gene.set <- hc.table$true[which(hc.table$cluster == i)]
gene.set <- factor(as.integer(all.genes %in% gene.set))
names(gene.set) <- all.genes
GOdata <- new("topGOdata",
description = "Analyzing clustering results", ontology = ontology,
allGenes = gene.set,
annot = annFUN.org,mapping= "org.At.tair.db")
resultFisher <- runTest(GOdata, algorithm = 'weight', statistic = "fisher")
# GOE from topgo consider the general terms on the hierarchy
# the weight algortihm maintain the balance between type I and II errors
result.df <- GenTable(GOdata, Fisher = resultFisher,
orderBy = "Fisher", ranksOf = "Fisher", topNodes = length(resultFisher@score))
result.df$FDR <- p.adjust(p = result.df$Fisher, method = 'fdr')
result.df$cluster <- i
result.df <- result.df[order(result.df$FDR), ]
result.df <- filter(result.df, FDR <= 0.001)
hc.go.list <- rbind(hc.go.list, result.df)
}
write.csv(hc.go.list, paste0('files/goe-tables', ontology, '.csv'))
\ No newline at end of file
##############################
##### seed eQTL analysis #####
##############################
#### prepare the script working environment ####
remove(list = ls())
gc()
set.seed(1000)
# Set working directory ####
work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl"
setwd(work.dir)
# dependencies ####
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
# BiocManager::install("Mfuzz")
# BiocManager::install("topGO")
# BiocManager::install("org.At.tair.db")
# BiocManager::install("Rgraphviz")
# BiocManager::install("org.At.eg.db")
library(doParallel)
library(dplyr)
library(ggplot2)
library(heritability)
library(topGO)
library(org.At.tair.db)
# unused libraries ####
# library(Mfuzz)
# library(Biobase)
# library(gplots)
# library(RColorBrewer)
# library(limma)
# library(factoextra)
# library(stats)
# library(amap)
# library(forcats)
# library(tidyr)
# library(VennDiagram)
# library(gridExtra)
# library(RCy3)
# library(corrplot)
# library(reshape2)
# library(Hmisc)
# library(igraph)
# library(threejs)
# library(MASS)
# library(UpSetR)
# load required function ####
setwd('functions/')
for(i in 1:length(dir())){
source(dir()[i])
} # read function from Mark
setwd(work.dir)
write.EleQTL <- function(map1.output,filename){
selector <- cbind(trait = rownames(map1.output$LOD), pval = apply(map1.output$LOD,1,max,na.rm=T)) %>%
data.frame()
rownames(selector) <- NULL
lod <- map1.output$LOD
lod <- lod[rownames(lod) %in% selector[,1],]
rownames(lod) <- selector$trait
colnames(lod) <- map1.output$Marker[,1]
eff <- map1.output$Effect
eff <- eff[rownames(eff) %in% selector[,1],]
rownames(eff) <- selector$trait
colnames(eff) <- map1.output$Marker[,1]
lod.eff <- lod*sign(eff)
dat <- map1.output$Trait
dat <- dat[rownames(dat) %in% selector[,1],]
rownames(dat) <- selector$trait
colnames(dat) <- colnames(map1.output$Map)
map <- map1.output$Map
rownames(map) <- map1.output$Marker[,1]
marker <- map1.output$Marker
write.table(lod,file=paste(filename,"_lod.txt",sep=""),sep="\t",quote=F)
write.table(eff,file=paste(filename,"_eff.txt",sep=""),sep="\t",quote=F)
write.table(lod.eff,file=paste(filename,"_lodxeff.txt",sep=""),sep="\t",quote=F)
write.table(marker,file=paste(filename,"_marker.txt",sep=""),sep="\t",quote=F)
write.table(dat,file=paste(filename,"_data.txt",sep=""),sep="\t",quote=F)
write.table(map,file=paste(filename,"_map.txt",sep=""),sep="\t",quote=F)
} # function to convert mapping result to tables
map.per.marker <- function(trait, marker) {
model <- lm(terms(trait ~ marker, keep.order = FALSE))
summ <- summary(model)
pval <- summ$coefficients[2, 4]
lod <- -log10(pval)
eff <- summ$coefficients[2, 1]
output <- c(lod, eff)
return(output)
} # map the QTL at a marker location
map.all.marker <- function(trait, markers) {
eff.out <- rep(NA, nrow(markers))
pval.out <- rep(NA, nrow(markers))
for (i in 1:nrow(markers)) {
if(i == 1) {
out.tmp <- map.per.marker(trait, markers[i, ])
}
if( i != 1 & sum(abs(as.numeric(markers[i-1,]) - as.numeric(markers[i,])), na.rm = T) != 0 ) {
out.tmp <- map.per.marker(trait, markers[i, ])
}
if( i != 1 & sum(abs(as.numeric(markers[i-1,]) - as.numeric(markers[i,])), na.rm = T) == 0 ) {
out.tmp <- out.tmp
}
pval.out[i] <- out.tmp[1]
eff.out[i] <- out.tmp[2]
output.lod <- cbind(pval.out, eff.out)
colnames(output.lod) <- c('LOD', 'Eff')
}
return(output.lod)
} # map the QTL using genome wide markers
threshold.determination <- function(trait, strain.map, n.perm){
traits <- t(replicate(n.perm, trait))
perm.trait <- permutate.traits(traits)
###Check for NAs
pval.out <- matrix(NA,nrow(perm.trait),nrow(strain.map))
for (i in 1:nrow(perm.trait)) {
pval.out[i, ] <- fast.lod.all.marker(perm.trait[i, ], strain.map)
}
pval.distribution <- apply(pval.out, 1, max)
threshold <- quantile(x = pval.distribution, probs = 0.95)
return(threshold)
} # determine threshold for a QTL
# load required dataset ####
trait.matrix <- as.matrix(read.csv(file = 'files/trait-matrix.csv', row.names = 1))
genetic.map <- as.matrix(read.csv(file = 'files/genetic-map.csv'))
trait.matrix <- trait.matrix[, colnames(trait.matrix) %in% colnames(genetic.map)] # remove sample without genetic map
trait.matrix <- trait.matrix[, 17:176] #remove parent sample
genetic.map <- genetic.map[, 17:176] #remove parent sample
marker <- read.csv('files/marker.csv', row.names = 1)
sample.list <- read.csv(file = 'files/sample-list.csv')
sample.stage <- sample.list$stage
gene.info <- read.csv('files/gene.info.csv', row.names = 1)
ril.stage <- substr(x = colnames(trait.matrix), start = 8, stop = 10)
stage <- 'pd'
n.cores <- detectCores() - 1
# QTL mapping ####
for (stage in seed.stage) {
map <- genetic.map[, which(ril.stage == stage)]
map <- apply(map, 2, as.numeric) # make sure the alleles are treated as numeric
trait <- trait.matrix[, which(ril.stage == stage)]
qtl.data <- QTL.data.prep(trait.matrix = trait,
strain.trait = colnames(trait),
strain.map = map,
strain.marker = marker)
cluster <- makeCluster(n.cores, type = "PSOCK")
registerDoParallel(cluster)
output <- foreach(i = 1:nrow(trait), .combine = 'cbind') %dopar% {
map.all.marker(trait = trait[i, ], markers = map)
}
stopCluster(cluster)
pval.out <- t(output[, which(colnames(output) == 'LOD')])
eff.out <- t(output[, which(colnames(output) == 'Eff')])
colnames(pval.out) <- rownames(marker); rownames(pval.out) <- rownames(trait)
colnames(eff.out) <- rownames(marker); rownames(eff.out) <- rownames(trait)
qtl.profile <- NULL; qtl.profile <- as.list(qtl.profile)
qtl.profile[[1]] <- round(pval.out,digits=2)
qtl.profile[[2]] <- round(eff.out,digits=3)
qtl.profile[[3]] <- trait
qtl.profile[[4]] <- map
qtl.profile[[5]] <- marker
names(qtl.profile) <- c("LOD","Effect","Trait","Map","Marker")
write.EleQTL(map1.output = qtl.profile, filename = paste0("qtl-tables/table_single-stage-eqtl_", stage))
#saveRDS(object = qtl.profile,
#file = paste0("qtl-profiles/profile_single-stage-eqtl_", stage,".rds"))
}
# permutation and FDR determination
## based on Benjamini-Yekutieli
# setwd(dir = "qtl-permutations/")
# cluster <- makeCluster(n.cores, type = "PSOCK")
# registerDoParallel(cluster)
# foreach(i = 1:100) %dopar% {
# qtl.perm <- map.1.perm(trait.matrix = trait,
# strain.map = map,
# strain.marker = marker,
# n.perm = 1)
# save(qtl.perm, file = paste0("perm_single-stage-eqtl_", stage, i, ".RData"))
# }
# stopCluster(cluster)
#
# filenames.perm <- dir()
# filenames.perm <- filenames.perm[grep(paste("perm_single-stage-eqtl", stage, sep = "."), filenames.perm)]
#
# FDR <- map.perm.fdr(map1.output = qtl.profile,
# filenames.perm = filenames.perm,
# FDR_dir = paste0(getwd(), "/"),
# q.value = 0.05)
# saveRDS(FDR, file = paste0('fdr_single-stage-eqtl_', stage, '.RDS'))
#
# setwd(work.dir)
#
# eQTL peak finder and table ####
for (stage in seed.stage) {
# threshold
threshold <- ifelse(stage == 'pd' | stage == 'ar', 4.2, ifelse(stage == 'im', 4.1, ifelse(stage =='rp', 4.3, NA)))
# the thresholds are based on multiple-testing correction using 100 permuted datasets
qtl.profile <- readRDS(paste0('qtl-profiles/profile_single-stage-eqtl_', stage, '.rds'))
qtl.peak <- mapping.to.list(map1.output = qtl.profile) %>%
peak.finder(threshold = threshold)
qtl.peak <- na.omit(qtl.peak)
saveRDS(object = qtl.peak,
file = paste0("qtl-peaks/peak_single-stage-eqtl_", stage, ".rds"))
# eQTL table ####
qtl.profile <- readRDS(paste0('qtl-profiles/profile_single-stage-eqtl_', stage, '.rds'))
qtl.peak <- readRDS(paste0('qtl-peaks/peak_single-stage-eqtl_', stage, '.rds'))
eqtl.table <- eQTL.table(peak.list.file = qtl.peak, trait.annotation = gene.info) %>%
eQTL.table.addR2(QTL.prep.file = qtl.profile)
eqtl.table$qtl_chromosome <- as.factor(eqtl.table$qtl_chromosome)
eqtl.table$gene_chromosome <- as.factor(eqtl.table$gene_chromosome)
# add heritability - single stage ####
h2.result <- as.list(NULL)
n.perm <- 100
qtl.genes <- eqtl.table$trait
map <- genetic.map[, which(ril.stage == stage)]
map <- apply(map, 2, as.numeric) # make sure the alleles are treated as numeric
trait <- trait.matrix[, which(ril.stage == stage)]
map2 <- map
#map2[map2 == 0] <- 0.5
map2[map2 == -1] <- round(0, 0)
#map2[is.na(map2)] <- 0.5
kinship.matrix <- emma.kinship(map2)
colnames(kinship.matrix) <- colnames(map2); rownames(kinship.matrix) <- colnames(map2)
cluster <- makeCluster(n.cores, type = 'PSOCK')
clusterExport(cl = cluster, c('trait', 'map2', 'marker_h2', 'kinship.matrix', 'h2.REML'))
h2 <- t(parApply(cl = cluster, X = trait, MARGIN = 1, FUN = h2.REML, strain.names = colnames(map2), kinship.matrix = kinship.matrix, Vg.factor = 1))
stopCluster(cluster)
h2 <- cbind.data.frame(trait = rownames(h2), h2)
eqtl.table <- merge(x = eqtl.table, y = h2[, 1:2], by = 'trait', all.x = T)
#eqtl.table <- rename(.data = eqtl.table, h2_REML = h2)
###permutation
# print(paste0('permuting', " ", stage))
#
# cluster <- makeCluster(n.cores, type = "PSOCK")
# registerDoParallel(cluster)
# perm.output <- foreach(i = 1:n.perm, .combine = 'cbind',
# .export = c('trait.tmp', 'map.tmp', 'marker_h2', 'kinship.matrix', 'h2.REML')) %dopar% {
# perm.trait <- t(apply(trait.tmp, 1, function(x){x <- x[order(runif(length(x)))];return(x)}))
# t(apply(perm.trait, 1, h2.REML, strain.names = colnames(map.tmp2), kinship.matrix = kinship.matrix, Vg.factor = 1))[, 1]
# }
# stopCluster(cluster)
# perm.result <- cbind(h2, FDR0.05_REML = apply(perm.output, 1, quantile, 0.95))
#
# h2.result[[stage]] <- perm.result
#
#
# for (stage in development.stage) {
# h2.result[[stage]] <- cbind.data.frame(h2.result[[stage]], trait = rownames(h2.result[[stage]]))
# }
# trans-bands identification ####
window.nu <- 2e6
maxsize <- 100e6
chr.num <- 5
transband.id <- mutate(eqtl.table, interval = findInterval(qtl_bp, seq(1, maxsize, by = window.nu))) %>%
group_by(qtl_chromosome, interval, qtl_type) %>%
summarise(n.ct = length(unique(trait))) %>%
data.frame() %>%
group_by(qtl_type) %>%
mutate(exp.ct = mean(as.numeric(unlist(n.ct)))) %>%
data.frame() %>%
mutate(transband_significance = ppois(n.ct, lambda = exp.ct, lower.tail = F)) %>%
filter(transband_significance < 0.0001, qtl_type == "trans")
transband.id$transband_id <- with(transband.id, paste0("ch", qtl_chromosome, ":",
(interval - 1) * 2, "-",
interval * 2, "Mb"))
transband.id$stage <- stage
saveRDS(object = transband.id,
file = paste0("trans-bands/trans.band_", stage, ".rds"))
# for pd
if(stage == 'pd') {
eqtl.table <- mutate(eqtl.table,
trans_band = ifelse(qtl_type == "trans" &
qtl_chromosome == 1 &
qtl_bp > 6e6 &
qtl_bp <= 10e6, "ch1:6-10Mb",
ifelse(qtl_type == "trans" &
qtl_chromosome == 3 &
qtl_bp > 8e6 &
qtl_bp <= 12e6, "ch3:8-12Mb", "none")))
}
# for ar
if( stage == 'ar') {
eqtl.table <- mutate(eqtl.table,
trans_band = ifelse(qtl_type == "trans" &
qtl_chromosome == 2 &
qtl_bp > 12e6 &
qtl_bp <= 14e6, "ch2:12-14Mb",
ifelse(qtl_type == "trans" &
qtl_chromosome == 3 &
qtl_bp > 2e6 &
qtl_bp <= 4e6, "ch3:2-4Mb", "none")))
}
# for im
if( stage == 'im' ) {
eqtl.table <- mutate(eqtl.table,
trans_band = ifelse(qtl_type == "trans" &
qtl_chromosome == 5 &
qtl_bp > 6e6 &
qtl_bp <= 8e6, "ch5:6-8Mb",
ifelse(qtl_type == "trans" &
qtl_chromosome == 5 &
qtl_bp > 22e6 &
qtl_bp <= 26e6, "ch5:22-26Mb","none")))
}
# for rp
if(stage == 'rp') {
eqtl.table <- mutate(eqtl.table,
trans_band = ifelse(qtl_type == "trans" &
qtl_chromosome == 1 &
qtl_bp > 0e6 &
qtl_bp <= 2e6, "ch1:0-2Mb ",
ifelse(qtl_type == "trans" &
qtl_chromosome == 1 &
qtl_bp > 6e6 &
qtl_bp <= 8e6, "ch1:6-8Mb",
ifelse(qtl_type == "trans" &
qtl_chromosome == 5 &
qtl_bp > 14e6 &
qtl_bp <= 16e6, "ch5:14-16Mb",
ifelse(qtl_type == "trans" &
qtl_chromosome == 5 &
qtl_bp > 24e6 &
qtl_bp <=26e6, "ch5:24-26Mb", "none")))))
}
write.csv(x = eqtl.table,
file = paste0("qtl-tables/table_single-stage-eqtl_", stage, ".csv"),
row.names = T)
saveRDS(object = eqtl.table,
file = paste0("qtl-tables/table_single-stage-eqtl_", stage, ".rds"))
table(eqtl.table$qtl_type, eqtl.table$trans_band!="none")
}
# GO for trans bands - single stage ####
ontology <- 'BP' #c('BP', 'CC', 'MF')
x <- org.At.tairCHR
all.genes <- as.list(rownames(trait.matrix))
transband.go <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 9))
colnames(transband.go) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher',
'FDR', 'stage', 'transband')
transband.go.perstage <- transband.go
for (stage in seed.stage) {
eqtl.table <- readRDS(paste0('qtl-tables/table_single-stage-eqtl_', stage, '.rds'))
transband.id <- unique(eqtl.table$trans_band)
transband.id <- transband.id[!transband.id %in% 'none']
transband.go.perstage <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 9))
colnames(transband.go.perstage) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher',
'FDR', 'stage', 'transband')
for (i in 1:length(transband.id)) {
gene.set <- eqtl.table[which(eqtl.table$trans_band == transband.id[i]), 'trait']
gene.set <- factor(as.integer(all.genes %in% gene.set))
names(gene.set) <- all.genes
GOdata <- new("topGOdata",
description = "GOE for genes in regulated by trans bands", ontology = ontology,
allGenes = gene.set,
annot = annFUN.org,mapping= "org.At.tair.db")
resultFisher <- runTest(GOdata, algorithm = 'weight', statistic = "fisher")
result.df <- GenTable(GOdata, Fisher = resultFisher,
orderBy = "Fisher", ranksOf = "Fisher", topNodes = length(resultFisher@score))
result.df$stage <- stage
result.df$transband <- transband.id[i]
result.df$Fisher <- as.numeric(result.df$Fisher)
result.df$FDR <- p.adjust(p = result.df$Fisher, method = 'fdr')
result.df <- result.df[order(result.df$FDR), ]
result.df <- dplyr::filter(result.df, Fisher <= 0.01)
transband.go.perstage <- rbind(transband.go.perstage, result.df)
}
transband.go <- rbind(transband.go, transband.go.perstage)
}
write.csv(transband.go, paste0('files/trans-bands-go-', ontology, '.csv'))
\ No newline at end of file
This diff is collapsed.
##################################################
##### integration of phQTL, mQTL, and eQTL #######
##################################################
# Set working directory and load libraries ####
remove(list = ls())
gc()
# load library
library(dplyr)
library(grid)
library(scales)
library(RColorBrewer)
library(devtools)
library(ggplot2)
# set directory
work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl"
setwd(work.dir)
# gathering phQTL profile
phqtl.peak <- readRDS('qtl-peaks/peak_phqtl_without-outliers-not-transformed.RDS') %>%
mutate(qtl_level = 'phQTL') %>%
mutate(stage = 'phQTL')
# gathering mQTL profile
seed.stage <- c('pd', 'ar', 'im', 'rp')
mqtl.peak <- data.frame(matrix(nrow = 0, ncol = 12))
colnames(mqtl.peak) <- colnames(phqtl.peak)
for (i in seed.stage) {
table.tmp <- readRDS(paste0('qtl-peaks/peak_mqtl_', i, '-without-outliers-log-transformed.RDS')) %>%
mutate(qtl_level = 'mQTL', stage = i)
mqtl.peak <- rbind(mqtl.peak, table.tmp)
}
# gathering eQTL profiles
eqtl.peak <- data.frame(matrix(nrow = 0, ncol = 13))
colnames(eqtl.peak) <- colnames(phqtl.peak)
for (i in seed.stage) {
table.tmp <- readRDS(paste0('qtl-tables/table_single-stage-eqtl_', i, '.rds')) %>%
filter(qtl_type == 'trans') %>%
mutate(qtl_level = 'eQTL', stage = i) %>%
dplyr::select(colnames(phqtl.peak))
eqtl.peak <- rbind(eqtl.peak, table.tmp)
}
# creating the histogram plot ####
# plot style ####
presentation <- theme(axis.text.x = element_text(size=12, face="bold", color="black"),
axis.text.y = element_text(size=12, face="bold", color="black"),
axis.title.x = element_text(size=15, face="bold", color="black"),
axis.title.y = element_text(size=15, face="bold", color="black"),
strip.text.x = element_text(size=15, face="bold", color="black"),
strip.text.y = element_text(size=15, face="bold", color="black"),
plot.title = element_text(size=15, face="bold"),
panel.background = element_rect(fill = "white",color="black"),
panel.grid.major = element_line(colour = "grey80"),
panel.grid.minor = element_blank(),
legend.position = "right")
myColors <- brewer.pal(9,"Set1")[c(3,5,9,6,7)]
myColors <- c("black",brewer.pal(9,"Set1")[c(2,9)])
plot.colour <- c("#000000", "#E69F00", "#009E73", "#0072B2", "#D55E00")
names(plot.colour) <- c("cis", "DryFresh", "DryAR", "6H", "RP")
hist.colour <- c("#E69F00", "#009E73", "#0072B2", "#D55E00")
names(hist.colour) <- c("DryFresh", "DryAR", "6H", "RP")
names(myColors) <- c("cis","trans","none")
Snoekcols <- scale_colour_manual(name = "eQTL type",values = myColors)
Snoekfill <- scale_fill_manual(name = "eQTL type",values = myColors)
# plotting ####
overlap <- function(table, stage) {
for (i in 1:length(stage)) {
if (nrow(table) != 0) {
table <- subset(table, table[stage[i]] == T)
} else {
break
}
}
nrow(table)
}
all.peak <- rbind(phqtl.peak, mqtl.peak, eqtl.peak)
all.peak$stage <- factor(all.peak$stage, levels = c('phQTL', 'pd', 'ar', 'im', 'rp'))
qtl.hist <- ggplot(all.peak, aes(x=qtl_bp, fill = stage)) +
geom_histogram(binwidth = 2000000, right = T, origin = 0, alpha = 1) +
#geom_density(alpha = 0.5) +
facet_grid(factor(qtl_level, level = c('phQTL', 'mQTL', 'eQTL')) ~ qtl_chromosome, scales="free_y") +
presentation +
scale_fill_manual(values = c('#964B00', '#ccbb44', '#228833', '#4477aa', '#cc3311')) +
theme(legend.position = "top", legend.text=element_text(size=5)) +
labs(x="QTL peak position (Mb)",y="QTL counts") +
scale_x_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40))
tiff(file = paste0("figures/all-qtl-hist.tiff"),
width = 2250,
height = 1600,
units = 'px',
res = 300,
compression = 'lzw')
qtl.hist
dev.off()
# venn diagram ####
qtl.table.summary <- data.frame(matrix(ncol = 18, nrow = 0))
window.nu <- 2e6
maxsize <- 100e6
chr.num <- 5
all.peak2 <- all.peak %>%
mutate(interval = findInterval(qtl_bp, seq(1, maxsize, by = window.nu))) %>%
select(trait, qtl_chromosome, interval, qtl_significance, stage, qtl_level) %>%
#group_by(trait, qtl_chromosome, interval, qtl_significance, stage, qtl_level) %>%
count(qtl_chromosome, interval, qtl_level) %>%
#ungroup() %>%
spread(qtl_level, n) %>%
#select(phQTL, mQTL, eQTL) %>%
replace(is.na(.), 0)
overlap <- function(table, stage) {
for (i in 1:length(stage)) {
if (nrow(table) != 0) {
table <- subset(table, table[stage[i]] > 0)
} else {
break
}
}
nrow(table)
}
overlap(all.peak2, c('eQTL', 'phQTL', 'mQTL'))
draw.triple.venn(area1 = overlap(all.peak2, 'phQTL'),
area3 = overlap(all.peak2, 'mQTL'),
area2 = overlap(all.peak2, 'eQTL'),
n13 = overlap(all.peak2, c('phQTL', 'mQTL')),
n12 = overlap(all.peak2, c('phQTL', 'eQTL')),
n23 = overlap(all.peak2, c('mQTL', 'eQTL')),
n123 = overlap(all.peak2, c('phQTL', 'mQTL', 'eQTL')),
category = c('phQTL', 'eQTL', 'mQTL'),
lty = 'blank',
fill = c('#0d98ba', '#c71585', '#f8d568'))
#alpha = 0.9)
\ No newline at end of file
#######################################################
##### seed eQTL network analysis using ARACNe #########
#######################################################
# prepare the script working environment ####
remove(list = ls())
gc()
set.seed(1000)
work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl"
setwd(work.dir)
# library
library(RCy3)
library(dplyr)
library(minet)
library(GENIE3)
library(doParallel)
library(reshape2)
library(devtools)
library(ggplot2)
library(Hmisc)
# determine the stage and transband ####
dev.stage <- 'im'
chr <- 5
start <- 6000000
end <- 8000000
n.perm <- 1000
# load required data ####
# gene and genetic infp
genetic.map <- as.matrix(read.csv(file = 'files/genetic-map.csv'))
gene.info <- read.csv('files/gene.info.csv', row.names = 1)
gene.feature <- read.csv('files/gene-feature.csv')
gene.pol <- read.csv('files/baysha-polymorphism.csv', row.names = 1)
gene.data <- merge(gene.info, gene.pol, all = TRUE)
gene.feature <- merge(gene.info, gene.feature, all = TRUE)
# expression matrix
expression <- as.matrix(read.csv('files/trait-matrix.csv', row.names = 1))
expression <- expression[, colnames(expression) %in% colnames(genetic.map)] # remove sample without genetic map
expression <- expression[, 17:176] # remove parents
expression <- expression[, grep(pattern = 'pd', x = colnames(expression))] # only take ril for specific stage
# ril stage
ril.stage <- substr(x = colnames(expression), start = 8, stop = 10) # write a vector of ril stage
# load the qtl table
eqtl.table <- readRDS('qtl-tables/table_single-stage-eqtl_all.rds')
# select the target trait
target.table <- filter(eqtl.table, qtl_bp >= start,
qtl_type == 'distant',
qtl_bp <= end,
qtl_chromosome == chr,
stage == dev.stage)
targets <- as.character(target.table$trait)
target.expression <- expression[which(rownames(expression) %in% targets), ]
# select the candidate genes
candidate.table <- filter(eqtl.table, qtl_bp >= start,
qtl_bp <= end,
qtl_type == 'local',
qtl_chromosome == chr,
gene_bp >= start,
gene_bp <= end,
stage == dev.stage)
candidates <- as.character(candidate.table$trait)
candidate.expression <- expression[which(rownames(expression) %in% rownames(expression)), ]
# combine expression matrix for targets, candidates, and genes underlying qtl
expression.m <- expression[which(rownames(expression) %in% c(targets, candidates)), ]
# create gene table and determine the gene function
nodes <- gene.feature[which(gene.feature$trait %in% c(targets, candidates)), ]
nodes <- data.frame(id = c(targets, candidates),
role = c(rep('target', length(targets)), rep('candidates', length(candidates))),
is_tf = NA)
nodes[, 'is_tf'] <- gene.feature[match(nodes$id, gene.feature$trait), 'is_TF']
regulators <- as.character(nodes$trait[which(nodes$role == 'candidates' | nodes$is_tf == 1)])
# network inference using GENIE3 ####
weight <- GENIE3(exprMatrix = expression.m, verbose = TRUE)
weight <- melt(weight)
weight$Var1 <- as.character(weight$Var1)
weight$Var2 <- as.character(weight$Var2)
names(weight) <- c('nodes1', 'nodes2', 'weight')
# remove duplicates
edges.genie3 <- as.data.frame(matrix(data = NA, nrow = 6670, ncol = 3))
cur.row <- 1
for(i in row.names(expression.m)) {
for(j in row.names(expression.m)) {
if(i == j) {
next
}
set1 <- weight[which(weight$nodes1 == i & weight$nodes2 == j), ]
set2 <- weight[which(weight$nodes2 == i & weight$nodes1 == j), ]
if(set1[, 3] >= set2[, 3]) {
edges.genie3[cur.row, ] <- set1
cur.row <- cur.row + 1
print(cur.row)
}
if(set1[, 3] <= set2[, 3]) {
edges.genie3[cur.row, ] <- set2
cur.row <- cur.row + 1
print(cur.row)
}
}
}
edges.genie3 <- distinct(edges.genie3)
names(edges.genie3) <- c('nodes1', 'nodes2', 'weight')
# ranking
rank.genie3 <- edges.genie3
rank.genie3$rank <- rank(-rank.genie3$weight)
rank.genie3 <- rank.genie3[, c(1, 2, 4)]
write.csv(x = rank.genie3, file = paste0('networks/', dev.stage, chr, '-genie.csv'))
# network inference using pearson correlation ####
edges.pearson <- rcorr(t(expression.m), type = 'pearson')
edges.pearson <- melt(edges.pearson$r)
names(edges.pearson) <- c('nodes1', 'nodes2', 'weight')
# remove duplicates and determine direction
rank.pearson <- rank.genie3[, 1:2]
rank.pearson <- merge(rank.pearson, edges.pearson, by = c("nodes1", "nodes2"))
rank.pearson <- rank.pearson[, 1:3]
rank.pearson$weight <- abs(rank.pearson$weight)
# ranking
rank.pearson$rank <- rank(-rank.pearson$weight)
rank.pearson <- rank.pearson[, c(1, 2, 4)]
names(rank.pearson) <- c('nodes1', 'nodes2', 'rank')
write.csv(x = rank.pearson, file = paste0('networks/', dev.stage, chr, '-pearson.csv'))
# network inference using spearman correlation ####
edges.spearman <- rcorr(t(expression.m), type = 'spearman')
edges.spearman <- melt(edges.spearman$r)
names(edges.spearman) <- c('nodes1', 'nodes2', 'weight')
# remove duplicates and determine direction
rank.spearman <- rank.genie3[, 1:2]
rank.spearman <- merge(rank.spearman, edges.spearman, by = c("nodes1", "nodes2"))
rank.spearman <- rank.spearman[, 1:3]
rank.spearman$weight <- abs(rank.spearman$weight)
# ranking
rank.spearman$rank <- rank(-rank.spearman$weight)
rank.spearman <- rank.spearman[, c(1, 2, 4)]
names(rank.spearman) <- c('nodes1', 'nodes2', 'rank')
write.csv(x = rank.spearman, file = paste0('networks/', dev.stage, chr, '-spearman.csv'))
# network inference using CLR ####
edges.clr <- minet(dataset = t(expression.m), method = 'clr', estimator = 'mi.empirical', disc = 'equalfreq')
edges.clr <- melt(edges.clr)
names(edges.clr) <- c('nodes1', 'nodes2', 'weight')
# ranking
rank.clr <- rank.genie3[, 1:2]
rank.clr <- merge(rank.clr, edges.clr, by = c("nodes1", "nodes2"))
rank.clr <- rank.clr[, 1:3]
rank.clr$rank <- rank(-rank.clr$weight)
rank.clr <- rank.clr[, c(1, 2, 4)]
names(rank.clr) <- c('nodes1', 'nodes2', 'rank')
write.csv(x = rank.clr, file = paste0('networks/', dev.stage, chr, '-clr.csv'))
# network inference using aracne ####
edges.aracne <- minet(dataset = t(expression.m), method = 'aracne', estimator = 'mi.empirical', disc = 'equalfreq')
edges.aracne <- melt(edges.aracne)
names(edges.aracne) <- c('nodes1', 'nodes2', 'weight')
# ranking
rank.aracne <- rank.genie3[, 1:2]
rank.aracne <- merge(rank.aracne, edges.aracne, by = c("nodes1", "nodes2"))
rank.aracne <- rank.aracne[, 1:3]
rank.aracne$rank <- rank(-rank.aracne$weight)
rank.aracne <- rank.aracne[, c(1, 2, 4)]
names(rank.aracne) <- c('nodes1', 'nodes2', 'rank')
write.csv(x = rank.aracne, file = paste0('networks/', dev.stage, chr, '-aracne.csv'))
# network inference using mrnet ####
edges.mrnet <- minet(dataset = t(expression.m), method = 'mrnet', estimator = 'mi.empirical', disc = 'equalfreq')
edges.mrnet <- melt(edges.mrnet)
names(edges.mrnet) <- c('nodes1', 'nodes2', 'weight')
# ranking
rank.mrnet <- rank.genie3[, 1:2]
rank.mrnet <- merge(rank.mrnet, edges.mrnet, by = c("nodes1", "nodes2"))
rank.mrnet <- rank.mrnet[, 1:3]
rank.mrnet$rank <- rank(-rank.mrnet$weight)
rank.mrnet <- rank.mrnet[, c(1, 2, 4)]
names(rank.mrnet) <- c('nodes1', 'nodes2', 'rank')
write.csv(x = rank.mrnet, file = paste0('networks/', dev.stage, chr, '-mrnet.csv'))
# network inference using tigress
weight.tigress <- tigress::tigress(t(expression.m),
nsplit = 1000,
nstepsLARS = 5,
allsteps = F)
weight.tigress <- melt(weight.tigress)
names(weight.tigress) <- c('nodes1', 'nodes2', 'weight')
weight.tigress$nodes1 <- as.character(weight.tigress$nodes1)
weight.tigress$nodes2 <- as.character(weight.tigress$nodes2)
# remove duplicates
# combination <- nrow(expression.m) * (nrow(expression.m) - 1) /2
# edges.tigress <- as.data.frame(matrix(data = NA, nrow = combination, ncol = 3))
# cur.row <- 1
# for(i in row.names(expression.m)) {
# for(j in row.names(expression.m)) {
# if(i == j) {
# next
# }
# set1 <- weight.tigress[which(weight.tigress$nodes1 == i & weight.tigress$nodes2 == j), ]
# set2 <- weight.tigress[which(weight.tigress$nodes2 == i & weight.tigress$nodes1 == j), ]
# if(set1[, 3] >= set2[, 3]) {
# edges.tigress[cur.row, ] <- set1
# cur.row <- cur.row + 1
# print(cur.row)
# }
# if(set1[, 3] <= set2[, 3]) {
# edges.genie3[cur.row, ] <- set2
# cur.row <- cur.row + 1
# print(cur.row)
# }
# }
# }
# edges.tigress <- distinct(edges.tigress)
# names(edges.tigress) <- c('nodes1', 'nodes2', 'weight')
# ranking
rank.tigress <- rank.genie3[, 1:2]
rank.tigress <- merge(rank.tigress, weight.tigress, by = c("nodes1", "nodes2"))
rank.tigress <- rank.tigress[, 1:3]
rank.tigress$rank <- rank(-rank.tigress$weight)
rank.tigress <- rank.tigress[, c(1, 2, 4)]
names(rank.tigress) <- c('nodes1', 'nodes2', 'rank')
write.csv(x = rank.tigress, file = paste0('networks/', dev.stage, chr, '-tigress.csv'))
# merge the tables
rank.list <- list(rank.genie3, rank.spearman, rank.clr, rank.aracne, rank.tigress)
#rank.list <- list(rank.genie3, rank.spearman, rank.pearson, rank.mrnet, rank.clr, rank.aracne, rank.tigress)
networks <- Reduce(function(x, y) full_join(x, y, by = c('nodes1', 'nodes2')), rank.list)
names(networks) <- c('nodes1', 'nodes2', 'genie3', 'spearman', 'clr', 'aracne', 'tigress')
#names(networks) <- c('nodes1', 'nodes2', 'genie3', 'spearman', 'pearson', 'mrnet', 'clr', 'aracne', 'tigress')
networks$avg_rank <- NA
for(i in 1:nrow(networks)) {
networks$avg_rank[i] <-
mean(as.numeric(networks[i, 3:(ncol(networks) - 1)]))
}
# PCA
pr.out <- prcomp(x = (t(networks[3:10])), center = T, scale. = F)
pc.df <- data.frame(pc1 = pr.out$x[, 1], # 0.37
pc2 = pr.out$x[, 2]) # 0.32
pc.df$method <- rownames(pc.df)
ggplot(pc.df, aes(x = pc1, y = pc2, color = method)) + geom_point(size = 3.5)
pc.sum <- summary(pr.out)
pc1 <- pc.sum$importance[2, 1]
pc2 <- pc.sum$importance[2, 2]
# determine threshold
edges <- networks[, c('nodes1', 'nodes2', 'avg_rank')]
colnames(edges) <- c('source', 'target', 'interaction')
edges <- na.omit(edges)
threshold.m <- as.data.frame(matrix(data = NA, ncol = 3, nrow = 0))
names(threshold.m) <- c('threshold', "edge", "node")
cur.row <- 1
for(i in max(round(edges$interaction, 0)):0){
edges.tmp <- filter(edges, interaction <= i)
total.edge <- nrow(edges.tmp)
total.node <- length(unique(c(edges.tmp$source, edges.tmp$target)))
threshold.m[cur.row, ] <- c(i, total.edge, total.node)
cur.row <- cur.row + 1
}
# visualize the threshold matrix
plot(threshold.m$threshold,
threshold.m$node,
xlab = "rank threshold",
ylab = 'number of node') # 1552 for RP5 and 1248 for PD3
# visualize the network
edges.th <- filter(edges, interaction <= 288)
#edges$interaction <- log2(edges$interaction)
write.csv(edges, paste0('networks/', dev.stage, chr, '-edge.csv'))
write.csv(nodes, paste0('networks/', dev.stage, chr, '-node.csv'))
createNetworkFromDataFrames(nodes = nodes, edges = edges.th)
########################################
##### combined-stage eQTL analysis #####
########################################
#### prepare the script working environment ####
remove(list = ls())
gc()
set.seed(1000)
# Set working directory ####
work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl"
setwd(work.dir)
# dependencies ####
library(dplyr)
library(ggplot2)
library(tidyr)
library(doParallel)
library(VennDiagram)
library(UpSetR)
library(forcats)
library(grid)
library(heritability)
library(topGO)
library(org.At.tair.db)
# unused libraries ####
# library(Mfuzz)
# library(Biobase)
# library(gplots)
# library(RColorBrewer)
# library(limma)
# library(factoextra)
# library(stats)
# library(amap)
#
#
#
# library(gridExtra)
# library(RCy3)
# library(corrplot)
# library(reshape2)
# library(Hmisc)
# library(igraph)
# library(threejs)
# library(MASS)
#
# load required function ####
setwd('functions/')
for(i in 1:length(dir())){
source(dir()[i])
} # read function from Mark
setwd(work.dir)
overlap <- function(table, stage) {
for (i in 1:length(stage)) {
if (nrow(table) != 0) {
table <- subset(table, table[stage[i]] == T)
} else {
break
}
}
nrow(table)
} # function to determine the overlapping qtl between 2 stages
# load the data ####
seed.stage <- c('pd', 'ar', 'im', 'rp')
# presentation
presentation <- theme(axis.text.x = element_text(size=10, face="bold", color="black"),
axis.text.y = element_text(size=10, face="bold", color="black"),
axis.title.x = element_text(size=12, face="bold", color="black"),
axis.title.y = element_text(size=12, face="bold", color="black"),
strip.text.x = element_text(size=12, face="bold", color="black"),
strip.text.y = element_text(size=12, face="bold", color="black"),
strip.text = element_text(size =12, face="bold", color="black"),
#plot.title = element_text(size=15, face="bold"),
panel.background = element_rect(fill = "white",color="black"),
panel.grid.major = element_line(colour = "grey80"),
panel.grid.minor = element_blank())
#### combined-stage eQTL mapping - reduced model ####
stage <- 'rm'
qtl.data <- QTL.data.prep(trait.matrix = trait.matrix,
strain.trait = colnames(trait.matrix),
strain.map = genetic.map,
strain.marker = marker)
cluster <- makeCluster(n.cores, type = "PSOCK")
registerDoParallel(cluster)
output <- foreach(i = 1:nrow(trait.matrix), .combine = 'cbind') %dopar% {
map.all.marker(trait = trait.matrix[i, ], markers = genetic.map)
}
stopCluster(cluster)
pval.out <- t(output[, which(colnames(output) == 'LOD')])
eff.out <- t(output[, which(colnames(output) == 'Eff')])
colnames(pval.out) <- rownames(marker); rownames(pval.out) <- rownames(trait.matrix)
colnames(eff.out) <- rownames(marker); rownames(eff.out) <- rownames(trait.matrix)
qtl.profile <- NULL; qtl.profile <- as.list(qtl.profile)
qtl.profile[[1]] <- round(pval.out,digits=2)
qtl.profile[[2]] <- round(eff.out,digits=3)
qtl.profile[[3]] <- trait.matrix
qtl.profile[[4]] <- genetic.map
qtl.profile[[5]] <- marker
names(qtl.profile) <- c("LOD","Effect","Trait","Map","Marker")
write.EleQTL(map1.output = qtl.profile, filename = paste0("qtl-tables/table_single-stage-eqtl_", 'rm'))
saveRDS(object = qtl.profile,
file = paste0("qtl-profiles/profile_combined-stage-eqtl_", stage, ".rds"))
# eQTL peak finder - combined-stage rm ####
stage <- 'rm'
# peak finder
qtl.profile <- readRDS(paste0('qtl-profiles/profile_single-stage-eqtl_', stage, '.rds'))
qtl.peak <- mapping.to.list(map1.output = qtl.profile) %>%
peak.finder(threshold = 4.3)
qtl.peak <- na.omit(qtl.peak)
saveRDS(object = qtl.peak,
file = paste0("qtl-peaks/peak_combined-stage-eqtl_", stage, ".rds"))
# eQTL table- combined-stage rm ####
stage <- 'rm'
qtl.profile <- readRDS(paste0('qtl-profiles/profile_combined-stage-eqtl_', stage, '.rds'))
qtl.peak <- readRDS(paste0('qtl-peaks/peak_combined-stage-eqtl_', stage, '.rds'))
eqtl.table <- eQTL.table(peak.list.file = qtl.peak, trait.annotation = gene.info) %>%
eQTL.table.addR2(QTL.prep.file = qtl.data)
eqtl.table$qtl_chromosome <- as.factor(eqtl.table$qtl_chromosome)
eqtl.table$gene_chromosome <- as.factor(eqtl.table$gene_chromosome)
# add heritability - combined-stage rm #####
h2.result <- as.list(NULL)
n.perm <- 100
qtl.genes <- eqtl.table$trait.matrix
map <- genetic.map
map <- apply(map, 2, as.numeric) # make sure the alleles are treated as numeric
trait <- trait.matrix
map2 <- map
#map2[map2 == 0] <- 0.5
map2[map2 == -1] <- round(0, 0)
#map2[is.na(map2)] <- 0.5
kinship.matrix <- emma.kinship(map2)
colnames(kinship.matrix) <- colnames(map2); rownames(kinship.matrix) <- colnames(map2)
cluster <- makeCluster(n.cores, type = 'PSOCK')
clusterExport(cl = cluster, c('trait', 'map2', 'marker_h2', 'kinship.matrix', 'h2.REML'))
h2 <- t(parApply(cl = cluster, X = trait, MARGIN = 1, FUN = h2.REML, strain.names = colnames(map2), kinship.matrix = kinship.matrix, Vg.factor = 1))
stopCluster(cluster)
h2 <- cbind.data.frame(trait = rownames(h2), h2)
eqtl.table <- merge(x = eqtl.table, y = h2[, 1:2], by = 'trait', all.x = T)
eqtl.table <- rename(eqtl.table, h2 = h2_REML)
# trans bands identification - combined-stage rm #####
window.nu <- 2e6
maxsize <- 100e6
chr.num <- 5
transband.id <- mutate(eqtl.table, interval = findInterval(qtl_bp, seq(1, maxsize, by = window.nu))) %>%
group_by(qtl_chromosome, interval, qtl_type) %>%
summarise(n.ct = length(unique(trait))) %>%
data.frame() %>%
group_by(qtl_type) %>%
mutate(exp.ct = mean(as.numeric(unlist(n.ct)))) %>%
data.frame() %>%
mutate(transband_significance = ppois(n.ct, lambda = exp.ct, lower.tail = F)) %>%
filter(transband_significance < 0.0001, qtl_type == "trans")
transband.id$transband_id <- with(transband.id, paste0("ch", qtl_chromosome, ":",
(interval - 1) * 2, "-",
interval * 2, "Mb"))
transband.id$stage <- stage
saveRDS(object = transband.id,
file = paste0("trans-bands/trans.bands_", stage, ".rds"))
# for rm
if( stage == 'rm' ) {
eqtl.table <- mutate(eqtl.table,
trans_band = ifelse(qtl_type == "trans" &
qtl_chromosome == 5 &
qtl_bp > 6e6 &
qtl_bp <= 8e6, "ch5:6-8Mb",
ifelse(qtl_type == "trans" &
qtl_chromosome == 1 &
qtl_bp > 26e6 &
qtl_bp <= 28e6, "ch1:26-28Mb",
ifelse(qtl_type == "trans" &
qtl_chromosome == 2 &
qtl_bp > 10e6 &
qtl_bp <= 14e6, "ch2:10-14Mb",
ifelse(qtl_type == "trans" &
qtl_chromosome == 4 &
qtl_bp > 0e6 &
qtl_bp <= 2e6, "ch4:0-2Mb",
ifelse(qtl_type == "trans" &
qtl_chromosome == 5 &
qtl_bp > 14e6 &
qtl_bp <= 16e6, "ch5:14-16Mb","none"))))))
}
print("finish")
write.csv(x = eqtl.table,
file = paste0("qtl-tables/table_combined-stage-eqtl_", stage, ".csv"),
row.names = T)
saveRDS(object = eqtl.table,
file = paste0("qtl-tables/table_combined-stage-eqtl_", stage, ".rds"))
table(eqtl.table$qtl_type, eqtl.table$trans_band != "none")
# GO for trans bands - combined-stage rm ####
ontology <- 'BP'
x <- org.At.tairCHR
all.genes <- as.list(rownames(trait.matrix))
transband.go <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 9))
colnames(transband.go) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher',
'FDR', 'stage', 'transband')
eqtl.table <- readRDS(paste0('qtl-tables/table_combined-stage-eqtl_', stage, '.rds'))
transband.id <- unique(eqtl.table$trans_band)
transband.id <- transband.id[!transband.id %in% 'none']
transband.go.perstage <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 9))
colnames(transband.go.perstage) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher',
'FDR', 'stage', 'transband')
for (i in 1:length(transband.id)) {
gene.set <- eqtl.table[which(eqtl.table$trans_band == transband.id[i]), 'trait']
gene.set <- factor(as.integer(all.genes %in% gene.set))
names(gene.set) <- all.genes
GOdata <- new("topGOdata",
description = "GOE for genes in regulated by trans bands", ontology = ontology,
allGenes = gene.set,
annot = annFUN.org,mapping= "org.At.tair.db")
resultFisher <- runTest(GOdata, algorithm = 'weight', statistic = "fisher")
result.df <- GenTable(GOdata, Fisher = resultFisher,
orderBy = "Fisher", ranksOf = "Fisher", topNodes = length(resultFisher@score))
result.df$stage <- stage
result.df$transband <- transband.id[i]
result.df$Fisher <- as.numeric(result.df$Fisher)
result.df$FDR <- p.adjust(p = result.df$Fisher, method = 'fdr')
result.df <- result.df[order(result.df$FDR), ]
result.df <- dplyr::filter(result.df, Fisher <= 0.01)
transband.go <- rbind(transband.go, result.df)
}
write.csv(transband.go, paste0('files/trans-bands-rm-go', ontology, '.csv'))
# prepare eqtl table and plotting - combined-stage rm ####
eqtl.table <- eqtl.table %>%
mutate(qtl_type = ifelse(qtl_type == 'cis', 'local', 'distant'))
# cis-trans plot - combined-stage rm ####
eqtl.plot <- ggplot(eqtl.table, aes(x = qtl_bp, y = gene_bp)) +
geom_segment(aes(x = qtl_bp_left, y = gene_bp, xend = qtl_bp_right, yend = gene_bp),
alpha = 0.25, colour = "grey") +
geom_point(aes(colour = ordered(qtl_type, levels = c('local', 'distant'))), alpha = .75) +
facet_grid(fct_rev(gene_chromosome) ~ qtl_chromosome, space = "free", scales = "free") +
presentation +
scale_colour_manual('eQTL type', values = c('black', '#377eb8')) +
labs(x = "eQTL peak position (Mb)", y = "Gene position (Mb)") +
scale_x_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40)) +
scale_y_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40))
pdf(file = paste0("figures/cis-trans-plot-rm.pdf"), width = 10, height = 8)
eqtl.plot
dev.off()
# histogram - combined-stage rm ####
eqtl.hist <- ggplot(eqtl.table %>% filter(qtl_type != 'local'),
aes(x=qtl_bp)) + # 750 x 400
geom_histogram(binwidth = 2000000, alpha = 0.8, fill = '#377EB8') +
facet_grid(. ~ qtl_chromosome, space = "free",scales="free") +
presentation +
scale_fill_manual(values = 'blue') +
theme(legend.position = "right", legend.text=element_text(size=10)) +
labs(x="eQTL peak position (Mb)",y="eQTL counts") +
ylim(c(0, 80)) +
geom_hline(aes(yintercept = 11.72),
linetype = 'dashed',
color = 'red',
size = .1) +
scale_x_continuous(breaks=c(5, 10, 15, 20, 25, 30, 35, 40)*10^6,labels=c(5, 10, 15, 20, 25, 30, 35, 40))
pdf(file = "figures/trans-eqtl-histogram-rm.pdf", width = 10, height = 5)
eqtl.hist
dev.off()
# R2 vs heritability - combined-stage rm ####
ggplot(eqtl.table, aes(x = qtl_R2_sm,
fill = factor(qtl_type, levels = c('local', 'distant')),
color = factor(qtl_type, levels = c('local', 'distant')))) +
geom_histogram(aes(y = ..density..), binwidth = 0.01, position = 'identity', alpha = 0) +
geom_density(alpha = 0.5, size = 0) +
geom_vline(aes(xintercept = median(qtl_R2_sm),
color = factor(qtl_type, levels = c('local', 'distant'))),
linetype = 'dashed', size = .5) +
presentation +
scale_color_brewer(palette = 'Set1') +
scale_fill_brewer(palette = 'Set1') +
xlab('explained phenotypic variance (R2)') +
ylab('count of eQTL') +
labs(fill = 'eQTL type', color = 'eQTL type') +
#ylim(0, 1) +
xlim(0, 1) +
theme(strip.text.x = element_text(size = 8))
ggplot(eqtl.table, aes(x = h2, y = qtl_R2_sm,
fill = factor(qtl_type, levels = c('local', 'distant')),
color = factor(qtl_type, levels = c('local', 'distant')))) +
geom_point(size = 1, alpha = 0.5) +
geom_vline(data = group_by(eqtl.table, qtl_type) %>%
summarise(med = median(h2, na.rm = T)),
aes(xintercept = med, color = factor(qtl_type, levels = c('local', 'distant'))),
linetype = 'dashed', size = .5) +
geom_hline(data = group_by(eqtl.table, qtl_type) %>%
summarise(med = median(qtl_R2_sm, na.rm = T)),
aes(yintercept = med, color = factor(qtl_type, levels = c('local', 'distant'))),
linetype = 'dashed', size = .5) +
geom_smooth(model = lm) +
geom_abline(aes(slope = 1, intercept = 0), size = .75, linetype = 'dashed') +
presentation +
scale_color_brewer(palette = 'Set1') +
scale_fill_brewer(palette = 'Set1') +
xlab('narrow-sense heritabllity (h2)') +
ylab('explained phenotypic variance (R2)') +
labs(fill = 'eQTL type', color = 'eQTL type') +
ylim(0, 1) +
xlim(0, 1) +
theme(strip.text.x = element_text(size = 8))
\ No newline at end of file
goe-app.R 0 → 100644
#
# This is a Shiny web application. You can run the application by clicking
# the 'Run App' button above.
#
# Find out more about building applications with Shiny here:
#
# http://shiny.rstudio.com/
#
remove(list = ls())
gc()
# setting directory
work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl/"
setwd(work.dir)
# load library
library(shiny)
library(topGO)
library(org.At.tair.db)
library(ggplot2)
# load variables for goe test
x <- org.At.tairCHR
all.genes <- as.list(as.character(read.csv('files/gene-list.csv', header = F)[, 1]))
hc.go.list <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 8))
colnames(hc.go.list) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher',
'FDR')
# Define UI for application that draws a histogram
ui <- fluidPage(
# Application title
titlePanel("Gene ontology enrichment test for Joosen data"),
# Sidebar with a slider input for number of bins
sidebarLayout(
sidebarPanel(
textAreaInput(inputId = 'gene.list',
label = 'Put your list of gene here!',
width = '100%',
height = '400px',
value = '',
placeholder = "AT4G22890\nAT2G34630\nAT2G03270\nAT2G25590\nAT5G25130",
actionButton('button', label = 'Submit!')
),
selectInput(inputId = 'go.term',
label = 'Select the GO term:',
choices = c('biological process', 'molecular function', 'cellular component'),
selected = 'biological process'
),
actionButton(inputId = 'submit', label = 'Submit')
),
# Show a plot of the generated distribution
mainPanel(
downloadButton(outputId = 'download', 'Download'),
DT::dataTableOutput("go.result")
)
)
)
# Define server logic required to draw a histogram
server <- function(input, output) {
reactive.gene.list <- eventReactive(input$submit, {
input$gene.list
})
# creating data table
data_table <- reactive({
gene.list <- reactive.gene.list()
gene.set <- unlist(strsplit(gene.list, '\n'))
gene.set <- factor(as.integer(all.genes %in% gene.set))
ontology <- ifelse(input$go.term == 'biological process', 'BP',
ifelse(input$go.term == 'molecular function', 'MF', 'CC'))
names(gene.set) <- all.genes
GOdata <- new("topGOdata",
description = "Analyzing clustering results",
ontology = ontology,
allGenes = gene.set,
annot = annFUN.org,mapping= "org.At.tair.db")
resultFisher <- runTest(GOdata, algorithm = 'weight', statistic = "fisher")
result.df <- GenTable(GOdata, pval = resultFisher,
orderBy = "Fisher", ranksOf = "Fisher", topNodes = length(resultFisher@score))
result.df$FDR <- round(p.adjust(p = result.df$pval, method = 'fdr'), 4)
result.df$pval <- round(as.numeric(result.df$pval), 4)
result.df <- result.df[order(result.df$FDR), ]
#result.df <- filter(result.df, FDR <= 0.001)
#hc.go.list <- rbind(hc.go.list, result.df)
result.df
})
output$go.result <- DT::renderDataTable({
data_table()
})
output$download <- downloadHandler(
filename = paste0(input$go.term, ".csv"),
content = function(file) {
write.csv(data_table(), file, row.names = FALSE)
}
)
}
# Run the application
shinyApp(ui = ui, server = server)
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
-----BEGIN OPENSSH PRIVATE KEY-----
b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW
QyNTUxOQAAACBUWrInKzWUdNlKTr6empD/zBf3+RzF8PLM5MoR+wlqpQAAAJhXXXI6V11y
OgAAAAtzc2gtZWQyNTUxOQAAACBUWrInKzWUdNlKTr6empD/zBf3+RzF8PLM5MoR+wlqpQ
AAAEBqhGRCpBqPriAmbs/KkgZ53oSR9HHbjc4wOZC2NvAkrlRasicrNZR02UpOvp6akP/M
F/f5HMXw8szkyhH7CWqlAAAAFW1hcmdpLmhhcnRhbnRvQHd1ci5ubA==
-----END OPENSSH PRIVATE KEY-----
ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFRasicrNZR02UpOvp6akP/MF/f5HMXw8szkyhH7CWql margi.hartanto@wur.nl
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment