1-clustering.R

###############################################
##### seed transcript clustering analysis #####
###############################################


#### prepare the script working environment ####
  remove(list = ls())
  gc()
  set.seed(1000)  
  
  # set working directory ####
  work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl"
  setwd(work.dir)

  # dependencies ####
  library(dplyr)
  library(ggplot2)
  library(limma)
  library(Biobase)
  library(stats)
  library(gplots)
  library(RColorBrewer)
  library(Rfast)
  library(amap)
  library(topGO)
  library(org.At.tair.db)
  # unused libraries ####
  # library(Mfuzz)
  # library(factoextra)
  # library(doParallel)
  # library(forcats)
  # library(tidyr)
  # library(gridExtra)
  # library(Hmisc)


# load the data ####
  trait.matrix <- read.csv(file = 'files/trait-matrix.csv', row.names = 1)
  sample.list <- read.csv(file = 'files/sample-list.csv')
  sample.stage <- sample.list$stage
  
# data presentation ####
  presentation <- theme(axis.text.x = element_text(size=6, face="bold", color="black"),
                        axis.text.y = element_text(size=6, face="bold", color="black"),
                        axis.title.x = element_text(size=7, face="bold", color="black"),
                        axis.title.y = element_text(size=7, face="bold", color="black"),
                        strip.text.x = element_text(size=7, face="bold", color="black"),
                        strip.text.y = element_text(size=7, face="bold", color="black"),
                        strip.text = element_text(size =7, face="bold", color="black"),
                        #plot.title = element_text(size=15, face="bold"),
                        panel.background = element_rect(fill = "white",color="black"),
                        panel.grid.major = element_line(colour = "grey80"),
                        panel.grid.minor = element_blank())
  
# create the expression object ####
  x <- trait.matrix # the expression matrix
  genes <- data.frame(trait = rownames(x)) # the gene list
  f <- data.frame(trait = rownames(x)) # the gene feature
  rownames(f) <- rownames(x)
  p <- data.frame(id = colnames(trait.matrix), stage = sample.stage)
  rownames(p) <- colnames(x)
  eset <- ExpressionSet(assayData = as.matrix(x),
                        phenoData = AnnotatedDataFrame(p),
                        featureData = AnnotatedDataFrame(f))
  
# PCA ####
  pr.out <- prcomp(x = (t(trait.matrix)),  center = T, scale. = F)
  pc.df <- data.frame(pc1 = pr.out$x[, 1], 
                          pc2 = pr.out$x[, 2], 
                          stage = sample.stage, 
                          population = c(rep('parent', 16), rep('RIL', 164)))
  
  pca.all <- ggplot(pc.df, aes(x = pc1, y = pc2, color = factor(stage, level = c('pd', 'ar', 'im', 'rp')))) + 
                            geom_point(aes(shape = population)) +
                            scale_colour_manual(values = c('#ccbb44', '#228833', '#4477aa', '#cc3311')) +
                            scale_shape_manual(values = c(17, 19)) +
                            labs(x = "PC1 (55.56%)", y = "PC2 (14.82%)") +
                            labs(colour = 'stage') +
                            theme(text = element_text(size = 10),
                                  panel.background = element_blank(),
                                  panel.border=element_rect(fill=NA),
                                  panel.grid.major = element_blank(),
                                  panel.grid.minor = element_blank(),
                                  strip.background=element_blank(),
                                  axis.text.x=element_text(colour="black"),
                                  axis.text.y=element_text(colour="black"),
                                  axis.ticks=element_line(colour="black"),
                                  plot.margin=unit(c(1,1,1,1),"line")) +
                            #presentation +
                            geom_hline(aes(yintercept = 0), linetype = 'dashed', size = .1) +
                            geom_vline(aes(xintercept = 0), linetype = 'dashed', size = .1) 
  tiff(file = paste0("figures/pca-all.tiff"), 
       width = 2250, 
       height = 1200, 
       units = 'px',
       res = 300,
       compression = 'lzw')
  pca.all
  dev.off()  
  
# differential expresed gene analysis and hierarchiecal clustering ####
  group <- with (pData(eset), stage)
  group <- factor(group)
  design <- model.matrix(~ 0 + group)
  colnames(design) <- levels(group)
  colSums(design) 
  
  # create pairwise contrast matrix among four different stages
  cm <- makeContrasts(pd_ar = pd - ar,
                      ar_im = ar - im,
                      im_rp = im - rp,
                      levels = design)
  coef <- colnames(cm)
  
  # fit the linear model to determine significant differentially expressed genes
  # for each pairwise contrast 
  fit <- lmFit(eset, design)
  fit2 <- contrasts.fit(fit, contrasts = cm) %>%
    eBayes()
  result <- decideTests(fit2)
  summary(result) # 1 = upregulate
  write.csv(result, 'files/differentally-expressed-genes.csv')

  # determine the differentially expressed genes for each pairwise contrast  
  # based on p value (>0.05) and  log fold change more than 1 sorted by fold change
  de.genes <- NA
  coef2 <- 
  for (i in 1:length(coef)) {
    print(i)
    deg.tmp <- topTable(fit2, lfc = 1, coef = coef[i], 
                        p.value = 0.05, 
                        sort.by = 'logFC', 
                        number = 100000)
    de.genes <- c(de.genes, as.character(deg.tmp$trait))
  }
  
  de.genes <- unique(de.genes) # as much as 990 genes are differentially expressed between one of the contrasts
  cluster.data <- trait.matrix[which(rownames(trait.matrix) %in% de.genes), ]
  colnames(cluster.data) <- sample.stage
  
# hierarchiecal tree clustering done for stage and genes ####
  
  # hc for stage
  dist.matrix.stage <- Dist(t(cluster.data), 
                            method = 'pearson', 
                            upper = T, 
                            diag = T) 
  hc.stage <- hclust(dist.matrix.stage, method = 'ward.D2')
  #hc.stage$order <- c(hc.stage$order[46:180], hc.stage$order[1:45])
  reorder(x = as.dendrogram(hc.stage), c(46:180, 1:45))
  plot(reorder(x = as.dendrogram(hc.stage), 180:1, agglo.FUN = mean))
  
  # hc for gene
  dist.matrix.gene <- Dist(cluster.data, 
                           method = 'pearson', # to group the genes based on patterns similarity across stages
                           upper = T, 
                           diag = T)  
  hc.gene <- hclust(dist.matrix.gene, method = 'ward.D2') # similar to average linkage-
  # it calculates the distance that is minimizing the variance within cluster
  # and maximazing the variance between clusters
  plot(hc.gene)
  hc.gene <- cutree(hc.gene, k =6) # if k > 5, the cluster will be diproporsional i.e. a cluster only have few member
  table(hc.gene)
  hc.table <- as.data.frame(table(true = rownames(cluster.data), cluster = hc.gene))
  hc.table <- hc.table[which(hc.table$Freq != 0), ]
  hc.table$cluster <- as.numeric(hc.table$cluster)
  hc.table$true <- as.character(hc.table$true)
  hc.table$Freq <- NULL
  table(hc.table$cluster)
  write.csv(hc.table, 'files/gene-cluster.csv')
  
  # heatmap and hierarchiecal clustering ####
  sample.color <- ifelse(grepl('pd', colnames(trait.matrix)), '#4daf4a', 
                         ifelse(grepl('ar', colnames(trait.matrix)), '#377eb8', 
                                ifelse(grepl('im', colnames(trait.matrix)), '#984ea3',
                                       ifelse(grepl('rp', colnames(trait.matrix)), '#e41a1c', NA))))
  
  tiff(file = paste0("figures/heatmap-genes.tiff"), 
       width = 2250, 
       height = 2000, 
       units = 'px',
       res = 300,
       compression = 'lzw')
  heatmap.2(x = as.matrix(cluster.data), cexRow = 0.8, cexCol = 1.2,
                  distfun = function(x) Dist(x, method = 'pearson'),
                  hclust = function(x) hclust(x, method = 'ward.D2'),
                  scale = 'row', 
                  density.info = "density", 
                  trace = 'none', 
                  col = brewer.pal(9, 'YlGnBu'),
                  keysize = 1,
                  key.title = 'Z-score of\nlog-intensities',
                  key.xlab = NA, 
                  Colv = reorder(x = as.dendrogram(hc.stage), 180:1, agglo.FUN = mean), 
                  ColSideColors = sample.color)
  dev.off() 
 
# GOE analysis for genes in each cluster ####
  x <- org.At.tairCHR
  all.genes <- as.list(rownames(trait.matrix))
  hc.go.list <- as.data.frame(matrix(data = NA, nrow = 0, ncol = 8))
  colnames(hc.go.list) <- c('GO.ID', 'Term', 'Annotated', 'Significant', 'Expected', 'Fisher', 
                            'FDR', 'cluster')
  ontology <- 'BP'
  
  for (i in unique(hc.table$cluster)) {
    gene.set <- hc.table$true[which(hc.table$cluster == i)]
    gene.set <- factor(as.integer(all.genes %in% gene.set))
    names(gene.set) <- all.genes
    GOdata <- new("topGOdata",
                  description = "Analyzing clustering results", ontology = ontology,
                  allGenes = gene.set, 
                  annot = annFUN.org,mapping= "org.At.tair.db")
    resultFisher <- runTest(GOdata, algorithm = 'weight', statistic = "fisher")
    # GOE from topgo consider the general terms on the hierarchy
    # the weight algortihm maintain the balance between type I and II errors
    result.df <- GenTable(GOdata, Fisher = resultFisher,
                          orderBy = "Fisher", ranksOf = "Fisher", topNodes = length(resultFisher@score))
    result.df$FDR <- p.adjust(p = result.df$Fisher, method = 'fdr')
    result.df$cluster <- i
    result.df <- result.df[order(result.df$FDR), ]
    result.df <- filter(result.df, FDR <= 0.001)
    hc.go.list <- rbind(hc.go.list, result.df)
  }
  
  write.csv(hc.go.list, paste0('files/goe-tables', ontology, '.csv'))