5-create-community-network.R

#######################################################
##### seed eQTL network analysis using ARACNe #########
#######################################################

# prepare the script working environment ####
  remove(list = ls())
  gc()
  set.seed(1000) 
  
  work.dir <- "C:/Users/harta005/Projects/seed-germination-qtl"
  setwd(work.dir)
  
# library
  library(RCy3)
  library(dplyr)
  library(minet)
  library(GENIE3)
  library(doParallel)
  library(reshape2)
  library(devtools)
  library(ggplot2)
  library(Hmisc)
  
# determine the stage and transband ####
  dev.stage <- 'rp'
  chr <- 5
  start <- 24000000
  end <- 26000000
  n.perm <- 1000
  
# load required data ####
  
  # gene and genetic infp
  genetic.map <- as.matrix(read.csv(file = 'files/genetic-map.csv'))
  gene.info <- read.csv('files/gene.info.csv', row.names = 1)
  gene.feature <- read.csv('files/gene-feature.csv')
  gene.pol <- read.csv('files/baysha-polymorphism.csv', row.names = 1)
  gene.data <- merge(gene.info, gene.pol, all = TRUE)
  gene.feature <- merge(gene.info, gene.feature, all = TRUE)
  
  # expression matrix
  expression <- as.matrix(read.csv('files/trait-matrix.csv', row.names = 1))
  expression <- expression[, colnames(expression) %in% colnames(genetic.map)] # remove sample without genetic map
  expression <- expression[, 17:176] # remove parents
  expression <- expression[, grep(pattern = 'pd', x = colnames(expression))] # only take ril for specific stage 
  
  # ril stage
  ril.stage <- substr(x = colnames(expression), start = 8, stop = 10) # write a vector of ril stage
  
  # load the qtl table
  eqtl.table <- readRDS('qtl-tables/table_single-stage-eqtl_all.rds')
  
  # select the target trait
  target.table <- filter(eqtl.table, qtl_bp >= start,
                              qtl_type == 'distant',
                              qtl_bp <= end, 
                              qtl_chromosome == chr,
                              stage == dev.stage)
  targets <- as.character(target.table$trait)
  target.expression <- expression[which(rownames(expression) %in% targets), ]
  
  # select the candidate genes 
  candidate.table <- filter(eqtl.table, qtl_bp >= start, 
                          qtl_bp <= end,
                          qtl_type == 'local', 
                          qtl_chromosome == chr,
                          gene_bp >= start,
                          gene_bp <= end,
                          stage == dev.stage)
  candidates <- as.character(candidate.table$trait)
  candidate.expression <- expression[which(rownames(expression) %in% rownames(expression)), ]
  
  # combine expression matrix for targets, candidates, and genes underlying qtl
  expression.m <- expression[which(rownames(expression) %in% c(targets, candidates)), ]
  
  # create gene table and determine the gene function
  nodes <- gene.feature[which(gene.feature$trait %in% c(targets, candidates)), ]
  nodes <- data.frame(id = c(targets, candidates), 
                      role = c(rep('target', length(targets)), rep('candidates', length(candidates))),
                      is_tf = NA)
  nodes[, 'is_tf'] <- gene.feature[match(nodes$id, gene.feature$trait), 'is_TF']
  regulators <- as.character(nodes$trait[which(nodes$role == 'candidates' | nodes$is_tf == 1)])
  
  # network inference using GENIE3 ####
  weight <- GENIE3(exprMatrix = expression.m, verbose = TRUE)
  weight <- melt(weight)
  weight$Var1 <- as.character(weight$Var1)
  weight$Var2 <- as.character(weight$Var2)
  names(weight) <- c('nodes1', 'nodes2', 'weight')
  
  # remove duplicates 
  
  edges.genie3 <- as.data.frame(matrix(data = NA, nrow = 6670, ncol = 3))
  cur.row <- 1
  for(i in row.names(expression.m)) {
    for(j in row.names(expression.m)) {
      if(i == j) {
        next
      }
      set1 <- weight[which(weight$nodes1 == i & weight$nodes2 == j), ]
      set2 <- weight[which(weight$nodes2 == i & weight$nodes1 == j), ]
      if(set1[, 3] >= set2[, 3]) {
        edges.genie3[cur.row, ] <- set1
        cur.row <- cur.row + 1
        print(cur.row)
      } 
      if(set1[, 3] <= set2[, 3]) {
        edges.genie3[cur.row, ] <- set2
        cur.row <- cur.row + 1
        print(cur.row)
      }
    }
  }
  edges.genie3 <- distinct(edges.genie3)
  names(edges.genie3) <- c('nodes1', 'nodes2', 'weight')
  
  # ranking
  rank.genie3 <- edges.genie3
  rank.genie3$rank <- rank(-rank.genie3$weight)
  rank.genie3 <- rank.genie3[, c(1, 2, 4)]
  write.csv(x = rank.genie3, file = paste0('networks/', dev.stage, chr, '-genie.csv'))
  
  # network inference using pearson correlation ####
  
  edges.pearson <- rcorr(t(expression.m), type = 'pearson')
  edges.pearson <- melt(edges.pearson$r)
  names(edges.pearson) <- c('nodes1', 'nodes2', 'weight')
  
  # remove duplicates and determine direction
  rank.pearson <- rank.genie3[, 1:2]
  rank.pearson <- merge(rank.pearson, edges.pearson, by = c("nodes1", "nodes2"))
  rank.pearson <- rank.pearson[, 1:3]
  rank.pearson$weight <- abs(rank.pearson$weight)
  
  # ranking
  rank.pearson$rank <- rank(-rank.pearson$weight)
  rank.pearson <- rank.pearson[, c(1, 2, 4)]
  names(rank.pearson) <- c('nodes1', 'nodes2', 'rank')
  write.csv(x = rank.pearson, file = paste0('networks/', dev.stage, chr, '-pearson.csv'))

  
  # network inference using spearman correlation ####
  edges.spearman <- rcorr(t(expression.m), type = 'spearman')
  edges.spearman <- melt(edges.spearman$r)
  names(edges.spearman) <- c('nodes1', 'nodes2', 'weight')
  
  # remove duplicates and determine direction
  rank.spearman <- rank.genie3[, 1:2]
  rank.spearman <- merge(rank.spearman, edges.spearman, by = c("nodes1", "nodes2"))
  rank.spearman <- rank.spearman[, 1:3]
  rank.spearman$weight <- abs(rank.spearman$weight)
  
  # ranking
  rank.spearman$rank <- rank(-rank.spearman$weight)
  rank.spearman <- rank.spearman[, c(1, 2, 4)]
  names(rank.spearman) <- c('nodes1', 'nodes2', 'rank')
  write.csv(x = rank.spearman, file = paste0('networks/', dev.stage, chr, '-spearman.csv'))
  
  # network inference using CLR ####
  edges.clr <- minet(dataset = t(expression.m), method = 'clr', estimator = 'mi.empirical', disc = 'equalfreq')
  edges.clr <- melt(edges.clr)
  names(edges.clr) <- c('nodes1', 'nodes2', 'weight')
  
  # ranking
  rank.clr <- rank.genie3[, 1:2]
  rank.clr <- merge(rank.clr, edges.clr, by = c("nodes1", "nodes2"))
  rank.clr <- rank.clr[, 1:3]
  
  rank.clr$rank <- rank(-rank.clr$weight)
  rank.clr <- rank.clr[, c(1, 2, 4)]
  names(rank.clr) <- c('nodes1', 'nodes2', 'rank')
  write.csv(x = rank.clr, file = paste0('networks/', dev.stage, chr, '-clr.csv'))
  
  # network inference using aracne ####
  edges.aracne <- minet(dataset = t(expression.m), method = 'aracne', estimator = 'mi.empirical', disc = 'equalfreq')
  edges.aracne <- melt(edges.aracne)
  names(edges.aracne) <- c('nodes1', 'nodes2', 'weight')
  
  # ranking
  rank.aracne <- rank.genie3[, 1:2]
  rank.aracne <- merge(rank.aracne, edges.aracne, by = c("nodes1", "nodes2"))
  rank.aracne <- rank.aracne[, 1:3]
  
  rank.aracne$rank <- rank(-rank.aracne$weight)
  rank.aracne <- rank.aracne[, c(1, 2, 4)]
  names(rank.aracne) <- c('nodes1', 'nodes2', 'rank')
  write.csv(x = rank.aracne, file = paste0('networks/', dev.stage, chr, '-aracne.csv'))
  
  # network inference using mrnet ####
  edges.mrnet <- minet(dataset = t(expression.m), method = 'mrnet', estimator = 'mi.empirical', disc = 'equalfreq')
  edges.mrnet <- melt(edges.mrnet)
  names(edges.mrnet) <- c('nodes1', 'nodes2', 'weight')
  
  # ranking
  rank.mrnet <- rank.genie3[, 1:2]
  rank.mrnet <- merge(rank.mrnet, edges.mrnet, by = c("nodes1", "nodes2"))
  rank.mrnet <- rank.mrnet[, 1:3]
  
  rank.mrnet$rank <- rank(-rank.mrnet$weight)
  rank.mrnet <- rank.mrnet[, c(1, 2, 4)]
  names(rank.mrnet) <- c('nodes1', 'nodes2', 'rank')
  write.csv(x = rank.mrnet, file = paste0('networks/', dev.stage, chr, '-mrnet.csv'))
  
  # network inference using tigress
  weight.tigress <- tigress::tigress(t(expression.m), 
                                    nsplit = 1000, 
                                    nstepsLARS = 5, 
                                    allsteps = F)
  weight.tigress <- melt(weight.tigress)
  names(weight.tigress) <- c('nodes1', 'nodes2', 'weight')
  weight.tigress$nodes1 <- as.character(weight.tigress$nodes1)
  weight.tigress$nodes2 <- as.character(weight.tigress$nodes2)

  # remove duplicates 
  # combination <- nrow(expression.m) * (nrow(expression.m) - 1) /2
  # edges.tigress <- as.data.frame(matrix(data = NA, nrow = combination, ncol = 3))
  # cur.row <- 1
  # for(i in row.names(expression.m)) {
  #   for(j in row.names(expression.m)) {
  #     if(i == j) {
  #       next
  #     }
  #     set1 <- weight.tigress[which(weight.tigress$nodes1 == i & weight.tigress$nodes2 == j), ]
  #     set2 <- weight.tigress[which(weight.tigress$nodes2 == i & weight.tigress$nodes1 == j), ]
  #     if(set1[, 3] >= set2[, 3]) {
  #       edges.tigress[cur.row, ] <- set1
  #       cur.row <- cur.row + 1
  #       print(cur.row)
  #     }       
  #     if(set1[, 3] <= set2[, 3]) {
  #       edges.genie3[cur.row, ] <- set2
  #       cur.row <- cur.row + 1
  #       print(cur.row)
  #     }
  #   }
  # }
  # edges.tigress <- distinct(edges.tigress)
  # names(edges.tigress) <- c('nodes1', 'nodes2', 'weight')
  
  # ranking
  rank.tigress <- rank.genie3[, 1:2]
  rank.tigress <- merge(rank.tigress, weight.tigress, by = c("nodes1", "nodes2"))
  rank.tigress <- rank.tigress[, 1:3]
  
  rank.tigress$rank <- rank(-rank.tigress$weight)
  rank.tigress <- rank.tigress[, c(1, 2, 4)]
  names(rank.tigress) <- c('nodes1', 'nodes2', 'rank')
  write.csv(x = rank.tigress, file = paste0('networks/', dev.stage, chr, '-tigress.csv'))
  
  # merge the tables

  rank.list <- list(rank.genie3, rank.spearman, rank.clr, rank.aracne, rank.tigress)
  #rank.list <- list(rank.genie3, rank.spearman, rank.pearson, rank.mrnet, rank.clr, rank.aracne, rank.tigress)
  networks <- Reduce(function(x, y) full_join(x, y, by = c('nodes1', 'nodes2')), rank.list)

  names(networks) <- c('nodes1', 'nodes2', 'genie3', 'spearman', 'clr', 'aracne', 'tigress')
  #names(networks) <- c('nodes1', 'nodes2', 'genie3', 'spearman', 'pearson', 'mrnet', 'clr', 'aracne', 'tigress')
  networks$avg_rank <- NA
  for(i in 1:nrow(networks)) {
    networks$avg_rank[i] <- 
      mean(as.numeric(networks[i, 3:(ncol(networks) - 1)]))
  }
  
  # PCA
  pr.out <- prcomp(x = (t(networks[3:10])),  center = T, scale. = F)
  pc.df <- data.frame(pc1 = pr.out$x[, 1], # 0.37
                      pc2 = pr.out$x[, 2]) # 0.32
  pc.df$method <- rownames(pc.df)
  ggplot(pc.df, aes(x = pc1, y = pc2, color = method)) + geom_point(size = 3.5)
  pc.sum <- summary(pr.out)
  pc1 <- pc.sum$importance[2, 1]
  pc2 <- pc.sum$importance[2, 2]
  
  # determine threshold
  edges <- networks[, c('nodes1', 'nodes2', 'avg_rank')]
  colnames(edges) <- c('source', 'target', 'interaction')
  
  edges <- na.omit(edges)
  threshold.m <- as.data.frame(matrix(data = NA, ncol = 3, nrow = 0))
  names(threshold.m) <- c('threshold', "edge", "node")
  cur.row <- 1
  for(i in max(round(edges$interaction, 0)):0){
    edges.tmp <- filter(edges, interaction <= i)
    total.edge <- nrow(edges.tmp)
    total.node <- length(unique(c(edges.tmp$source, edges.tmp$target))) 
    threshold.m[cur.row, ] <- c(i, total.edge, total.node)
    cur.row <- cur.row + 1
  }
  
  # visualize the threshold matrix
  plot(threshold.m$threshold, 
       threshold.m$node, 
       xlab = "rank threshold", 
       ylab = 'number of node') # 1552 for RP4 and 1248 for PD2
  
  # determine the stability of rank (RP4)
  
  degree.table <- data.frame(candidates = candidates)
  
  for(i in max(round(edges$interaction, 0)):1) {
    print(i)
    edge.thresholded <- filter(edges, interaction <= i)
    degree.table.tmp <- as.data.frame(table(edge.thresholded$source))
    colnames(degree.table.tmp) <- c("candidates", i)
    rank.degree.table <- merge(data.frame(candidates = candidates),
                               degree.table.tmp, all.x = T)
    rank.degree.table$rank <- rank(-rank.degree.table[, 2], 
                                   na.last = T, ties.method = 'average')
    degree.table[, paste0(i)] <- rank(-rank.degree.table[, 2], 
                                      na.last = T, ties.method = 'average')
  }
  
  all.rank <- data.frame(candidates,
                         mean = round(apply(degree.table[2:ncol(degree.table)], 1, mean), 2),
                         sd = round(apply(degree.table[2:ncol(degree.table)], 1, sd), 2))
  all.rank <- all.rank[order(all.rank$mean), ]
  write.csv(degree.table, paste0('networks/', dev.stage, chr, '-ranks-for-all-threshold.csv'))
  write.csv(all.rank, paste0('networks/', dev.stage, chr, '-ranks.csv'))
  
  # visualize the network
  edges.th <- filter(edges, interaction <= 288)
  #edges$interaction <- log2(edges$interaction)
  write.csv(edges, paste0('networks/', dev.stage, chr, '-edge.csv'))
  write.csv(nodes, paste0('networks/', dev.stage, chr, '-node.csv'))
  createNetworkFromDataFrames(nodes = nodes, edges = edges.th)