Commit ef6e3eae authored by Benis, Nirupama's avatar Benis, Nirupama
Browse files

first commit

parents
# Script to load Agilent data, visualise with limma and analyse using maSigPro
##load libraries
library(Biobase)
library(limma)
library(VennDiagram)
# Input for the gene expression microarray data
##read targets with the experimental design, map sample files to conditions
targets <- readTargets("/home/user/allInfoDir/workspace/testdir_ssb/targets_1.txt") #without replicate information
poolData <- readTargets("/home/user/allInfoDir/workspace/testdir_ssb/targets_pool.txt") #with replicate information
##read in raw data as an ElistRaw object (limma)
rawData <- read.maimages(targets,green.only = TRUE, source = "agilent.median", path = "/home/user/allInfoDir/workspace/testdir_ssb/ileumNjejunum") #from LIMMA for Agilent microarrays
write.table (rawData, file = "/testdir_ssb/raw_data.txt", sep = "\t")
###Quality Control
#colors for controls
spotTypes <- readSpotTypes("/home/user/allInfoDir/workspace/testdir_ssb/spotTypesAllGrouped.txt") #information on controls in the microarray for quality control
spotTypes$SpotType
rawData$genes$Status <- controlStatus(spotTypes, rawData$genes)
##background correction,use normalize mexp method and add offset 1 (in case we want to take logs) and normalize (quantile normalization method)
rawBgcorrect <- backgroundCorrect(rawData, method = "normexp", offset = 1)
normal <- normalizeBetweenArrays(rawBgcorrect, method = "quantile") #makes it an Elist object
normalExprsnData <- normal$E #the expression part of the Elist object
#changes the filenames to condition names just to make the graphs readable
colnames(normalExprsnData) <- poolData$Condition
##MA plots for whole normal$E adapted from the limma function plotMA3by2()
#gives .pdf files with 6 plots in each
source("/home/user/allInfoDir/workspace/newFunctions/plotMA3by2Mod.r")
plotMA3by2Mod(normal)
##Fitting, ranking and merging biological replicates
# substitute the probe names with gene names where ever possible, but retain the unmapped probes
mapProbeGene <- read.delim("PnGn.txt", as.is = T)
#use the information from agilent to map probe names to gene names
normalNoControl <- normal[normal$genes$ControlType == 0,]
replacedProbeName <- normalNoControl$genes$ProbeName
position <- which(mapProbeGene$ProbeName %in% normalNoControl$genes$ProbeName)
replacedProbeName <- replace(replacedProbeName, position, mapProbeGene$GeneName)
normWithGeneName <- normalNoControl
normWithGeneName$genes$ProbeName <- replacedProbeName
rownames(normWithGeneName$E) <- replacedProbeName
save(normWithGeneName, file="D:/workspace/testdir_ssb/output/NormalGnPn.RData")
## average over probes that map to the same gene
normGeneProbeAvg <- avereps(normWithGeneName, ID=normWithGeneName$gene$ProbeName)
colnames(normGeneProbeAvg) <- poolData$Condition
## Remove some probes (lower percentile and duplicate probes)
#Get the highest of the dark spots to get a base value and then multiply by 1.1
negative95 <- apply (normal$E [normal$gene$ControlType == -1,], 2, function(x) quantile(x, p = 0.95))
cutoff <- matrix (1.1 * negative95, nrow(normGeneProbeAvg), ncol(normGeneProbeAvg), byrow = TRUE)
#A spot is "expressed" if it is above the threshold in at least half of the conditions
isExpressed <- rowSums(normGeneProbeAvg$E > cutoff) >= dim(targets)[1]/2
condition <- unique(targets$Condition)
exprsdInAtleast1 <- list()
for(i in 1:length(condition)) {
nCols <- grep(condition[i], colnames(normGeneProbeAvg$E))
isExpressed <- rowSums(normGeneProbeAvg$E[,nCols] > cutoff[,nCols])== length(nCols)
perCondition <- normGeneProbeAvg$genes$ProbeName[isExpressed]
exprsdInAtleast1[[i]] <- perCondition
}
specificCols <- unique(unlist(exprsdInAtleast1))
reduGeneProbeSpC <- normGeneProbeAvg[specificCols,]
save(reduGeneProbeSpC, file= "/home/user/allInfoDir/workspace/testdir_ssb/output/reduGeneProbeSpC.RData")
## PCA plots
#Limit columns, change according to requirements
#For expressed genes
#ExprGenes <- avgGenes[match(i.t3.days,avgGenes$genes$ProbeName),]
#For all the genes
exprGenes <- normal[match(i.t3.days,normal$genes$ProbeName),]
tissue <- "jejunum"
onlyJejunum <- avgGenes[,grep("jejunum", colnames(avgGenes))]
expr.mat <- as.matrix(onlyJejunum$E)
rownames(expr.mat) <- onlyJejunum$gene$ProbeName
#Performing PCA
pca <- prcomp(t(expr.mat))
source("/home/user/allInfoDir/workspace/newFunctions/colourPlotPdfPca.r")
colourPlotPdfPca(pca, colnames(expr.mat), day, treatment)
##Hierarchical
d <- dist(t(expr.mat), method = "euclidean")
d <- as.dist(1 - cor(expr.mat))
hr <- hclust(d, method = "complete", members = NULL)
plot(hr)
dhr <- as.dendrogram(hr)
pdf("Hierarchical_pool.pdf", height = 14, width = 14)
par( mar= c(3,10,1,10))
plot(dhr,type = c("rectangle"), center = FALSE, edge.root = FALSE, leaflab = c("perpendicular"), dLeaf = NULL, horiz = TRUE)
dev.off()
################## Quadratic regression analysis ##############
library(maSigPro)
load("/home/user/allInfoDir/workspace/testdir_ssb/output/GnPnspC.RData") #data that has been normalised, low intensity is removed and gene names substituted, where ever possible so has probe names too
jData <- reduGnPnspC$E[,grep("jejunum", colnames(reduGnPnspC$E))]
source("/home/user/allInfoDir/workspace/newFunctions/makeDesignMasigpro.r")
expDesign <- makeDesignMasigpro(targets, colnames(reduGnPnspC))
#standardise the data
library(Mfuzz)
jDataEset <- new("ExpressionSet", expr=jData)
featurenames <- rownames(jData)
featureNames(jDataEset)<-featurenames
SjData <- standardise(jDataEset)
# perform analysis with the wrapper function
analysisWhole <- maSigPro(exprs(SjData), Q=0.05, vars="groups", expDesign, pdf=F, cluster.method="mfuzz")
#original file
load("/home/user/allInfoDir/workspace/testdir_ssb/output/maSigProJStd.RData")
analysis <- analysisWhole
listCtrl <- rownames(analysis$sig.genes$tmt1$sig.profiles)
listTmt1 <- rownames(analysis$sig.genes$tmt2vstmt1$sig.profiles)
listTmt2 <- rownames(analysis$sig.genes$tmt3vstmt1$sig.profiles)
genesTmt1 <- listTmt1[grep("A_72_",listTmt1, invert=T)]
genesTmt2 <- listTmt2[grep("A_72_",listTmt2, invert=T)]
genesTmt1 <- gsub("[[:punct:]]", "\\.",genesTmt1)
genesTmt2 <- gsub("[[:punct:]]", "\\.",genesTmt2)
tmt1Ntmt2 <- intersect(genesTmt1, genesTmt2)
onlyTmt1 <- setdiff(genesTmt1, genesTmt2)
onlyTmt2 <- setdiff(genesTmt2, genesTmt1)
tmt1Ntmt2 <- scan(file= "/home/user/allInfoDir/workspace/testdir_ssb/masigpro/Jalone/jdue2tmt.txt", sep="\t", what="")
onlyTmt1 <- scan(file= "/home/user/allInfoDir/workspace/testdir_ssb/masigpro/Jalone/jonlyt2.txt", sep="\t", what="")
onlyTmt2 <- scan(file= "/home/user/allInfoDir/workspace/testdir_ssb/masigpro/Jalone/jonlyt3.txt", sep="\t", what="")
########## TopGO script ###########
library(topGO)
# goids from biomart in convOtherDB
# humanMart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# goids = getBM(attributes=c('entrezgene','go_id'), filters='entrezgene', values=allEnt, mart=humanMart)
# geneID2GO <- data.frame()
# for(i in 1:length(allEnt)){
# tmp <- goids[goids$entrezgene %in% allEnt[i],]
# idTmp <- paste(tmp$go_id[grep("GO", tmp$go_id)], collapse=", ")
# oneRow <- data.frame(entrezgene=allEnt[i], go_id=idTmp, stringsAsFactors=F)
# if(oneRow$go_id != "")
# geneID2GO <- rbind(geneID2GO,oneRow)
#
# }
#
# write.table(geneID2GO, file="geneID2GOprop.txt", sep="\t", row.names=F, col.names=F, quote=F)
geneID2GO <- readMappings(file="geneID2GOprop.txt")
propGnId <- read.table(file=paste0("/home/user/allInfoDir/workspace/conversion/propGenesIds.txt"), sep="\t", as.is=T)
allEntrez <- as.character(propGnId$Id)
selectGenesList <- list(tmt1Ntmt2, onlyTmt1, onlyTmt2)
fnameVector <- c("tmt1Ntmt2topGOBP", "onlyTmt1topGOBP", "onlyTmt2topGOBP")
for(i in 1:length(selGenesList)) {
fname=fnameVector[i]
selectGenes <- selectGenesList[[i]]
selectEntrezIds <- as.character(unique(propGnId$Id[propGnId$Gn %in% selectGenes]))
geneList <- factor(as.integer(allEntrez %in% selectEntrezIds))
names(geneList) <- allEntrez
GOdataBP <- new("topGOdata", ontology = "BP", allGenes = geneList, annot = annFUN.gene2GO, gene2GO = geneID2GO)
# save(GOdataBP, file=paste0("D:/workspace/topGO/GOdata", fname, ".RData"))
GOdata <- GOdataBP
annotatedGenes <- genesInTerm(GOdata) #terms for genes, opp to geneID2GO
goNames <- names(annotatedGenes)
test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test")
resultFisher <- getSigGroups(GOdata, test.stat)
pvalFis <- score(resultFisher)
pvalFis <- sort(pvalFis)
pval.01 <- sum(pvalFis < 0.01)
significantGO <- names(pvalFis[1:pval.01])
source("/home/user/allInfoDir/workspace/newFunctions/makeTableTopGO.r")
makeTableTopGO(significantGO, annotatedGenes, fname)
}
\ No newline at end of file
########## Data Integration ###############
# This script performs data preprocessing and data integration of host gene expression and microbiota
# Load libraries
library(RColorBrewer)
library(igraph)
library(mixOmics)
# load and process microbiota data
inputMicrob <- read.delim(file="/home/user/allInfoDir/workspace/microbiota/selectedMasigpro.txt", as.is=T)
inputMicrob <- t(inputMicrob)
# separate data based on conditions
tmt1Ntmt2Microb4mixomics <- inputMicrob
onlytmt1Microb4mixomics <- inputMicrob[grep("T3", rownames(inputMicrob), invert = T),]
onlytmt2Microb4mixomics <- inputMicrob[grep("T2", rownames(inputMicrob), invert = T),]
# load and process gene expression data
load("/home/user/allInfoDir/workspace/testdir_ssb/output/GnspC.RData")
inputExpression <- reduGn$E[,grep("jejunum", colnames(reduGn$E))]
inputExpression <- t(as.matrix(inputExpression))
tmt1Ntmt2 <- scan(file= paste0("/home/user/allInfoDir/workspace/testdir_ssb/masigpro/Jalone/jdue2tmt.txt"), what="")
onlyTmt1 <- scan(file= paste0("/home/user/allInfoDir/workspace/testdir_ssb/masigpro/Jalone/jonlyt2.txt"), what="")
onlyTmt2 <- scan(file= paste0("/home/user/allInfoDir/workspace/testdir_ssb/masigpro/Jalone/jonlyt3.txt"), what="")
# separate the different groups of genes for the analysis, genes
tmt1Ntmt2Exprsn <- inputExpression[,colnames(inputExpression) %in% tmt1Ntmt2]
onlyTmt1Exprsn <- inputExpression[,colnames(inputExpression) %in% onlyTmt1]
onlyTmt2Exprsn <- inputExpression[,colnames(inputExpression) %in% onlyTmt2]
# separate the different groups of genes for the analysis, conditions
tmt1Ntmt2Exprsn4mixomics <- tmt1Ntmt2Exprsn
onlyTmt1Exprsn4mixomics <- onlyTmt1Exprsn[grep("T3", rownames(onlyTmt1Exprsn), invert = T),]
onlyTmt2Exprsn4mixomics <- onlyTmt2Exprsn[grep("T2", rownames(onlyTmt2Exprsn), invert = T),]
## mixomics anlaysis
# calculate similarities
ncomp = 8 #number of components is usually number of conditions - 1
tmt1Ntmt2Spars <- spls(X = tmt1Ntmt2Microb4mixomics, Y = tmt1Ntmt2Exprsn4mixomics, ncomp=ncomp)
ncomp = 5
onlyTmt1Spars <- spls(X = onlytmt1Microb4mixomics, Y = onlyTmt1Exprsn4mixomics, ncomp=ncomp)
onlyTmt2Spars <- spls(X = onlytmt2Microb4mixomics, Y = onlyTmt2Exprsn4mixomics, ncomp=ncomp)
# build networks
color.edge <- colorRampPalette(c("darkgreen", "green", "yellow", "red", "darkred")) # make a color palette for the network
ncomp = 8
netResultTmt1Ntmt2 <- network(tmt1Ntmt2Spars, comp = 1:ncomp, keep.var = TRUE, shape.node = c("rectangle", "rectangle"), color.node = c("white", "pink"), color.edge = color.edge(10), alpha = 3, threshold = 0.8)
ncomp = 5
netResultOnlyTmt1 <- network(onlyTmt1Spars, comp = 1:ncomp, keep.var = TRUE, shape.node = c("rectangle", "rectangle"), color.node = c("white", "pink"), color.edge = color.edge(10), alpha = 3, threshold = 0.8)
netResultOnlyTmt2 <- network(onlyTmt2Spars, comp = 1:ncomp, keep.var = TRUE, shape.node = c("rectangle", "rectangle"), color.node = c("white", "pink"), color.edge = color.edge(10), alpha = 3, threshold = 0.8)
############# Microbiota data ####################
# This script reads in relative contribution data of microbiota, filters it and performs regression analysis using masigpro
#Read in relative contribution data and filter out the groups with low contribution at the level 3 (genera)
#Process data with proper row and column names
allMicrob <- read.delim(file="/home/user/allInfoDir/workspace/microbiota/withTmt.txt", as.is=T)
colNames <- colnames(allMicrob)
colNames <- sub("(\\.[1234567890]+)(T[1234567890]+)", "\\2\\1", colNames, perl=T)
colnames(allMicrob) <- colNames
# Rearrange the columns
dLevels <- c("J8", "J55", "J176")
dT <- c("T1", "T2", "T3")
allMicrob <- allMicrob[,unlist(lapply(dT, grep, colnames(allMicrob)))]
allMicrob <- allMicrob[,unlist(lapply(dLevels, grep, colnames(allMicrob)))]
write.table(palmicrobT, file="D:/workspace/microbiota/microbProper.txt")
palmicrobT <- read.table(file="/home/user/allInfoDir/workspace/microbiota/microbProper.txt", as.is=T)
#check the values for variance
allVar <- list()
variance <- NULL
palmicrob <- allMicrob
condition <- colnames(allMicrob)
condition <- sub("\\.[1234567890]+", "", condition)
condition <- unique(condition)
exLeast1 <- list()
for(i in 1:length(condition)) {
nCols <- grep(condition[i], colnames(allMicrob))
for(j in 1:dim(allMicrob)[1])
variance[j] <- as.numeric(var(t(allMicrob[,nCols][j,])))
allVar[[i]] <- variance
}
lapply(allVar, function(x) max(x))
#shorten microb names
shorten <- scan(file="D:/workspace/microbiota/MgroupsShort.txt", what="", sep="\n")
shorten <- sub("et rel.", "", shorten)
rownames(allMicrob) <- shorten
#check cutoff n filter
# highContri <- rowSums(almicrob > cutoff)>=4
source("/home/user/allInfoDir/workspace/newFunctions/filterPerCondition.r")
specificCols <- filterPerCondition(allMicrob, 0.1, condition)
filteredMicrob <- allMicrob[specificCols,]
passFilt <- rownames(filteredMicrob)
write(passFilt, file="D:/workspace/microbiota/passFilt.txt")
passFilt <- scan(file="D:/workspace/microbiota/passFilt.txt", what="", sep="\t")
######## Masigpro
library(maSigPro)
#remove the three samples from animals rejected in gene expression quality control
mpoolData <- colnames(filteredMicrob)
remTmp <- c ("J8T2.8", "J176T2.8", "J176T3.4")
rnames <- mpoolData[!mpoolData %in% remTmp]
filteredMicrob <- subset(filteredMicrob, select= -c(J8T2.8, J176T2.8, J176T3.4)) #NA value in output check
expDesign <- read.table(file="/home/user/allInfoDir/workspace/testdir_ssb/masigpro/edesignJ.txt")
rownames(expDesign) <- colnames(filteredMicrob)
firstStep <- p.vector(data=filteredMicrob, design=expDesign)
# write(rownames(firstStep$SELEC), file=paste0(wd,"/microbiota/masigMicrob1st46.txt"), sep="\t")
selectedMicrob <- firstStep$SELEC
write.table(selectedMicrob, file="/home/user/allInfoDir/workspace/microbiota/selectedMasigpro.txt", sep = "\t", quote = F)
########################################
\ No newline at end of file
colourPlotPdfPca <- function(pca, cnames.expr.mat, day, treatment){
pca <- pca
sum.pca <- summary(pca)
cnames.expr.mat <- cnames.expr.mat
day <- day
treatment <- treatment
colors <- cnames.expr.mat
names(colors) <- cnames.expr.mat
colors[grep(paste0("_", day[1], "_"), colors)] <- "red"
colors[grep(paste0("_", day[2], "_"), colors)] <- "green"
colors[grep(paste0("_", day[3], "_"), colors)] <- "blue"
treatmentsAll <- cnames.expr.mat
names(treatmentsAll) <- cnames.expr.mat
treatmentsAll[grep("_T1", treatmentsAll)] <- 1
treatmentsAll[grep("_T2", treatmentsAll)] <- 2
treatmentsAll[grep("_T3", treatmentsAll)] <- 3
#Put the plots in a .pdf
pdf("normal_pca.pdf", height=10, width=10)
plot(pca$x , main="PCA", pch=17, col=colors)
text(pca$x, pos=1, treatmentsAll)
mtext(paste("first two components explain ",format(100 * sum.pca$importance["Cumulative Proportion",2],digits=2),"% of variance", sep=""))
dev.off()
}
\ No newline at end of file
contrastAcrossDays <- function(tissue, day, treatment, f) {
tissue <- tissue
day <- day
treatment <- treatment
f <- f
req.contrast <- NULL
contrast.names.tissue <- NULL
contrast.names <- NULL
for (i in 1:2) {
req.tissue <- levels(f)[grep (tissue[i], levels(f))]
for (j in 1:3){
req.contrast <- NULL
req.treatment <- req.tissue[grep (treatment[j], req.tissue)]
for (k in 1:3){
req.contrast <- append(req.contrast, req.treatment[grep (day[k], req.treatment)])
if (k == 3)
contrast.names.tissue[[j]] <- c(paste(req.contrast[k-2],"-", req.contrast[k-1], sep= ""), paste(req.contrast[k-1],"-", req.contrast[k], sep= ""))
}
}
contrast.names[[i]] <- contrast.names.tissue
names(contrast.names[[i]]) <- treatment
}
names(contrast.names) <- tissue
return(contrast.names)
}
\ No newline at end of file
filterPerCondition <- function(dataDF, threshold, condition){
dataDF <- dataDF #a dataframe
threshold <- threshold # numeric
condition <- condition # should be found in the colnames of dataDF
cutoff <- matrix (threshold, nrow(dataDF), ncol(dataDF), byrow = TRUE)
contribLeast1 <- list()
for(i in 1:length(condition)) {
nCols <- grep(condition[i], colnames(dataDF))
highContrib <- rowSums(dataDF[,nCols] > cutoff[,nCols]) == length(nCols)
perCondition <- rownames(dataDF)[highContrib]
contribLeast1[[i]] <- perCondition
cat(condition[i], "\n", length(perCondition), "\n") #jus for debugging n compilin info
}
specificCols <- unique(unlist(contribLeast1))
return(specificCols)
}
\ No newline at end of file
makeDesignMasigpro <- function(targets, cnames.reduGnPnspC){
targets <- targets
cnames.reduGnPnscp <- cnames.reduGnPnspC
dLevels <- c("jejunum_8", "jejunum_55", "jejunum_176")
dT <- c("T1", "T2", "T3")
targets <- targets[unlist(lapply(dT, grep, targets$Condition)),]
targets <- targets[unlist(lapply(dLevels, grep, targets$Condition)),]
jnoPool <- unique(targets$Condition)[grep("jejunum", unique(targets$Condition))]
jpNames <- cnames.reduGnPnspC[grep("jejunum", cnames.reduGnPnspC)]
#generate the columns necessary for the matrix
names(jnoPool) <- c(1:9)
repl <- NULL
tpt <- NULL
tmt1 <- rep(0, 33)
tmt2 <- rep(0, 33)
tmt3 <- rep(0, 33)
for(i in 1:length(jnoPool)) {
fornames <- as.character(unlist(strsplit(jnoPool[i], split = "_|-")))
pos <- grep(jnoPool[i], jpNames)
repl <- append(repl,as.integer(rep(names(jnoPool[i]), each=length(pos))))
tpt <- append(tpt, as.integer(rep(fornames[2], each=length(pos))))
if(fornames[3] == "T1")
tmt1[pos] <- as.integer(rep(1, each=length(pos)))
if(fornames[3] == "T2")
tmt2[pos] <- as.integer(rep(1, each=length(pos)))
if(fornames[3] == "T3")
tmt3[pos] <- as.integer(rep(1, each=length(pos)))
}
jexpDesign <- as.matrix(cbind(tpt, repl, tmt1, tmt2, tmt3))
rownames(jexpDesign) <- jpNames
return(jexpDesign)
}
\ No newline at end of file
makeTableTopGO <- function(significantGO, annotatedGenes, fname){
significantGO <- significantGO
annotatedGenes <- annotatedGenes
nRow <- length(significantGO)
enrichTable <- as.data.frame(matrix(rep(0, 6*nRow), nrow=nRow))
colNames=c("RowNum", "GO.ID", "GO.Term","p.value", "GeneInList", "GeneInGO")
colnames(enrichTable) <- colNames
for(j in 1:nRow) {
genesTerm <- annotatedGenes[[significantGO[j]]]
tGenes <- intersect(selectEntrezIds, genesTerm)
enrichTable$RowNum[j] <-j
enrichTable$GO.ID[j] <-significantGO[j]
enrichTable$GO.Term[j] <- GOTERM[[significantGO[j]]]@Term
enrichTable$p.value[j] <- unname(pvalFis[j])
enrichTable$GeneInList[j] <- length(tGenes)
enrichTable$GeneInGO[j] <- length(genesTerm)
}
write.table(enrichTable, file=paste0(fname, ".txt"), row.names=F, quote=F, sep="\t")
}
\ No newline at end of file
plotMA3by2Mod <- function(normal) {
x <- class(normal)
if(x[[1]] == "EList"){
MA <- normal
MA$E <- as.matrix(MA$E)
narrays <- ncol(MA$E)
npages <- ceiling(narrays/6)
main <- colnames(MA)
width <- 6.5
height <- 10
xlim <- c(3, 20)
ylim <- c(-15, 15)
prefix <- 'MA for Elist'
ext <- "pdf"
xlab <- "A"
ylab <- "M"
status <- MA$genes$Status
for (ipage in 1:npages) {
i1 <- ipage * 6 - 5
i2 <- min(ipage * 6, narrays)
pname = paste(prefix, "-", i1, "-", i2, ".", ext, sep = "")
pdf(pname , height = height, width= width)
par(mfrow = c(3, 2))
for (i in i1:i2) {
plotMA(MA, array = i, xlim = xlim, ylim = ylim, legend = (i%%6 == 1), zero.weights = TRUE, main = main[i])
}
dev.off()
}
} else { print("This function was modified only for Elist objects")}
}
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment