11-multiomics-analysis.Rmd

output:
  pdf_document: default
  html_document: default
# Default knitr options applied to every chunk of this document.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  error = FALSE,
  cache = TRUE,
  out.width = "50%",
  fig.width = 5,
  fig.align = "center"
)
# Load the three omics matrices (features in rows, samples in columns) and the
# sample annotations from the compGenomRData package.
# `mustWork = TRUE` makes a missing file fail right here; otherwise
# system.file() returns "" and read.csv("") would try to read from stdin.

# Gene expression
csvfile <- system.file("extdata", "multi-omics", "COREAD_CMS13_gex.csv",
                       package = "compGenomRData", mustWork = TRUE)
x1 <- read.csv(csvfile, row.names = 1)
# Keep only the part of each row name that precedes the first "|".
rownames(x1) <- vapply(strsplit(rownames(x1), "\\|"),
                       function(parts) parts[1],
                       character(1))
knitr::kable(head(t(head(x1))), caption = "Gene expression data (head)")

# Mutations, binarized: any positive value becomes 1.
csvfile <- system.file("extdata", "multi-omics", "COREAD_CMS13_muts.csv",
                       package = "compGenomRData", mustWork = TRUE)
x2 <- read.csv(csvfile, row.names = 1)
x2[x2 > 0] <- 1
knitr::kable(head(t(head(x2))), caption = "Mutation data (head)")

# Copy number
csvfile <- system.file("extdata", "multi-omics", "COREAD_CMS13_cnv.csv",
                       package = "compGenomRData", mustWork = TRUE)
x3 <- read.csv(csvfile, row.names = 1)
knitr::kable(head(t(head(x3))), caption = "Copy number data (head)")

# Sample annotations (subtypes and other covariates)
csvfile <- system.file("extdata", "multi-omics", "COREAD_CMS13_subtypes.csv",
                       package = "compGenomRData", mustWork = TRUE)
covariates <- read.csv(csvfile, row.names = 1)
# Replace "-" by "." in the sample ids -- presumably so that they match the
# column names of x1, which read.csv() sanitizes by default (check.names).
rownames(covariates) <- gsub("-", ".", rownames(covariates), fixed = TRUE)
# Put the annotation rows in the same order as the columns of x1.
covariates <- covariates[colnames(x1), ]

# Column annotation (CMS label per sample) shared by the heatmaps below.
anno_col <- data.frame(cms = as.factor(covariates$cms_label))
rownames(anno_col) <- rownames(covariates)
# One heatmap per omics type, each annotated with the CMS label of the samples.
pheatmap::pheatmap(
  x1,
  annotation_col = anno_col,
  show_colnames = FALSE,
  show_rownames = FALSE,
  main = "Gene expression data"
)
pheatmap::pheatmap(
  x2,
  annotation_col = anno_col,
  show_colnames = FALSE,
  show_rownames = FALSE,
  main = "Mutation data"
)
pheatmap::pheatmap(
  x3,
  annotation_col = anno_col,
  show_colnames = FALSE,
  show_rownames = FALSE,
  main = "Copy number data"
)

# Static illustrations included in the rendered document.
knitr::include_graphics("images/matrix_factorization.png")
knitr::include_graphics("images/mfa.png")
# Multiple factor analysis on the three omics matrices stacked by rows and then
# transposed so that samples are rows. The group sizes are the number of
# features contributed by each omics type, in stacking order.
r.mfa <- FactoMineR::MFA(
  t(rbind(x1, x2, x3)),
  c(nrow(x1), nrow(x2), nrow(x3)),
  graph = FALSE
)

# Per-sample coordinates and per-feature coordinates returned by MFA.
mfa.h <- r.mfa$global.pca$ind$coord
mfa.w <- r.mfa$quanti.var$coord

# Scatter plot of the first two MFA dimensions, colored by CMS subtype.
mfa_df <- as.data.frame(mfa.h)
mfa_df$subtype <- factor(covariates[rownames(mfa_df), ]$cms_label)
ggplot2::ggplot(
  mfa_df,
  ggplot2::aes(x = Dim.1, y = Dim.2, color = subtype)
) +
  ggplot2::geom_point() +
  ggplot2::ggtitle("Scatter plot of MFA")

# Heatmap of the same two dimensions (rows) across samples (columns).
pheatmap::pheatmap(
  t(mfa.h)[1:2, ],
  annotation_col = anno_col,
  show_colnames = FALSE,
  main = "MFA for multi-omics integration"
)
# Toy copy-number table with signed values (3 samples x 2 segments).
cnvs_with_neg <- data.frame(
  seg1 = c(1, 2, 1),
  seg2 = c(0, 1, -2),
  row.names = c("samp1", "samp2", "samp3")
)

# The same table after each segment is split into a gain ("+") column and a
# loss ("-") column; check.names = FALSE keeps the "+"/"-" in the names.
cnvs_split_pos <- data.frame(
  `seg1+` = c(1, 2, 1),
  `seg1-` = c(0, 0, 0),
  `seg2+` = c(0, 1, 0),
  `seg2-` = c(0, 0, 2),
  row.names = c("samp1", "samp2", "samp3"),
  check.names = FALSE
)
# Render the two toy tables with their captions.
knitr::kable(
  cnvs_with_neg,
  caption = "Example copy number data. Data can be both positive (amplified regions) or negative (deleted regions)."
)
knitr::kable(
  cnvs_split_pos,
  caption = "Example copy number data after splitting each column to a column representing copy number gains (+) and a column representing deletions (-). This data matrix is non-negative, and thus suitable for NMF algorithms."
)
# For NMF to be usable on data types which allow negativity, we use a trick:
# every column is split into two non-negative columns, one holding the positive
# part and one holding the magnitude of the negative part.
#
# Args:
#   x: a numeric matrix or data frame.
# Returns:
#   A matrix with twice as many columns as `x`. Column 2*i-1 is pmax(x[, i], 0)
#   and is named "<name>+"; column 2*i is pmax(-x[, i], 0) and is named
#   "<name>-". NULL is returned when `x` has no columns.
split_neg_columns <- function(x) {
    n_cols <- ncol(x)
    if (n_cols == 0) {
        return(NULL)
    }
    col_names <- colnames(x)
    if (is.null(col_names)) {
        # Without fallback names every column would be labelled just "+"/"-".
        col_names <- paste0("V", seq_len(n_cols))
    }
    # Fill the output by position rather than by name, so that missing or
    # duplicated column names can never overwrite an earlier column.
    new_cols <- vector("list", 2 * n_cols)
    for (i in seq_len(n_cols)) {
        new_cols[[2 * i - 1]] <- pmax(x[, i], 0)
        new_cols[[2 * i]] <- pmax(-x[, i], 0)
    }
    names(new_cols) <- paste0(rep(col_names, each = 2), c("+", "-"))
    return(do.call(cbind, new_cols))
}

# Sanity check for split_neg_columns() on a small hand-computed example.
test_split_neg_columns <- function() {
    signed <- as.data.frame(cbind(c(1, 2, 1), c(0, 1, -2)))
    wanted <- as.matrix(
        as.data.frame(cbind(c(1, 2, 1), c(0, 0, 0), c(0, 1, 0), c(0, 0, 2)))
    )
    actual <- as.matrix(split_neg_columns(signed))
    stopifnot(all(wanted == actual))
}
test_split_neg_columns()

# Inverse of the column-splitting transformation: consecutive column pairs
# (2k-1, 2k) are collapsed into a single column. The first column of a pair
# supplies the values; wherever the second column is positive, the result is
# minus the second column's value instead.
merge_neg_columns <- function(x) {
    n_pairs <- dim(x)[2] / 2
    merged <- lapply(seq_len(n_pairs), function(k) {
        gains <- x[, 2 * k - 1]
        losses <- x[, 2 * k]
        is_loss <- losses > 0
        gains[is_loss] <- -losses[is_loss]
        gains
    })
    do.call(cbind, merged)
}

# Round-trip check: splitting and then merging must give back the input.
test_merge_neg_columns <- function() {
    signed <- as.data.frame(cbind(c(1, 2, 1), c(0, 1, -2)))
    round_trip <- merge_neg_columns(split_neg_columns(signed))
    stopifnot(all(round_trip == signed))
}
test_merge_neg_columns()
# Step 1: divide every feature (row) by the sum of its values.
# NOTE(review): a row whose sum is zero would produce NaN/Inf here.
x1.featnorm <- x1 / rowSums(x1)
x2.featnorm <- x2 / rowSums(x2)
x3.featnorm <- x3 / rowSums(x3)

# Step 2: divide every omics matrix by its own Frobenius norm.
x1.featnorm.frobnorm <- x1.featnorm / norm(as.matrix(x1.featnorm), type = "F")
x2.featnorm.frobnorm <- x2.featnorm / norm(as.matrix(x2.featnorm), type = "F")
x3.featnorm.frobnorm <- x3.featnorm / norm(as.matrix(x3.featnorm), type = "F")

# Step 3: the CNV matrix is made non-negative by splitting each feature into
# two. split_neg_columns() works on columns, hence the transpose before and
# after.
x3.featnorm.frobnorm.nonneg <- t(
  split_neg_columns(t(x3.featnorm.frobnorm))
)

# Joint NMF with 2 factors on the stacked, normalized omics matrices
# (transposed so that samples are rows).
# library() is used instead of require() so that a missing NMF package stops
# the script here with a clear error rather than at the nmf() call below.
library(NMF)
r.nmf <- nmf(
  t(rbind(x1.featnorm.frobnorm,
          x2.featnorm.frobnorm,
          x3.featnorm.frobnorm.nonneg)),
  2,
  method = "Frobenius"
)

# Basis matrix (one row per sample) and coefficient matrix (one column per
# feature) of the fitted model.
nmf.h <- NMF::basis(r.nmf)
nmf.w <- NMF::coef(r.nmf)
# Sample-level NMF factors as a data frame, with the CMS subtype attached.
nmf_df <- as.data.frame(nmf.h)
colnames(nmf_df) <- c("dim1", "dim2")
nmf_df$subtype <- factor(covariates[rownames(nmf_df), ]$cms_label)

# Scatter plot of the two factors, colored by subtype.
ggplot2::ggplot(
  nmf_df,
  ggplot2::aes(x = dim1, y = dim2, color = subtype)
) +
  ggplot2::geom_point() +
  ggplot2::ggtitle("Scatterplot of 2-component NMF for multi-omics integration")

# Heatmap of the two factors (rows) across samples (columns).
pheatmap::pheatmap(
  t(nmf_df[, 1:2]),
  annotation_col = anno_col,
  show_colnames = FALSE,
  main = "Heatmap of 2-component NMF"
)

knitr::include_graphics("images/icluster.png")
# Fit iCluster+ on the three omics matrices (samples in rows after the
# transpose), one distribution type per data set, in the same order.
r.icluster <- iClusterPlus::iClusterPlus(
  t(x1), t(x2), t(x3),
  type = c("gaussian", "binomial", "multinomial"),
  K = 2,
  alpha = c(1, 1, 1),
  lambda = c(.03, .03, .03)
)

# Pieces of the fit that are used further down.
icluster.z <- r.icluster$meanZ
icluster.ws <- r.icluster$beta
icluster.clusters <- r.icluster$clusters

# Sample-level factors as a data frame, labelled with the sample ids of x1.
icp_df <- as.data.frame(icluster.z)
colnames(icp_df) <- c("dim1", "dim2")
rownames(icp_df) <- colnames(x1)
icp_df$subtype <- factor(covariates[rownames(icp_df), ]$cms_label)

# Scatter plot of the two factors, colored by subtype.
ggplot2::ggplot(
  icp_df,
  ggplot2::aes(x = dim1, y = dim2, color = subtype)
) +
  ggplot2::geom_point() +
  ggplot2::ggtitle("Scatter plot of iCluster+ factors")

# Heatmap of the two factors (rows) across samples (columns).
pheatmap::pheatmap(
  t(icp_df[, 1:2]),
  annotation_col = anno_col,
  show_colnames = FALSE,
  main = "Heatmap of iCluster+ factors"
)
# Assign every sample to the NMF factor on which it has the largest value.
# max.col() breaks ties at random by default, which makes the assignment of
# (nearly) tied samples differ between runs; ties.method = "first" makes it
# deterministic.
nmf.clusters <- max.col(nmf.h, ties.method = "first")
names(nmf.clusters) <- rownames(nmf.h)

# Per-sample annotation: NMF cluster next to the CMS subtype.
anno_nmf_cl <- data.frame(
  nmf.cluster = factor(nmf.clusters),
  cms.subtype = factor(covariates[rownames(nmf.h), ]$cms_label)
)

# Heatmap of the factors with samples ordered by cluster; clustering of rows
# and columns is switched off so that this ordering is kept.
pheatmap::pheatmap(
  t(nmf.h[order(nmf.clusters), ]),
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  annotation_col = anno_nmf_cl,
  show_colnames = FALSE,
  main = "Joint NMF factors with clusters and molecular subtypes"
)
# Label the iCluster+ factor matrix with the sample ids, then cluster the
# samples into two groups with k-means. This replaces the cluster vector taken
# from the iCluster+ fit.
# NOTE(review): kmeans() is not seeded here, so cluster labels may differ
# between runs.
rownames(icluster.z) <- rownames(covariates)
icluster.clusters <- kmeans(icluster.z, 2)$cluster
names(icluster.clusters) <- rownames(icluster.z)

# Per-sample annotation: k-means cluster next to the CMS subtype.
anno_icluster_cl <- data.frame(
  iCluster = factor(icluster.clusters),
  cms.subtype = factor(covariates$cms_label)
)

# Heatmap of the factors with samples ordered by cluster; clustering of rows
# and columns is switched off so that this ordering is kept.
pheatmap::pheatmap(
  t(icluster.z[order(icluster.clusters), ]),
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  show_colnames = FALSE,
  annotation_col = anno_icluster_cl,
  main = "iCluster factors with clusters and molecular subtypes"
)
# Feature-level NMF coefficients, one row per input feature.
nmfw <- t(nmf.w)

# Row annotation telling which omics type each feature comes from, in the same
# order in which the matrices were stacked for NMF.
data_anno <- data.frame(
  omics = rep(
    c("expression", "mut", "cnv"),
    times = c(nrow(x1), nrow(x2), nrow(x3.featnorm.frobnorm.nonneg))
  )
)
# Mutation features get a "mut:" prefix in their row names.
rownames(data_anno) <- c(
  rownames(x1),
  paste0("mut:", rownames(x2)),
  rownames(x3.featnorm.frobnorm.nonneg)
)
rownames(nmfw) <- rownames(data_anno)

pheatmap::pheatmap(
  nmfw,
  cluster_cols = FALSE,
  annotation_row = data_anno,
  main = "NMF coefficients",
  clustering_distance_rows = "manhattan",
  fontsize_row = 1
)
# GO enrichment of the genes associated with each NMF factor.
# library() is used instead of require() so that a missing enrichR package
# stops the script here instead of only returning FALSE.
library(enrichR)

# The first nrow(x1) rows of nmfw are the gene expression features.
# seq_len() is used so that an empty x1 yields no rows rather than c(1, 0).
expr_rows <- seq_len(nrow(x1))

# A gene belongs to the factor on which it has the strictly larger coefficient.
genes.factor.1 <- names(which(nmfw[expr_rows, 1] > nmfw[expr_rows, 2]))
genes.factor.2 <- names(which(nmfw[expr_rows, 1] < nmfw[expr_rows, 2]))

go.factor.1 <- enrichR::enrichr(
  genes.factor.1,
  databases = c("GO_Biological_Process_2018")
)$GO_Biological_Process_2018
go.factor.2 <- enrichR::enrichr(
  genes.factor.2,
  databases = c("GO_Biological_Process_2018")
)$GO_Biological_Process_2018

# Horizontal bar chart of -log10(adjusted p-value) for the first rows of the
# factor 1 result table.
ggplot2::ggplot(
  head(go.factor.1),
  ggplot2::aes(y = -log10(Adjusted.P.value), x = Term)
) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::coord_flip()
# Relate the NMF factors to clinical covariates.
# gender goes through as.factor() before as.numeric(): since R 4.0 read.csv()
# keeps strings as character, and as.numeric() on character labels would yield
# NA for every sample. For a column that is already a factor the result is
# unchanged (the integer level codes).
a <- data.frame(age = covariates$age,
                gender = as.numeric(as.factor(covariates$gender)),
                msi = covariates$msi)
b <- nmf.h
colnames(b) <- c("factor1", "factor2")
cov_factor <- cbind(a, b)

# One box per MSI status for each of the two factors.
ggplot2::ggplot(cov_factor, ggplot2::aes(x = msi, y = factor1, group = msi)) +
  ggplot2::geom_boxplot() +
  ggplot2::ggtitle("NMF factor 1 and microsatellite instability")
ggplot2::ggplot(cov_factor, ggplot2::aes(x = msi, y = factor2, group = msi)) +
  ggplot2::geom_boxplot() +
  ggplot2::ggtitle("NMF factor 2 and microsatellite instability")
# Exercise placeholder: to be completed with a function that splits every
# column of `x` into a non-negative "+" column and a non-negative "-" column.
# NOTE(review): the body is empty, so the function currently returns NULL; if
# this script is run top to bottom, this definition also replaces the
# split_neg_columns() defined earlier in the file -- confirm that is intended.
split_neg_columns <- function(x) {
    # your code here
}

# Check split_neg_columns() against a small hand-computed example.
test_split_neg_columns <- function() {
    signed <- as.data.frame(cbind(c(1, 2, 1), c(0, 1, -2)))
    wanted <- as.data.frame(
        cbind(c(1, 2, 1), c(0, 0, 0), c(0, 1, 0), c(0, 0, 2))
    )
    stopifnot(all(wanted == split_neg_columns(signed)))
}
test_split_neg_columns()
# Covariates (gender and CIMP status as factors) side by side with the two
# sample-level NMF factors.
a <- data.frame(
  age = covariates$age,
  gender = as.factor(covariates$gender),
  msi = covariates$msi,
  cimp = as.factor(covariates$cimp)
)
b <- nmf.h
colnames(b) <- c("factor1", "factor2")
cov_factor <- cbind(a, b)

# One box per CIMP status for each of the two factors.
ggplot2::ggplot(
  cov_factor,
  ggplot2::aes(x = cimp, y = factor1, group = cimp)
) +
  ggplot2::geom_boxplot() +
  ggplot2::ggtitle("NMF factor 1 and CIMP status")
ggplot2::ggplot(
  cov_factor,
  ggplot2::aes(x = cimp, y = factor2, group = cimp)
) +
  ggplot2::geom_boxplot() +
  ggplot2::ggtitle("NMF factor 2 and CIMP status")