-
Stege, Paul authoredStege, Paul authored
11_mimic_test_mimic.R 27.25 KiB
#=============================================================================================
#author Neeraj Kumar et al., 2021; https://github.com/ClavelLab/MiMiC
#date 02-2021
#adjusted by Paul Stege
#script Calculate minimal consorta (MiMiC script)
#software R ; R Core Team 2022
#package P.J. McMurdie and S. Holmes, 2012 ; phyloseq
# L. Lahti and S. Shetty, 2017 ; microbiome
# Raivo Kolde, 2019 ; pheatmap
#=============================================================================================
# description
#This script is used to calculate the minimal microbial consrotia for a metagenome based on the reference pfam vector of microbial species
#minimum number of species covering maximum number of metagenome
# usage
# Rscript --vanilla 11_mimic_s4_mimic.R -r root_dir -i iterationNumber -e exclude_species
start_time <- Sys.time()
#=============================================================================================
# install packages --------------------------------------------------------
paks <- c("tidyverse","ggsignif","cowplot","RColorBrewer", "ggcorrplot","vegan",
"ade4","colorspace","plyr","data.table","ggplot2","inflection","dplyr",
"readr","optparse", "gridExtra", "ComplexHeatmap", "RColorBrewer", "pheatmap",
"ggtext", "phyloseq", "microbiome")
paks<-lapply(paks, function(x) require(x, character.only = TRUE))
rm(paks)
# from Sudarshan A. Shetty, & Leo Lahti. (2020, October). microbiomeutilities: Utilities for Microbiome Analytics
func_aggregate_top_taxa2 <- function(x, top, level) {
x <- aggregate_taxa(x, level)
tops <- top_taxa(x, top)
tax <- tax_table(x)
inds <- which(!rownames(tax) %in% tops)
tax[inds, level] <- "Other"
tax_table(x) <- tax
tt <- tax_table(x)[, level]
tax_table(x) <- tax_table(tt)
aggregate_taxa(x, level)
}
# set variables ---------------------------------------------------------
#library("optparse")
# maiking option using package "optparse"
option_list = list(
make_option(c("-r", "--root_dir"), type="character", default="/home/",
help="directory for this project, containing all folders for reading and exporting data [default= %default]", metavar="character"),
make_option(c("-i", "--itNumber"), type="integer", default=10,
help="iteratioin number [default= %default]", metavar="number"),
make_option(c("-e", "--ex_spec"), type="character", default="NA",
help="a .txt-file containing single column with species to exclude from the minimal consortia. e.g. /spec_exclude.txt [default= %default]", metavar="character"),
make_option(c("-p", "--ex_pfam"), type="character", default="NA",
help="a .txt-file containing single column with pfam. Used in the mismatch penalty of mimic. e.g. /pfam_exclude.txt [default= %default]", metavar="character"))
opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)
# if (is.null(opt$file)){
# print_help(opt_parser)
# stop("At least one argument must be supplied (input file).n", call.=FALSE)
# }
# input tables------------------------------------------------------------------
#create DB to convert names to species
na.zero = function (x) {x[is.na(x)] = 0 ;return(x) }
Minimal_consortia_size=opt$itNumber
paste0(opt$root_dir,"/08_mimic_output/MiMiC_final_output_stats_it",opt$itNumber,".tab")
finalStats = data.table::fread(paste0(opt$root_dir,"/08_mimic_output/MiMiC_final_output_stats_it",opt$itNumber,".tab"),
header = TRUE ,stringsAsFactors = F,check.names = F, fill=TRUE, sep = "\t")
colnames(finalStats)
#create DB to convert names to species
species_meta = data.table::fread(paste0(opt$root_dir,"/07_spec_genomes/species_meta.txt"),
header = F ,stringsAsFactors = F,check.names = F, fill=TRUE,
sep = "\t")
species_meta = str_split_fixed(species_meta$V1, " ", 2)
species_meta=species_meta[!apply(species_meta == "", 1, all),] # remove empty rows
colnames(species_meta)=c("BacterialGenome","Species")
species_meta = as.data.frame(species_meta)
species_meta$BacterialGenome=trimws(species_meta$BacterialGenome, which="both") #remove leading/trailing spaces
species_meta=species_meta[which(duplicated(species_meta$BacterialGenome)==FALSE),] # remove duplicates
species_meta=species_meta[which(duplicated(species_meta$Species)==FALSE),] # remove duplicates
species_meta=dplyr::select(species_meta,BacterialGenome, Species)
species_meta[which(duplicated(species_meta$Species) == T), ]
MetaGenome <- data.table::fread(paste0(opt$root_dir,"/08_mimic_output/pfam_parsed_table_meta"),
sep = "\t",
header = TRUE ,
stringsAsFactors = FALSE,
check.names = FALSE)
MetaGenome <- as.data.frame(MetaGenome)
# option to exclude pfam # does not mean species are excluded if they contain these genes
# instead, these pfams are excluded from those that needed to be present, therefore adding to the mismatch score
if (opt$ex_pfam == "NA") {
# if no input has been given for pfam exclusion
print("no pfam list detected")
} else {
# in case a list is supplied with pfams to exclude
pfam_exclude = data.table::fread(paste0(opt$root_dir,opt$ex_pfam),
sep = NULL ,header=F,stringsAsFactors=F) %>% as.data.frame()
MetaGenome=MetaGenome[!MetaGenome$PfamID %in% pfam_exclusion[,1], ]
}
# continue formatting metagenome vector file
colnames(MetaGenome)=gsub(".faa.*","",colnames(MetaGenome))
row.names(MetaGenome) <- MetaGenome$PfamID # to make sure if rows name remain same
MetaGenome$PfamID <- NULL
MetaGenome=MetaGenome[,which(colSums(MetaGenome)>1000)] # remove low variance samples
# reading genome vector file
GenomeVector <- data.table::fread(paste0(opt$root_dir,"/08_mimic_output/pfam_parsed_table_single"),
sep = "\t",
header = T,
stringsAsFactors = FALSE)
GenomeVector <- as.data.frame(GenomeVector)
colnames(GenomeVector)=gsub(".faa.*","",colnames(GenomeVector))
# option to exclude species
if (opt$ex_spec == "NA") {
# if no input has been given for species exclusion
print("no species excluded from minimal consortia")
} else {
# in case a list is supplied with species to exclude
spec_exclude = data.table::fread(paste0(opt$root_dir,opt$ex_spec),
sep = NULL ,header=F,stringsAsFactors=F) %>% as.data.frame()
spec_exclude = dplyr::filter(species_meta,grepl(paste(spec_exclude[,1],collapse="|"),
Species, ignore.case=TRUE)) # covert species ID
GenomeVector=GenomeVector[,
-grep(paste(spec_exclude$BacterialGenome,collapse="|"),
colnames(GenomeVector))]
}
row.names(GenomeVector) <- GenomeVector$PfamID # skip if row names alreay assigned and PfamID is not available
GenomeVector$PfamID <- NULL
# Functions ---------------------------------------------------------------
na.zero = function (x) {x[is.na(x)] = 0 ;return(x) }
# MiMiC function to calculate minimal microbial consortia
#this function compare the genome vector profile to the metagenome vector profile
scoreDiff_weighted <- function(genomes, metagenome,Minimal_consortia_size){
g <- as.matrix(genomes)
y <- as.matrix(metagenome)
mscore <- sum(y)
mscore1 <- sum(y)
res <- NULL
for(i in 1:Minimal_consortia_size){
matchCount <- apply(g, 2, function(x) sum(y[x >= 1.0])) # ---> only count of Pfam match which present in genome and not in metagenome
totalpfamG <- apply(g,2,function(x) sum(x>=1)) # total number of Pfam present in microbial genome
misMatchCount <- totalpfamG - matchCount # number Of Pfams which are present in genome but not in Metagenome.
matchratio <- ((matchCount)/(matchCount+misMatchCount)) # (x)/(x+y) # ---> mimic_B
matchratio[matchratio=="NaN"]<- 0 # because of some values become 0, when 0/0 it gives NaN
bestHit <- matchCount[names(matchratio[matchratio==max(matchratio)])] #---> genome with maximum match count and maximum match mismatch ratio
SelectedBestHit <- names(bestHit[bestHit==max(bestHit)])[1] #---> names of the genome to be selected
print(paste ("number of itreation" ,i,sep="=")) #---> printing number of iteration
a<-g[,SelectedBestHit] #--> selected genome vector
ascore <- bestHit[bestHit==max(bestHit)][1] # score
pfamInGenomePerItration = unname(totalpfamG[SelectedBestHit]) # Total number of pfam used in the comparison
exclusiveMatch <- unname(matchCount[SelectedBestHit]) # number of Pfam match to metagenome
exclusiveMismatchPerIteration <- unname(misMatchCount[SelectedBestHit]) # number of Pfam that did not match to the metagenome
print(paste("pfam",pfamInGenomePerItration,"exclusiveMatch",exclusiveMatch,"exclusiveMismatchPerIteration",
exclusiveMismatchPerIteration,sep = "=")) # Printing the summary on screen at each iteration
#print the genome with the name
#print (ascore)
#unname(ascore)
b <- as.character(SelectedBestHit) # name of the selected genome
print(b)
g <-g[,-which(colnames(g)==b),drop=F] # exclude selected genome.
if(i==1){
covered <- sum(y[a>=1])
rscore <- sum(y[a>=1])
percent <- (covered/mscore)*100
percent_nw = (rscore/mscore)*100 # pstege; report novel functions in percentage
#uncovered<-(mscore-covered)
#uncovered_Percentage <-(uncovered/mscore)
} else {
rscore <- sum(y[a>=1])
covered <- covered+rscore # cumulative Pfam number covered in metagenome
percent <- (covered/mscore)*100 # cumulative percentage of Pfam covered in metagenome
percent_nw = (rscore/mscore)*100 # pstege; report novel functions in percentage
#uncovered<-(mscore-covered)
#uncovered_Percentage <-(uncovered/mscore)
}
test <- names(y[a>=1,]) # Pfams IDs common in genome and metagenome
#print(g)
g[test,] <- 0 # making all Pfam as "zero" in reference genome set ( those function were already taken )
y[a>=1]<-0 # makes all metagenome matching with highest scoring genome as '0'
mscore1 <- sum(y) #number of pfam in metagenome
if(rscore==0){break} # if no pfam match found further, then stop the script
#RelativeMatchPerIteration=(rscore / exclusiveMismatchPerIteration)
res <- rbind(res,cbind.data.frame(bname=SelectedBestHit,
percentage=percent,score=rscore,ScorePerIteration=ascore,
pfamInGenomePerItration=pfamInGenomePerItration,
NovelFunctPerc=percent_nw,
#RelativeMatchPerIteration=RelativeMatchPerIteration,
exclusiveMismatchPerIteration=exclusiveMismatchPerIteration)) # making output dataframe
}
return(res)
}
#compare reference genome to metaegenome
scorePfam <- function(genomes, metagenome){
g <- as.matrix(genomes)
y <- metagenome }
# from Sudarshan A. Shetty, & Leo Lahti. (2020, October). microbiomeutilities: Utilities for Microbiome Analytics
func_aggregate_top_taxa2 <- function(x, top, level) {
x <- aggregate_taxa(x, level)
tops <- top_taxa(x, top)
tax <- tax_table(x)
inds <- which(!rownames(tax) %in% tops)
tax[inds, level] <- "Other"
tax_table(x) <- tax
tt <- tax_table(x)[, level]
tax_table(x) <- tax_table(tt)
aggregate_taxa(x, level)
}
# Total number of pfam and match to metagenome ----------------------------
#total number of pfam in genome and metagenome
# GenomeBinary <- GenomeVector
# row.names(GenomeBinary) <- GenomeBinary$PfamID # if coulmn name has the pfam ids and rownames does not exists.
# GenomeBinary$PfamID <- NULL
colnames(GenomeVector)
colnames(MetaGenome)
total <- cbind(GenomeVector,MetaGenome)
total <- as.data.frame(colSums(total)) #skip description folder.and sums up all pfam for each genome.
colnames(total) <- "PfamTotal" #change name of columns # total number of pfams in each genome and metagenome.
total$genomes <- rownames(total) #adding column name to total score file.
rownames(total) <- NULL
temp <- cbind.data.frame(old=c(total$genomes),neu=
c(rep("BacterialGenome",ncol(GenomeVector)),
rep("MetaGenome",ncol(MetaGenome)))) #taging bacterial genome and metaGenome
total$name<- as.character(temp[match(total$genomes,temp$old),2]) # number of in all genomes and metagenome
# summary ---------------------------------------------
# extract part final stats in summary file
stats_summary = dplyr::select(finalStats,MetaGenome,Species, Percentage, NovelMatch, AbsoluteMatch, CumNovelMatch)
colnames(stats_summary)=c("MetaGenome","Species","CumuPercentage","NovelMatch","AbsoluteMatch","CumuNovelMatch")
# Minimal consortia final selection ------------------------------------------
Minimal_consortia_size
#calc abund as sum of highest val in cum novel match
df_abund= aggregate(stats_summary$CumuNovelMatch,
list(stats_summary$MetaGenome), FUN=max) # calc pfam per metagenome
colnames(df_abund)=c("MetaGenome","sample_sum_pfam")
df_abund=plyr::join(dplyr::select(stats_summary,MetaGenome,Species,NovelMatch),
df_abund,by="MetaGenome",type="left")
df_abund$rel_score=df_abund$NovelMatch/df_abund$sample_sum_pfam # normalize novel match score by pfam per metagenome
#prev filter
filter_prev = table(df_abund$Species) %>% as.data.frame()
prevThreshold = 0.25* length(unique(df_abund$MetaGenome)) # variable prevalence threshold
filter_prev = filter_prev[,"Var1"][(filter_prev$Freq >= prevThreshold)] %>% as.character()
#leave one out
df_specab = dplyr::select(df_abund,Species,rel_score)
df_specab = dplyr::group_by(df_specab,
Species) %>% dplyr::summarise_all(median) # take sum of duplicate values
df_specab=df_specab[df_specab$Species %in% filter_prev, ] # select only prev filtered
while (length(df_specab$Species) > Minimal_consortia_size) {
lowest_hit = df_specab[which.min(df_specab$rel_score),]
lowest_hit <- as.character(lowest_hit$Species) # name of the selected genome
df_specab=df_specab[!df_specab$Species %in% lowest_hit, ] # exclude selected genome.
}
# heatmap plot -----------------------
#Plot the percentage novel functions per metagenome (x) and genome (y)
phyl_plot = dplyr::select(finalStats,MetaGenome,Species, NovelFunctPerc)
phyl_plot = spread(phyl_plot,MetaGenome,NovelFunctPerc)
phyl_plot = na.zero(phyl_plot) #convert NA to zero
plot_height=round((length(rownames(phyl_plot))/9)+2.5,digits=1)
plot_width=round((length(colnames(phyl_plot))/9)+6,digits=1)
rownames(phyl_plot) = phyl_plot$Species
phyl_plot$Species= NULL
phyl_plot= as.matrix(phyl_plot,rownames.value=rownames(phyl_plot))
#phyl_plot = t(scale(t(phyl_plot))) # Scale by row manually
cols = colorRampPalette(c("white","#F0E442","darkorange","red2","darkred"))
cols = cols(10)
# heatmap without annotation
p2=pheatmap(phyl_plot, labels_row=rownames(phyl_plot), border_color=F, color=cols,
show_rownames=TRUE)
ggsave(paste0(opt$root_dir,"/08_mimic_output/heatmap_species_perc_novel_funct.tiff"),
plot=p2, units="in", width=plot_width, height=plot_height, dpi=300, compression = 'lzw')
# heatmap row annotation
myRows <- df_specab$Species # rows to highlight
row_idx <- which(rownames(phyl_plot) %in% myRows) # font/styling settings
fontsizes <- rep(10, nrow(phyl_plot))
fontsizes[row_idx] <- 10
fontcolors <- rep('black', nrow(phyl_plot))
fontcolors[row_idx] <- 'black'
fontfaces <- rep('plain',nrow(phyl_plot))
fontfaces[row_idx] <- 'bold'
rowAnno <- rowAnnotation(rows=anno_text(rownames(phyl_plot),
gp=gpar(fontsize=fontsizes, fontface = fontfaces,
col = fontcolors))) # Create text annotation
# heatmap boxes around our rows
dend <- reorder(as.dendrogram(hclust(dist(phyl_plot))),
-rowMeans(phyl_plot), agglo.FUN = mean) # create our own row dendrogram
dend_idx <- which(order.dendrogram(dend) %in% which(
rownames(phyl_plot) %in% myRows)) # Find rows in dendrogram
btm <- 1 - (dend_idx / nrow(phyl_plot)) # save position rows
top <- btm + (1/nrow(phyl_plot))
box_col <- 'black'
box_width <- 2
# draw heatmap
tiff(filename = paste0(opt$root_dir,"/08_mimic_output/heatmap_species_perc_novel_funct_rowanno.tiff"),
units="in", width=plot_width, height=plot_height, compression='lzw', res=300)
Heatmap(phyl_plot, name="ht",cluster_rows = dend,
right_annotation = rowAnno,col=cols,
show_row_names = FALSE)
decorate_heatmap_body("ht", { for (i in 1:length(myRows)) {
grid.lines(c(0, 1), c(top[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
grid.lines(c(0, 1), c(btm[i],btm[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
grid.lines(c(0, 0), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
grid.lines(c(1, 1), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
}
})
dev.off()
# Recalculate MM consortia using MiMiC function -------------------------
mimic_recalc = plyr::join(df_specab, species_meta, by="Species", type="left") # subset genome df
mimic_recalc = GenomeVector[,which(colnames(GenomeVector) %in% mimic_recalc$BacterialGenome)]
recalc_PfamTable <- as.data.frame(apply(as.data.frame(MetaGenome),2,
function(x) scorePfam(as.data.frame(mimic_recalc),
x))) #adding score to pfam table
recalc_PfamTable$BacterialGenome <- rownames(recalc_PfamTable)
mimic_recalc = apply(as.data.frame(MetaGenome),
2,function(x) scoreDiff_weighted(as.data.frame(mimic_recalc),
x,Minimal_consortia_size)) # making a list of the mimic calculation
mimic_recalc = ldply(mimic_recalc, data.frame) # converting list into dataframe
names(mimic_recalc) <- c("MetaGenome","BacterialGenome","Percentage","Score","ScorePerIteration",
"pfamInGenomePerItration","NovelMatchPer","exclusiveMismatchPerIteration") #naming the column in the dataframe
mimic_recalc = mimic_recalc %>%
dplyr::group_by(MetaGenome) %>%
dplyr::mutate(Index=1:length(MetaGenome)) # adding the rank of species in metagenome
#To plot metagenome coverage
p1 <- ggplot(mimic_recalc,aes(Index,Percentage,group=MetaGenome,color=MetaGenome))+
geom_point()+ geom_line()+
scale_x_continuous(breaks = seq(0, Minimal_consortia_size, by = 2))+
theme(legend.position = "none")+
ylab("Functions covered (%)") + xlab("Bacterial species index")# plot for pfam coverage of metagenome by minimal consortia
ggsave(paste0(opt$root_dir,"/08_mimic_output/pfam_coverage_plot_mmconsortia.tiff"),
plot=p1, units="in", width=14, height=8, dpi=300, compression = 'lzw')
#continue collecting table
mimic_recalc = dplyr::left_join(mimic_recalc,total[,c("PfamTotal","genomes")], by=c("BacterialGenome" = "genomes")) %>%
dplyr::left_join(.,reshape2::melt(recalc_PfamTable,
id.vars="BacterialGenome",
value.name="AbsoluteMatch",variable.name="MetaGenome"),
by=c("BacterialGenome","MetaGenome")) %>%
dplyr::rename(NovelMatch = Score, BactTotalPfam = PfamTotal) %>%
dplyr::group_by(MetaGenome) %>%
dplyr::mutate(AbsoluteMismatch = BactTotalPfam - AbsoluteMatch,
CumAbsoluteMismatch = cumsum(AbsoluteMismatch),
#AbsoluteRelativeMatch = AbsoluteMatch/AbsoluteMismatch,
CumNovelMatch = cumsum(NovelMatch)
#CumAbsoluteRelativeMatch = cumsum(AbsoluteRelativeMatch),
#CumExclusiveMismatchPerIteration = cumsum(exclusiveMismatchPerIteration),
#CumIterationRelativeMatch = cumsum(RelativeMatchPerIteration),
) %>% ungroup()
# add species names
mimic_recalc$BacterialGenome = sub('^([^.]+.[^.]+).*', '\\1',mimic_recalc$BacterialGenome)
mimic_recalc= plyr::join(mimic_recalc,species_meta, by="BacterialGenome", type="left") # add species names
# heatmap plot minimal consortia -----------------------
#Plot the percentage novel functions.
phyl_plot = dplyr::select(mimic_recalc,MetaGenome,Species, NovelMatchPer)
phyl_plot = spread(phyl_plot,MetaGenome,NovelMatchPer)
phyl_plot = na.zero(phyl_plot) #convert NA to zero
plot_height=round((length(rownames(phyl_plot))/10)+2.5,digits=1)
plot_width=round((length(colnames(phyl_plot))/12)+2,digits=1)
rownames(phyl_plot) = phyl_plot$Species
phyl_plot$Species= NULL
phyl_plot= as.matrix(phyl_plot)
#phyl_plot = t(scale(t(phyl_plot))) # Scale by row manually
cols = colorRampPalette(c("white","#F0E442","darkorange","red2","darkred"))
cols = cols(10)
# heatmap row annotation
myRows <- df_specab$Species # rows to highlight
row_idx <- which(rownames(phyl_plot) %in% myRows) # font/styling settings
fontsizes <- rep(10, nrow(phyl_plot))
fontsizes[row_idx] <- 10
fontcolors <- rep('black', nrow(phyl_plot))
fontcolors[row_idx] <- 'black'
fontfaces <- rep('plain',nrow(phyl_plot))
fontfaces[row_idx] <- 'plain'
rowAnno <- rowAnnotation(rows=anno_text(rownames(phyl_plot),
gp=gpar(fontsize=fontsizes, fontface = fontfaces,
col = fontcolors))) # Create text annotation
# heatmap boxes around our rows
dend <- reorder(as.dendrogram(hclust(dist(phyl_plot))),
-rowMeans(phyl_plot), agglo.FUN = mean) # create our own row dendrogram
dend_idx <- which(order.dendrogram(dend) %in% which(
rownames(phyl_plot) %in% myRows)) # Find rows in dendrogram
btm <- 1 - (dend_idx / nrow(phyl_plot)) # save position rows
top <- btm + (1/nrow(phyl_plot))
box_col <- 'black'
box_width <- 1
# draw heatmap
tiff(filename = paste0(opt$root_dir,"/08_mimic_output/heatmap_mmconsort_perc_novel_funct.tiff"),
units="in", width=plot_width, height=plot_height, compression='lzw', res=300)
Heatmap(phyl_plot, name="ht",cluster_rows = dend,
right_annotation = rowAnno,col=cols,
show_column_names = FALSE, column_title = "Metagenomic samples",
column_title_side = c("bottom"),
show_row_names = FALSE)
decorate_heatmap_body("ht", { for (i in 1:length(myRows)) {
grid.lines(c(0, 1), c(top[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
grid.lines(c(0, 1), c(btm[i],btm[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
grid.lines(c(0, 0), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
grid.lines(c(1, 1), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
}
})
dev.off()
# prepare environment for Kofam part -----------------------
dir.create(paste0(opt$root_dir,"/12_kofam_anno"))
dir.create(paste0(opt$root_dir,"/12_kofam_anno/single"))
dir.create(paste0(opt$root_dir,"/12_kofam_anno/meta"))
kofam = list.files(path=paste0(opt$root_dir,"/08_mimic_output/output_faa_single"),
pattern = ".faa") # list samples
write.table(kofam,file=paste0(opt$root_dir,"/12_kofam_anno/single/all_samples.tab"),
sep = "\t",row.names=FALSE, quote=FALSE, col.names=F) # export file
kofam = list.files(path=paste0(opt$root_dir,"/08_mimic_output/output_faa_meta"),
pattern = ".faa") # list samples
write.table(kofam,file=paste0(opt$root_dir,"/12_kofam_anno/meta/all_samples.tab"),
sep = "\t",row.names=FALSE, quote=FALSE, col.names=F) # export file
# Plot relative abundance species annotated -------------------------------------
#Plot species grouped by feed
phyl_plot = readRDS(paste(opt$root_dir,"06_taxa_output",
"physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/"))
phyl_plot = prune_taxa(taxa_sums(phyl_plot) > 1, phyl_plot) #remove taxa lacking reads
#Merge species and genus name
tax_table(phyl_plot)[,"Species"] = paste(tax_table(phyl_plot)[,"Genus"],
tax_table(phyl_plot)[,"Species"],
sep="") #paste genus name to species one
tax_table(phyl_plot)[,"Species"] = gsub("g_", "",tax_table(phyl_plot)[,"Species"])
tax_table(phyl_plot)[,"Species"] = gsub("s_", " ",tax_table(phyl_plot)[,"Species"])
taxa_names(phyl_plot) = tax_table(phyl_plot)[,"Species"] # rename identifyer by species
#select settings
phyl_plot = func_aggregate_top_taxa2(phyl_plot, "Species", top = 12) #reduce to species
phyl_plot = microbiome::transform(phyl_plot, "compositional") #compositional
mm_taxa = sub("^(\\S*\\s+\\S+).*", "\\1", df_specab$Species) # extract species MM consortia
taxa_names(phyl_plot) = sapply(tax_table(phyl_plot)[,"Species"],
function(name){ifelse(name %in% mm_taxa,
paste0("**",name,"**"),name)},
USE.NAMES = FALSE) # highlight names MM consortia in bold
order = names(sort(taxa_sums(phyl_plot), F)[1:13])
order = order[order != "Other"]
order = c("Other",order)
# plotting
plot_composition(phyl_plot,
sample.sort = NULL,
otu.sort=order, x.label="Sample") +
geom_bar(position="stack", stat="identity", color=NA)+
theme_bw() +
labs(title="Top 12 abundant bacterial species")+
ylab( "Abundance (%)")+ xlab( "Sample ID")+
theme(legend.justification = "top",
legend.text = element_markdown(), # required to highlight MM consortia in bold
legend.title = element_blank(),
legend.key.size = unit(4.5, 'mm'),
plot.title = element_text(size = 12),
axis.title.y = element_text(color="Grey1", size=13, face="bold"),
axis.text.y =element_text(color="Grey1", size=11),
axis.text.x =element_text(color="Grey1", size=9, angle = 90, vjust = 0.5),
axis.title.x = element_text(color="Grey1", size = 13, face="bold"),
panel.grid.major = element_blank(),
axis.ticks = element_blank())
ggsave(paste(opt$root_dir,"06_taxa_output","abund_kraken_top12_spec_annot.tiff",sep="/"),
units="in", width=18, height=8, dpi=300, compression = 'lzw')
cat("--------------- R script completed ---------------")
cat("\n\n")
end_time <- Sys.time()
cat("Duration of :")
end_time - start_time
cat("\n\n")
cat("--------------- Bacteria of interest ---------------")
cat("\n\n")
cat(paste0("aimed for minimal consortia size of: ",Minimal_consortia_size))
cat("\n")
cat(paste0("resulted in consortia size of : ",length(unique(df_specab$Species))))
cat("\n\n")
print(unique(df_specab$Species))
cat("\n\n")
cat("Please open the MiMiC_summary-report to interpret the significance")
cat("\n")