Skip to content
Snippets Groups Projects
Commit 324ec9ea authored by Stege, Paul's avatar Stege, Paul
Browse files

Upload New File

parent bd8d0481
Branches
No related tags found
No related merge requests found
#=============================================================================================
#author Neeraj Kumar et al., 2021; https://github.com/ClavelLab/MiMiC
#date 02-2021
#adjusted by Paul Stege
#script Calculate minimal consorta (MiMiC script)
#software R ; R Core Team 2022
#package P.J. McMurdie and S. Holmes, 2012 ; phyloseq
# L. Lahti and S. Shetty, 2017 ; microbiome
# Raivo Kolde, 2019 ; pheatmap
#=============================================================================================
# description
#This script is used to calculate the minimal microbial consrotia for a metagenome based on the reference pfam vector of microbial species
#minimum number of species covering maximum number of metagenome
# usage
# Rscript --vanilla 11_mimic_s4_mimic.R -r root_dir -i iterationNumber -e exclude_species
start_time <- Sys.time()
#=============================================================================================
# install packages --------------------------------------------------------
paks <- c("tidyverse","ggsignif","cowplot","RColorBrewer", "ggcorrplot","vegan",
"ade4","colorspace","plyr","data.table","ggplot2","inflection","dplyr",
"readr","optparse", "gridExtra", "ComplexHeatmap", "RColorBrewer", "pheatmap",
"ggtext", "phyloseq", "microbiome")
paks<-lapply(paks, function(x) require(x, character.only = TRUE))
rm(paks)
# from Sudarshan A. Shetty, & Leo Lahti. (2020, October). microbiomeutilities: Utilities for Microbiome Analytics
func_aggregate_top_taxa2 <- function(x, top, level) {
x <- aggregate_taxa(x, level)
tops <- top_taxa(x, top)
tax <- tax_table(x)
inds <- which(!rownames(tax) %in% tops)
tax[inds, level] <- "Other"
tax_table(x) <- tax
tt <- tax_table(x)[, level]
tax_table(x) <- tax_table(tt)
aggregate_taxa(x, level)
}
# set variables ---------------------------------------------------------
#library("optparse")
# maiking option using package "optparse"
option_list = list(
make_option(c("-r", "--root_dir"), type="character", default="/home/",
help="directory for this project, containing all folders for reading and exporting data [default= %default]", metavar="character"),
make_option(c("-i", "--itNumber"), type="integer", default=10,
help="iteratioin number [default= %default]", metavar="number"),
make_option(c("-e", "--ex_spec"), type="character", default="NA",
help="a .txt-file containing single column with species to exclude from the minimal consortia. e.g. /spec_exclude.txt [default= %default]", metavar="character"),
make_option(c("-p", "--ex_pfam"), type="character", default="NA",
help="a .txt-file containing single column with pfam. Used in the mismatch penalty of mimic. e.g. /pfam_exclude.txt [default= %default]", metavar="character"))
opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)
# if (is.null(opt$file)){
# print_help(opt_parser)
# stop("At least one argument must be supplied (input file).n", call.=FALSE)
# }
# input tables------------------------------------------------------------------
#create DB to convert names to species
na.zero = function (x) {x[is.na(x)] = 0 ;return(x) }
Minimal_consortia_size=opt$itNumber
paste0(opt$root_dir,"/08_mimic_output/MiMiC_final_output_stats_it",opt$itNumber,".tab")
finalStats = data.table::fread(paste0(opt$root_dir,"/08_mimic_output/MiMiC_final_output_stats_it",opt$itNumber,".tab"),
header = TRUE ,stringsAsFactors = F,check.names = F, fill=TRUE, sep = "\t")
colnames(finalStats)
#create DB to convert names to species
species_meta = data.table::fread(paste0(opt$root_dir,"/07_spec_genomes/species_meta.txt"),
header = F ,stringsAsFactors = F,check.names = F, fill=TRUE,
sep = "\t")
species_meta = str_split_fixed(species_meta$V1, " ", 2)
species_meta=species_meta[!apply(species_meta == "", 1, all),] # remove empty rows
colnames(species_meta)=c("BacterialGenome","Species")
species_meta = as.data.frame(species_meta)
species_meta$BacterialGenome=trimws(species_meta$BacterialGenome, which="both") #remove leading/trailing spaces
species_meta=species_meta[which(duplicated(species_meta$BacterialGenome)==FALSE),] # remove duplicates
species_meta=species_meta[which(duplicated(species_meta$Species)==FALSE),] # remove duplicates
species_meta=dplyr::select(species_meta,BacterialGenome, Species)
species_meta[which(duplicated(species_meta$Species) == T), ]
MetaGenome <- data.table::fread(paste0(opt$root_dir,"/08_mimic_output/pfam_parsed_table_meta"),
sep = "\t",
header = TRUE ,
stringsAsFactors = FALSE,
check.names = FALSE)
MetaGenome <- as.data.frame(MetaGenome)
# option to exclude pfam # does not mean species are excluded if they contain these genes
# instead, these pfams are excluded from those that needed to be present, therefore adding to the mismatch score
if (opt$ex_pfam == "NA") {
# if no input has been given for pfam exclusion
print("no pfam list detected")
} else {
# in case a list is supplied with pfams to exclude
pfam_exclude = data.table::fread(paste0(opt$root_dir,opt$ex_pfam),
sep = NULL ,header=F,stringsAsFactors=F) %>% as.data.frame()
MetaGenome=MetaGenome[!MetaGenome$PfamID %in% pfam_exclusion[,1], ]
}
# continue formatting metagenome vector file
colnames(MetaGenome)=gsub(".faa.*","",colnames(MetaGenome))
row.names(MetaGenome) <- MetaGenome$PfamID # to make sure if rows name remain same
MetaGenome$PfamID <- NULL
MetaGenome=MetaGenome[,which(colSums(MetaGenome)>1000)] # remove low variance samples
# reading genome vector file
GenomeVector <- data.table::fread(paste0(opt$root_dir,"/08_mimic_output/pfam_parsed_table_single"),
sep = "\t",
header = T,
stringsAsFactors = FALSE)
GenomeVector <- as.data.frame(GenomeVector)
colnames(GenomeVector)=gsub(".faa.*","",colnames(GenomeVector))
# option to exclude species
if (opt$ex_spec == "NA") {
# if no input has been given for species exclusion
print("no species excluded from minimal consortia")
} else {
# in case a list is supplied with species to exclude
spec_exclude = data.table::fread(paste0(opt$root_dir,opt$ex_spec),
sep = NULL ,header=F,stringsAsFactors=F) %>% as.data.frame()
spec_exclude = dplyr::filter(species_meta,grepl(paste(spec_exclude[,1],collapse="|"),
Species, ignore.case=TRUE)) # covert species ID
GenomeVector=GenomeVector[,
-grep(paste(spec_exclude$BacterialGenome,collapse="|"),
colnames(GenomeVector))]
}
row.names(GenomeVector) <- GenomeVector$PfamID # skip if row names alreay assigned and PfamID is not available
GenomeVector$PfamID <- NULL
# Functions ---------------------------------------------------------------
na.zero = function (x) {x[is.na(x)] = 0 ;return(x) }
# MiMiC function to calculate minimal microbial consortia
#this function compare the genome vector profile to the metagenome vector profile
scoreDiff_weighted <- function(genomes, metagenome,Minimal_consortia_size){
g <- as.matrix(genomes)
y <- as.matrix(metagenome)
mscore <- sum(y)
mscore1 <- sum(y)
res <- NULL
for(i in 1:Minimal_consortia_size){
matchCount <- apply(g, 2, function(x) sum(y[x >= 1.0])) # ---> only count of Pfam match which present in genome and not in metagenome
totalpfamG <- apply(g,2,function(x) sum(x>=1)) # total number of Pfam present in microbial genome
misMatchCount <- totalpfamG - matchCount # number Of Pfams which are present in genome but not in Metagenome.
matchratio <- ((matchCount)/(matchCount+misMatchCount)) # (x)/(x+y) # ---> mimic_B
matchratio[matchratio=="NaN"]<- 0 # because of some values become 0, when 0/0 it gives NaN
bestHit <- matchCount[names(matchratio[matchratio==max(matchratio)])] #---> genome with maximum match count and maximum match mismatch ratio
SelectedBestHit <- names(bestHit[bestHit==max(bestHit)])[1] #---> names of the genome to be selected
print(paste ("number of itreation" ,i,sep="=")) #---> printing number of iteration
a<-g[,SelectedBestHit] #--> selected genome vector
ascore <- bestHit[bestHit==max(bestHit)][1] # score
pfamInGenomePerItration = unname(totalpfamG[SelectedBestHit]) # Total number of pfam used in the comparison
exclusiveMatch <- unname(matchCount[SelectedBestHit]) # number of Pfam match to metagenome
exclusiveMismatchPerIteration <- unname(misMatchCount[SelectedBestHit]) # number of Pfam that did not match to the metagenome
print(paste("pfam",pfamInGenomePerItration,"exclusiveMatch",exclusiveMatch,"exclusiveMismatchPerIteration",
exclusiveMismatchPerIteration,sep = "=")) # Printing the summary on screen at each iteration
#print the genome with the name
#print (ascore)
#unname(ascore)
b <- as.character(SelectedBestHit) # name of the selected genome
print(b)
g <-g[,-which(colnames(g)==b),drop=F] # exclude selected genome.
if(i==1){
covered <- sum(y[a>=1])
rscore <- sum(y[a>=1])
percent <- (covered/mscore)*100
percent_nw = (rscore/mscore)*100 # pstege; report novel functions in percentage
#uncovered<-(mscore-covered)
#uncovered_Percentage <-(uncovered/mscore)
} else {
rscore <- sum(y[a>=1])
covered <- covered+rscore # cumulative Pfam number covered in metagenome
percent <- (covered/mscore)*100 # cumulative percentage of Pfam covered in metagenome
percent_nw = (rscore/mscore)*100 # pstege; report novel functions in percentage
#uncovered<-(mscore-covered)
#uncovered_Percentage <-(uncovered/mscore)
}
test <- names(y[a>=1,]) # Pfams IDs common in genome and metagenome
#print(g)
g[test,] <- 0 # making all Pfam as "zero" in reference genome set ( those function were already taken )
y[a>=1]<-0 # makes all metagenome matching with highest scoring genome as '0'
mscore1 <- sum(y) #number of pfam in metagenome
if(rscore==0){break} # if no pfam match found further, then stop the script
#RelativeMatchPerIteration=(rscore / exclusiveMismatchPerIteration)
res <- rbind(res,cbind.data.frame(bname=SelectedBestHit,
percentage=percent,score=rscore,ScorePerIteration=ascore,
pfamInGenomePerItration=pfamInGenomePerItration,
NovelFunctPerc=percent_nw,
#RelativeMatchPerIteration=RelativeMatchPerIteration,
exclusiveMismatchPerIteration=exclusiveMismatchPerIteration)) # making output dataframe
}
return(res)
}
#compare reference genome to metaegenome
scorePfam <- function(genomes, metagenome){
g <- as.matrix(genomes)
y <- metagenome }
# from Sudarshan A. Shetty, & Leo Lahti. (2020, October). microbiomeutilities: Utilities for Microbiome Analytics
func_aggregate_top_taxa2 <- function(x, top, level) {
x <- aggregate_taxa(x, level)
tops <- top_taxa(x, top)
tax <- tax_table(x)
inds <- which(!rownames(tax) %in% tops)
tax[inds, level] <- "Other"
tax_table(x) <- tax
tt <- tax_table(x)[, level]
tax_table(x) <- tax_table(tt)
aggregate_taxa(x, level)
}
# Total number of pfam and match to metagenome ----------------------------
#total number of pfam in genome and metagenome
# GenomeBinary <- GenomeVector
# row.names(GenomeBinary) <- GenomeBinary$PfamID # if coulmn name has the pfam ids and rownames does not exists.
# GenomeBinary$PfamID <- NULL
colnames(GenomeVector)
colnames(MetaGenome)
total <- cbind(GenomeVector,MetaGenome)
total <- as.data.frame(colSums(total)) #skip description folder.and sums up all pfam for each genome.
colnames(total) <- "PfamTotal" #change name of columns # total number of pfams in each genome and metagenome.
total$genomes <- rownames(total) #adding column name to total score file.
rownames(total) <- NULL
temp <- cbind.data.frame(old=c(total$genomes),neu=
c(rep("BacterialGenome",ncol(GenomeVector)),
rep("MetaGenome",ncol(MetaGenome)))) #taging bacterial genome and metaGenome
total$name<- as.character(temp[match(total$genomes,temp$old),2]) # number of in all genomes and metagenome
# summary ---------------------------------------------
# extract part final stats in summary file
stats_summary = dplyr::select(finalStats,MetaGenome,Species, Percentage, NovelMatch, AbsoluteMatch, CumNovelMatch)
colnames(stats_summary)=c("MetaGenome","Species","CumuPercentage","NovelMatch","AbsoluteMatch","CumuNovelMatch")
# Minimal consortia final selection ------------------------------------------
Minimal_consortia_size
#calc abund as sum of highest val in cum novel match
df_abund= aggregate(stats_summary$CumuNovelMatch,
list(stats_summary$MetaGenome), FUN=max) # calc pfam per metagenome
colnames(df_abund)=c("MetaGenome","sample_sum_pfam")
df_abund=plyr::join(dplyr::select(stats_summary,MetaGenome,Species,NovelMatch),
df_abund,by="MetaGenome",type="left")
df_abund$rel_score=df_abund$NovelMatch/df_abund$sample_sum_pfam # normalize novel match score by pfam per metagenome
#prev filter
filter_prev = table(df_abund$Species) %>% as.data.frame()
prevThreshold = 0.25* length(unique(df_abund$MetaGenome)) # variable prevalence threshold
filter_prev = filter_prev[,"Var1"][(filter_prev$Freq >= prevThreshold)] %>% as.character()
#leave one out
df_specab = dplyr::select(df_abund,Species,rel_score)
df_specab = dplyr::group_by(df_specab,
Species) %>% dplyr::summarise_all(median) # take sum of duplicate values
df_specab=df_specab[df_specab$Species %in% filter_prev, ] # select only prev filtered
while (length(df_specab$Species) > Minimal_consortia_size) {
lowest_hit = df_specab[which.min(df_specab$rel_score),]
lowest_hit <- as.character(lowest_hit$Species) # name of the selected genome
df_specab=df_specab[!df_specab$Species %in% lowest_hit, ] # exclude selected genome.
}
# heatmap plot -----------------------
#Plot the percentage novel functions per metagenome (x) and genome (y)
phyl_plot = dplyr::select(finalStats,MetaGenome,Species, NovelFunctPerc)
phyl_plot = spread(phyl_plot,MetaGenome,NovelFunctPerc)
phyl_plot = na.zero(phyl_plot) #convert NA to zero
plot_height=round((length(rownames(phyl_plot))/9)+2.5,digits=1)
plot_width=round((length(colnames(phyl_plot))/9)+6,digits=1)
rownames(phyl_plot) = phyl_plot$Species
phyl_plot$Species= NULL
phyl_plot= as.matrix(phyl_plot,rownames.value=rownames(phyl_plot))
#phyl_plot = t(scale(t(phyl_plot))) # Scale by row manually
cols = colorRampPalette(c("white","#F0E442","darkorange","red2","darkred"))
cols = cols(10)
# heatmap without annotation
p2=pheatmap(phyl_plot, labels_row=rownames(phyl_plot), border_color=F, color=cols,
show_rownames=TRUE)
ggsave(paste0(opt$root_dir,"/08_mimic_output/heatmap_species_perc_novel_funct.tiff"),
plot=p2, units="in", width=plot_width, height=plot_height, dpi=300, compression = 'lzw')
# heatmap row annotation
myRows <- df_specab$Species # rows to highlight
row_idx <- which(rownames(phyl_plot) %in% myRows) # font/styling settings
fontsizes <- rep(10, nrow(phyl_plot))
fontsizes[row_idx] <- 10
fontcolors <- rep('black', nrow(phyl_plot))
fontcolors[row_idx] <- 'black'
fontfaces <- rep('plain',nrow(phyl_plot))
fontfaces[row_idx] <- 'bold'
rowAnno <- rowAnnotation(rows=anno_text(rownames(phyl_plot),
gp=gpar(fontsize=fontsizes, fontface = fontfaces,
col = fontcolors))) # Create text annotation
# heatmap boxes around our rows
dend <- reorder(as.dendrogram(hclust(dist(phyl_plot))),
-rowMeans(phyl_plot), agglo.FUN = mean) # create our own row dendrogram
dend_idx <- which(order.dendrogram(dend) %in% which(
rownames(phyl_plot) %in% myRows)) # Find rows in dendrogram
btm <- 1 - (dend_idx / nrow(phyl_plot)) # save position rows
top <- btm + (1/nrow(phyl_plot))
box_col <- 'black'
box_width <- 2
# draw heatmap
tiff(filename = paste0(opt$root_dir,"/08_mimic_output/heatmap_species_perc_novel_funct_rowanno.tiff"),
units="in", width=plot_width, height=plot_height, compression='lzw', res=300)
Heatmap(phyl_plot, name="ht",cluster_rows = dend,
right_annotation = rowAnno,col=cols,
show_row_names = FALSE)
decorate_heatmap_body("ht", { for (i in 1:length(myRows)) {
grid.lines(c(0, 1), c(top[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
grid.lines(c(0, 1), c(btm[i],btm[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
grid.lines(c(0, 0), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
grid.lines(c(1, 1), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
}
})
dev.off()
# Recalculate MM consortia using MiMiC function -------------------------
mimic_recalc = plyr::join(df_specab, species_meta, by="Species", type="left") # subset genome df
mimic_recalc = GenomeVector[,which(colnames(GenomeVector) %in% mimic_recalc$BacterialGenome)]
recalc_PfamTable <- as.data.frame(apply(as.data.frame(MetaGenome),2,
function(x) scorePfam(as.data.frame(mimic_recalc),
x))) #adding score to pfam table
recalc_PfamTable$BacterialGenome <- rownames(recalc_PfamTable)
mimic_recalc = apply(as.data.frame(MetaGenome),
2,function(x) scoreDiff_weighted(as.data.frame(mimic_recalc),
x,Minimal_consortia_size)) # making a list of the mimic calculation
mimic_recalc = ldply(mimic_recalc, data.frame) # converting list into dataframe
names(mimic_recalc) <- c("MetaGenome","BacterialGenome","Percentage","Score","ScorePerIteration",
"pfamInGenomePerItration","NovelMatchPer","exclusiveMismatchPerIteration") #naming the column in the dataframe
mimic_recalc = mimic_recalc %>%
dplyr::group_by(MetaGenome) %>%
dplyr::mutate(Index=1:length(MetaGenome)) # adding the rank of species in metagenome
#To plot metagenome coverage
p1 <- ggplot(mimic_recalc,aes(Index,Percentage,group=MetaGenome,color=MetaGenome))+
geom_point()+ geom_line()+
scale_x_continuous(breaks = seq(0, Minimal_consortia_size, by = 2))+
theme(legend.position = "none")+
ylab("Functions covered (%)") + xlab("Bacterial species index")# plot for pfam coverage of metagenome by minimal consortia
ggsave(paste0(opt$root_dir,"/08_mimic_output/pfam_coverage_plot_mmconsortia.tiff"),
plot=p1, units="in", width=14, height=8, dpi=300, compression = 'lzw')
#continue collecting table
mimic_recalc = dplyr::left_join(mimic_recalc,total[,c("PfamTotal","genomes")], by=c("BacterialGenome" = "genomes")) %>%
dplyr::left_join(.,reshape2::melt(recalc_PfamTable,
id.vars="BacterialGenome",
value.name="AbsoluteMatch",variable.name="MetaGenome"),
by=c("BacterialGenome","MetaGenome")) %>%
dplyr::rename(NovelMatch = Score, BactTotalPfam = PfamTotal) %>%
dplyr::group_by(MetaGenome) %>%
dplyr::mutate(AbsoluteMismatch = BactTotalPfam - AbsoluteMatch,
CumAbsoluteMismatch = cumsum(AbsoluteMismatch),
#AbsoluteRelativeMatch = AbsoluteMatch/AbsoluteMismatch,
CumNovelMatch = cumsum(NovelMatch)
#CumAbsoluteRelativeMatch = cumsum(AbsoluteRelativeMatch),
#CumExclusiveMismatchPerIteration = cumsum(exclusiveMismatchPerIteration),
#CumIterationRelativeMatch = cumsum(RelativeMatchPerIteration),
) %>% ungroup()
# add species names
mimic_recalc$BacterialGenome = sub('^([^.]+.[^.]+).*', '\\1',mimic_recalc$BacterialGenome)
mimic_recalc= plyr::join(mimic_recalc,species_meta, by="BacterialGenome", type="left") # add species names
# heatmap plot minimal consortia -----------------------
#Plot the percentage novel functions.
phyl_plot = dplyr::select(mimic_recalc,MetaGenome,Species, NovelMatchPer)
phyl_plot = spread(phyl_plot,MetaGenome,NovelMatchPer)
phyl_plot = na.zero(phyl_plot) #convert NA to zero
plot_height=round((length(rownames(phyl_plot))/10)+2.5,digits=1)
plot_width=round((length(colnames(phyl_plot))/12)+2,digits=1)
rownames(phyl_plot) = phyl_plot$Species
phyl_plot$Species= NULL
phyl_plot= as.matrix(phyl_plot)
#phyl_plot = t(scale(t(phyl_plot))) # Scale by row manually
cols = colorRampPalette(c("white","#F0E442","darkorange","red2","darkred"))
cols = cols(10)
# heatmap row annotation
myRows <- df_specab$Species # rows to highlight
row_idx <- which(rownames(phyl_plot) %in% myRows) # font/styling settings
fontsizes <- rep(10, nrow(phyl_plot))
fontsizes[row_idx] <- 10
fontcolors <- rep('black', nrow(phyl_plot))
fontcolors[row_idx] <- 'black'
fontfaces <- rep('plain',nrow(phyl_plot))
fontfaces[row_idx] <- 'plain'
rowAnno <- rowAnnotation(rows=anno_text(rownames(phyl_plot),
gp=gpar(fontsize=fontsizes, fontface = fontfaces,
col = fontcolors))) # Create text annotation
# heatmap boxes around our rows
dend <- reorder(as.dendrogram(hclust(dist(phyl_plot))),
-rowMeans(phyl_plot), agglo.FUN = mean) # create our own row dendrogram
dend_idx <- which(order.dendrogram(dend) %in% which(
rownames(phyl_plot) %in% myRows)) # Find rows in dendrogram
btm <- 1 - (dend_idx / nrow(phyl_plot)) # save position rows
top <- btm + (1/nrow(phyl_plot))
box_col <- 'black'
box_width <- 1
# draw heatmap
tiff(filename = paste0(opt$root_dir,"/08_mimic_output/heatmap_mmconsort_perc_novel_funct.tiff"),
units="in", width=plot_width, height=plot_height, compression='lzw', res=300)
Heatmap(phyl_plot, name="ht",cluster_rows = dend,
right_annotation = rowAnno,col=cols,
show_column_names = FALSE, column_title = "Metagenomic samples",
column_title_side = c("bottom"),
show_row_names = FALSE)
decorate_heatmap_body("ht", { for (i in 1:length(myRows)) {
grid.lines(c(0, 1), c(top[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
grid.lines(c(0, 1), c(btm[i],btm[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
grid.lines(c(0, 0), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
grid.lines(c(1, 1), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
}
})
dev.off()
# prepare environment for Kofam part -----------------------
dir.create(paste0(opt$root_dir,"/12_kofam_anno"))
dir.create(paste0(opt$root_dir,"/12_kofam_anno/single"))
dir.create(paste0(opt$root_dir,"/12_kofam_anno/meta"))
kofam = list.files(path=paste0(opt$root_dir,"/08_mimic_output/output_faa_single"),
pattern = ".faa") # list samples
write.table(kofam,file=paste0(opt$root_dir,"/12_kofam_anno/single/all_samples.tab"),
sep = "\t",row.names=FALSE, quote=FALSE, col.names=F) # export file
kofam = list.files(path=paste0(opt$root_dir,"/08_mimic_output/output_faa_meta"),
pattern = ".faa") # list samples
write.table(kofam,file=paste0(opt$root_dir,"/12_kofam_anno/meta/all_samples.tab"),
sep = "\t",row.names=FALSE, quote=FALSE, col.names=F) # export file
# Plot relative abundance species annotated -------------------------------------
#Plot species grouped by feed
phyl_plot = readRDS(paste(opt$root_dir,"06_taxa_output",
"physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/"))
phyl_plot = prune_taxa(taxa_sums(phyl_plot) > 1, phyl_plot) #remove taxa lacking reads
#Merge species and genus name
tax_table(phyl_plot)[,"Species"] = paste(tax_table(phyl_plot)[,"Genus"],
tax_table(phyl_plot)[,"Species"],
sep="") #paste genus name to species one
tax_table(phyl_plot)[,"Species"] = gsub("g_", "",tax_table(phyl_plot)[,"Species"])
tax_table(phyl_plot)[,"Species"] = gsub("s_", " ",tax_table(phyl_plot)[,"Species"])
taxa_names(phyl_plot) = tax_table(phyl_plot)[,"Species"] # rename identifyer by species
#select settings
phyl_plot = func_aggregate_top_taxa2(phyl_plot, "Species", top = 12) #reduce to species
phyl_plot = microbiome::transform(phyl_plot, "compositional") #compositional
mm_taxa = sub("^(\\S*\\s+\\S+).*", "\\1", df_specab$Species) # extract species MM consortia
taxa_names(phyl_plot) = sapply(tax_table(phyl_plot)[,"Species"],
function(name){ifelse(name %in% mm_taxa,
paste0("**",name,"**"),name)},
USE.NAMES = FALSE) # highlight names MM consortia in bold
order = names(sort(taxa_sums(phyl_plot), F)[1:13])
order = order[order != "Other"]
order = c("Other",order)
# plotting
plot_composition(phyl_plot,
sample.sort = NULL,
otu.sort=order, x.label="Sample") +
geom_bar(position="stack", stat="identity", color=NA)+
theme_bw() +
labs(title="Top 12 abundant bacterial species")+
ylab( "Abundance (%)")+ xlab( "Sample ID")+
theme(legend.justification = "top",
legend.text = element_markdown(), # required to highlight MM consortia in bold
legend.title = element_blank(),
legend.key.size = unit(4.5, 'mm'),
plot.title = element_text(size = 12),
axis.title.y = element_text(color="Grey1", size=13, face="bold"),
axis.text.y =element_text(color="Grey1", size=11),
axis.text.x =element_text(color="Grey1", size=9, angle = 90, vjust = 0.5),
axis.title.x = element_text(color="Grey1", size = 13, face="bold"),
panel.grid.major = element_blank(),
axis.ticks = element_blank())
ggsave(paste(opt$root_dir,"06_taxa_output","abund_kraken_top12_spec_annot.tiff",sep="/"),
units="in", width=18, height=8, dpi=300, compression = 'lzw')
cat("--------------- R script completed ---------------")
cat("\n\n")
end_time <- Sys.time()
cat("Duration of :")
end_time - start_time
cat("\n\n")
cat("--------------- Bacteria of interest ---------------")
cat("\n\n")
cat(paste0("aimed for minimal consortia size of: ",Minimal_consortia_size))
cat("\n")
cat(paste0("resulted in consortia size of : ",length(unique(df_specab$Species))))
cat("\n\n")
print(unique(df_specab$Species))
cat("\n\n")
cat("Please open the MiMiC_summary-report to interpret the significance")
cat("\n")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment