Upload New File

324ec9ea · Stege, Paul · bd8d0481 · 324ec9ea
Commit 324ec9ea authored 2 years ago by Stege, Paul
--- a/11_mimic_test_mimic.R
+++ b/11_mimic_test_mimic.R
+#=============================================================================================
+#author			Neeraj Kumar et al., 2021; https://github.com/ClavelLab/MiMiC
+#date			02-2021
+#adjusted by		Paul Stege
+#script			Calculate minimal consorta (MiMiC script)
+#software		R ; R Core Team 2022
+#package		P.J. McMurdie and S. Holmes, 2012 ; phyloseq
+#			L. Lahti and S. Shetty, 2017 ; microbiome
+#			Raivo Kolde, 2019 ; pheatmap
+#=============================================================================================
+# description 
+  #This script is used to calculate the minimal microbial consrotia for a metagenome based on the reference pfam vector of microbial species
+  #minimum number of species covering maximum number of metagenome
+
+# usage
+  # Rscript --vanilla 11_mimic_s4_mimic.R -r root_dir -i iterationNumber -e exclude_species
+   start_time <- Sys.time()
+#=============================================================================================
+
+# install packages --------------------------------------------------------
+  
+  paks <- c("tidyverse","ggsignif","cowplot","RColorBrewer", "ggcorrplot","vegan",
+            "ade4","colorspace","plyr","data.table","ggplot2","inflection","dplyr",
+            "readr","optparse", "gridExtra", "ComplexHeatmap", "RColorBrewer", "pheatmap",
+	    "ggtext", "phyloseq", "microbiome")
+
+  paks<-lapply(paks, function(x) require(x, character.only = TRUE))
+  rm(paks)
+
+  # from Sudarshan A. Shetty, & Leo Lahti. (2020, October). microbiomeutilities: Utilities for Microbiome Analytics
+   func_aggregate_top_taxa2 <- function(x, top, level) {
+   x <- aggregate_taxa(x, level)
+
+   tops <- top_taxa(x, top)
+   tax <- tax_table(x)
+
+   inds <- which(!rownames(tax) %in% tops)
+
+   tax[inds, level] <- "Other"
+
+   tax_table(x) <- tax
+
+   tt <- tax_table(x)[, level]
+   tax_table(x) <- tax_table(tt)
+
+   aggregate_taxa(x, level)
+   }
+
+  
+# set variables ---------------------------------------------------------
+  #library("optparse")
+  # maiking option using package "optparse"
+  option_list = list(
+    make_option(c("-r", "--root_dir"), type="character", default="/home/", 
+       	          	help="directory for this project, containing all folders for reading and exporting data [default= %default]", metavar="character"),
+    make_option(c("-i", "--itNumber"), type="integer", default=10, 
+                 	help="iteratioin number [default= %default]", metavar="number"),
+    make_option(c("-e", "--ex_spec"), type="character", default="NA", 
+       	          	help="a .txt-file containing single column with species to exclude from the minimal consortia. e.g. /spec_exclude.txt [default= %default]", metavar="character"),
+    make_option(c("-p", "--ex_pfam"), type="character", default="NA", 
+       	          	help="a .txt-file containing single column with pfam. Used in the mismatch penalty of mimic. e.g. /pfam_exclude.txt [default= %default]", metavar="character"))
+    
+  opt_parser = OptionParser(option_list=option_list)
+  opt = parse_args(opt_parser)
+
+  # if (is.null(opt$file)){
+  #   print_help(opt_parser)
+  #   stop("At least one argument must be supplied (input file).n", call.=FALSE)
+  # }
+
+
+# input  tables------------------------------------------------------------------
+  #create DB to convert names to species
+  na.zero = function (x) {x[is.na(x)] = 0 ;return(x)  }
+  Minimal_consortia_size=opt$itNumber
+ 
+  paste0(opt$root_dir,"/08_mimic_output/MiMiC_final_output_stats_it",opt$itNumber,".tab")
+  finalStats = data.table::fread(paste0(opt$root_dir,"/08_mimic_output/MiMiC_final_output_stats_it",opt$itNumber,".tab"),
+				   header = TRUE ,stringsAsFactors = F,check.names = F, fill=TRUE, sep = "\t")
+  colnames(finalStats)
+
+  #create DB to convert names to species
+  species_meta = data.table::fread(paste0(opt$root_dir,"/07_spec_genomes/species_meta.txt"),
+				   header = F ,stringsAsFactors = F,check.names = F, fill=TRUE,
+				   sep = "\t")
+  
+  species_meta = str_split_fixed(species_meta$V1, " ", 2)
+  species_meta=species_meta[!apply(species_meta == "", 1, all),]                  # remove empty rows
+  colnames(species_meta)=c("BacterialGenome","Species")
+  species_meta = as.data.frame(species_meta)
+  species_meta$BacterialGenome=trimws(species_meta$BacterialGenome, which="both") #remove leading/trailing spaces
+  species_meta=species_meta[which(duplicated(species_meta$BacterialGenome)==FALSE),]  # remove duplicates
+  species_meta=species_meta[which(duplicated(species_meta$Species)==FALSE),]  # remove duplicates
+
+  species_meta=dplyr::select(species_meta,BacterialGenome, Species)
+  species_meta[which(duplicated(species_meta$Species) == T), ]
+
+  MetaGenome <- data.table::fread(paste0(opt$root_dir,"/08_mimic_output/pfam_parsed_table_meta"),
+				  sep = "\t",
+                                  header = TRUE ,
+                                  stringsAsFactors = FALSE,
+                                  check.names = FALSE)
+
+  MetaGenome <- as.data.frame(MetaGenome) 
+	# option to exclude pfam   # does not mean species are excluded if they contain these genes
+  	# instead, these pfams are excluded from those that needed to be present, therefore adding to the mismatch score
+  	  if (opt$ex_pfam == "NA") {
+	  # if no input has been given for pfam exclusion
+	  print("no pfam list detected") 
+	   } else {
+
+            # in case a list is supplied with pfams to exclude
+              pfam_exclude = data.table::fread(paste0(opt$root_dir,opt$ex_pfam),
+			sep = NULL ,header=F,stringsAsFactors=F) %>% as.data.frame()
+
+            MetaGenome=MetaGenome[!MetaGenome$PfamID %in% pfam_exclusion[,1], ]
+           } 
+
+  # continue formatting metagenome vector file
+  colnames(MetaGenome)=gsub(".faa.*","",colnames(MetaGenome))
+  row.names(MetaGenome) <- MetaGenome$PfamID # to make sure if rows name remain same 
+  MetaGenome$PfamID <- NULL
+  MetaGenome=MetaGenome[,which(colSums(MetaGenome)>1000)]         # remove low variance samples
+
+  # reading genome vector file
+  GenomeVector <- data.table::fread(paste0(opt$root_dir,"/08_mimic_output/pfam_parsed_table_single"),
+				 sep = "\t",
+	                         header = T,
+	                         stringsAsFactors = FALSE)
+
+  GenomeVector <- as.data.frame(GenomeVector)
+  colnames(GenomeVector)=gsub(".faa.*","",colnames(GenomeVector))
+
+  # option to exclude species
+  if (opt$ex_spec == "NA") {
+	# if no input has been given for species exclusion
+	  print("no species excluded from minimal consortia") 
+	} else {
+
+        # in case a list is supplied with species to exclude
+          spec_exclude = data.table::fread(paste0(opt$root_dir,opt$ex_spec),
+			sep = NULL ,header=F,stringsAsFactors=F) %>% as.data.frame()
+
+          spec_exclude = dplyr::filter(species_meta,grepl(paste(spec_exclude[,1],collapse="|"),
+							  Species, ignore.case=TRUE)) # covert species ID
+           GenomeVector=GenomeVector[,
+                                      -grep(paste(spec_exclude$BacterialGenome,collapse="|"), 
+                                      colnames(GenomeVector))]
+       } 
+
+  row.names(GenomeVector) <- GenomeVector$PfamID # skip if row names alreay assigned and PfamID is not available
+  GenomeVector$PfamID <- NULL
+	
+# Functions ---------------------------------------------------------------
+  na.zero = function (x) {x[is.na(x)] = 0 ;return(x)  }
+  
+  # MiMiC function to calculate minimal microbial consortia
+	#this function compare the genome vector profile to the metagenome vector profile 
+		scoreDiff_weighted <- function(genomes, metagenome,Minimal_consortia_size){
+	  g <- as.matrix(genomes)
+	  y <- as.matrix(metagenome)
+	  mscore <- sum(y)
+	  mscore1 <- sum(y)
+	  res <- NULL
+	 
+	  for(i in 1:Minimal_consortia_size){
+	    matchCount <- apply(g, 2, function(x) sum(y[x >= 1.0])) # ---> only count of Pfam match which present in genome and not in metagenome 
+	    totalpfamG <- apply(g,2,function(x) sum(x>=1)) # total number of Pfam present in microbial genome 
+	    misMatchCount <- totalpfamG - matchCount # number Of Pfams which are present in genome but not in Metagenome.
+	    matchratio <- ((matchCount)/(matchCount+misMatchCount)) # (x)/(x+y) # ---> mimic_B             
+	    matchratio[matchratio=="NaN"]<- 0 # because of some values become 0, when 0/0 it gives NaN
+	    
+	    bestHit <- matchCount[names(matchratio[matchratio==max(matchratio)])] #---> genome with maximum match count and maximum match mismatch ratio
+	    SelectedBestHit <- names(bestHit[bestHit==max(bestHit)])[1] #---> names of the genome to be selected
+	    
+	    print(paste ("number of itreation" ,i,sep="=")) #---> printing number of iteration
+	   
+	    a<-g[,SelectedBestHit] #--> selected genome vector
+	   
+	    ascore <- bestHit[bestHit==max(bestHit)][1] # score
+	   
+	    pfamInGenomePerItration = unname(totalpfamG[SelectedBestHit]) # Total number of pfam used in the comparison 
+	    exclusiveMatch <- unname(matchCount[SelectedBestHit]) # number of Pfam match to metagenome
+	    exclusiveMismatchPerIteration <- unname(misMatchCount[SelectedBestHit]) # number of Pfam that did not match to the metagenome 
+	    
+	    print(paste("pfam",pfamInGenomePerItration,"exclusiveMatch",exclusiveMatch,"exclusiveMismatchPerIteration",
+	                exclusiveMismatchPerIteration,sep = "=")) # Printing the summary on screen at each iteration
+	    #print the genome with the name
+	    #print (ascore)
+	    #unname(ascore)
+	    b <- as.character(SelectedBestHit) # name of the selected genome
+	    print(b)
+	    g <-g[,-which(colnames(g)==b),drop=F] # exclude selected genome.
+	    if(i==1){
+	      covered <- sum(y[a>=1])
+	      rscore <- sum(y[a>=1])
+	      percent <- (covered/mscore)*100
+	      percent_nw = (rscore/mscore)*100  # pstege; report novel functions in percentage
+	      #uncovered<-(mscore-covered)
+	      #uncovered_Percentage <-(uncovered/mscore)
+	    } else {
+	      rscore <- sum(y[a>=1])
+	      covered <- covered+rscore # cumulative Pfam number covered in metagenome
+	      percent <- (covered/mscore)*100 # cumulative percentage of Pfam covered in metagenome 
+	      percent_nw = (rscore/mscore)*100  # pstege; report novel functions in percentage
+	      #uncovered<-(mscore-covered)
+	      #uncovered_Percentage <-(uncovered/mscore)
+	    }
+	    
+	    test <- names(y[a>=1,]) #  Pfams IDs common in genome and metagenome  
+	    #print(g)
+	    g[test,] <- 0 # making all Pfam as "zero" in reference genome set ( those function were already taken ) 
+	    y[a>=1]<-0 # makes all metagenome matching with highest scoring genome as '0'
+	   	mscore1 <- sum(y) #number of pfam in metagenome
+	   
+	    if(rscore==0){break} # if no pfam match found further, then stop the script
+	    #RelativeMatchPerIteration=(rscore / exclusiveMismatchPerIteration)
+	    
+	    res <- rbind(res,cbind.data.frame(bname=SelectedBestHit,
+	                                      percentage=percent,score=rscore,ScorePerIteration=ascore,
+	                                      pfamInGenomePerItration=pfamInGenomePerItration,
+	                                      NovelFunctPerc=percent_nw,
+	                                      #RelativeMatchPerIteration=RelativeMatchPerIteration,
+	                                      exclusiveMismatchPerIteration=exclusiveMismatchPerIteration)) # making output dataframe
+	    
+	  }
+	  return(res)
+	}
+
+ #compare reference genome to metaegenome
+  scorePfam <- function(genomes, metagenome){
+    	g <- as.matrix(genomes)
+    	y <- metagenome	} 
+
+
+  # from Sudarshan A. Shetty, & Leo Lahti. (2020, October). microbiomeutilities: Utilities for Microbiome Analytics
+   func_aggregate_top_taxa2 <- function(x, top, level) {
+   x <- aggregate_taxa(x, level)
+
+   tops <- top_taxa(x, top)
+   tax <- tax_table(x)
+
+   inds <- which(!rownames(tax) %in% tops)
+
+   tax[inds, level] <- "Other"
+
+   tax_table(x) <- tax
+
+   tt <- tax_table(x)[, level]
+   tax_table(x) <- tax_table(tt)
+
+   aggregate_taxa(x, level)
+   }
+
+# Total number of pfam and match to metagenome ----------------------------
+
+  #total number of pfam in genome and metagenome 
+  #	GenomeBinary <- GenomeVector
+  #	row.names(GenomeBinary) <- GenomeBinary$PfamID # if coulmn name has the pfam ids and rownames does not exists.
+  #	GenomeBinary$PfamID <- NULL
+
+  colnames(GenomeVector)
+  colnames(MetaGenome)
+  total <- cbind(GenomeVector,MetaGenome)
+  total <- as.data.frame(colSums(total))  #skip description folder.and sums up all pfam for each genome.
+
+  colnames(total) <- "PfamTotal" #change name of columns # total number of pfams in each genome and metagenome.
+  total$genomes <- rownames(total) #adding column name to total score file.
+  rownames(total) <- NULL
+
+  temp <- cbind.data.frame(old=c(total$genomes),neu=
+                             c(rep("BacterialGenome",ncol(GenomeVector)),
+                               rep("MetaGenome",ncol(MetaGenome)))) #taging bacterial genome and metaGenome
+  total$name<- as.character(temp[match(total$genomes,temp$old),2]) # number of in all genomes and metagenome
+
+
+# summary ---------------------------------------------
+
+  # extract part final stats in summary file
+  stats_summary = dplyr::select(finalStats,MetaGenome,Species, Percentage, NovelMatch, AbsoluteMatch, CumNovelMatch)
+  colnames(stats_summary)=c("MetaGenome","Species","CumuPercentage","NovelMatch","AbsoluteMatch","CumuNovelMatch")
+
+
+# Minimal consortia final selection ------------------------------------------
+  Minimal_consortia_size
+
+  #calc abund as sum of highest val in cum novel match
+  df_abund= aggregate(stats_summary$CumuNovelMatch, 
+			list(stats_summary$MetaGenome), FUN=max)  # calc pfam per metagenome
+
+  colnames(df_abund)=c("MetaGenome","sample_sum_pfam")
+
+  df_abund=plyr::join(dplyr::select(stats_summary,MetaGenome,Species,NovelMatch),
+                      	df_abund,by="MetaGenome",type="left")
+
+  df_abund$rel_score=df_abund$NovelMatch/df_abund$sample_sum_pfam # normalize novel match score by pfam per metagenome
+  
+  #prev filter
+  filter_prev =  table(df_abund$Species) %>% as.data.frame()
+  prevThreshold = 0.25* length(unique(df_abund$MetaGenome))          # variable prevalence threshold
+  filter_prev = filter_prev[,"Var1"][(filter_prev$Freq >= prevThreshold)] %>% as.character()
+
+  #leave one out
+  df_specab = dplyr::select(df_abund,Species,rel_score)   
+  df_specab = dplyr::group_by(df_specab,
+				  Species) %>% dplyr::summarise_all(median)   # take sum of duplicate values
+  df_specab=df_specab[df_specab$Species %in% filter_prev, ]                      # select only prev filtered
+
+  while (length(df_specab$Species) > Minimal_consortia_size) {
+    
+    lowest_hit = df_specab[which.min(df_specab$rel_score),]
+    lowest_hit <- as.character(lowest_hit$Species)                              # name of the selected genome
+
+  
+    df_specab=df_specab[!df_specab$Species %in% lowest_hit, ]                  # exclude selected genome.
+  }
+
+
+# heatmap plot -----------------------
+#Plot the percentage novel functions per metagenome (x) and genome (y)
+
+  phyl_plot = dplyr::select(finalStats,MetaGenome,Species, NovelFunctPerc)
+  phyl_plot = spread(phyl_plot,MetaGenome,NovelFunctPerc)
+  phyl_plot = na.zero(phyl_plot) #convert NA to zero
+  plot_height=round((length(rownames(phyl_plot))/9)+2.5,digits=1)
+  plot_width=round((length(colnames(phyl_plot))/9)+6,digits=1)
+  rownames(phyl_plot) = phyl_plot$Species
+  phyl_plot$Species= NULL
+  phyl_plot= as.matrix(phyl_plot,rownames.value=rownames(phyl_plot))
+  #phyl_plot = t(scale(t(phyl_plot)))   # Scale by row manually
+  cols = colorRampPalette(c("white","#F0E442","darkorange","red2","darkred"))
+  cols = cols(10)
+  
+  # heatmap without annotation
+  p2=pheatmap(phyl_plot, labels_row=rownames(phyl_plot), border_color=F, color=cols,
+		show_rownames=TRUE)
+  
+  ggsave(paste0(opt$root_dir,"/08_mimic_output/heatmap_species_perc_novel_funct.tiff"),
+	plot=p2, units="in", width=plot_width, height=plot_height, dpi=300, compression = 'lzw')
+    
+  # heatmap row annotation
+  myRows <- df_specab$Species                      	# rows to highlight
+  row_idx <- which(rownames(phyl_plot) %in% myRows)      # font/styling settings
+  fontsizes <- rep(10, nrow(phyl_plot))
+  fontsizes[row_idx] <- 10
+  fontcolors <- rep('black', nrow(phyl_plot))
+  fontcolors[row_idx] <- 'black'
+  fontfaces <- rep('plain',nrow(phyl_plot))
+  fontfaces[row_idx] <- 'bold'
+
+   rowAnno <- rowAnnotation(rows=anno_text(rownames(phyl_plot),
+                          gp=gpar(fontsize=fontsizes, fontface = fontfaces,
+                                  col = fontcolors)))  # Create text annotation  
+  # heatmap boxes around our rows
+  dend <- reorder(as.dendrogram(hclust(dist(phyl_plot))),
+                  -rowMeans(phyl_plot), agglo.FUN = mean) # create our own row dendrogram
+  
+  dend_idx <- which(order.dendrogram(dend) %in% which(
+    rownames(phyl_plot) %in% myRows))                     # Find rows in dendrogram
+  
+  btm <- 1 - (dend_idx / nrow(phyl_plot))                 # save position rows 
+  top <- btm + (1/nrow(phyl_plot))
+  box_col <- 'black'
+  box_width <- 2
+  
+  # draw heatmap
+  tiff(filename = paste0(opt$root_dir,"/08_mimic_output/heatmap_species_perc_novel_funct_rowanno.tiff"),
+	units="in", width=plot_width, height=plot_height, compression='lzw', res=300)
+
+  Heatmap(phyl_plot, name="ht",cluster_rows = dend, 
+          right_annotation = rowAnno,col=cols,
+          show_row_names = FALSE)              
+  
+  decorate_heatmap_body("ht", { for (i in 1:length(myRows)) {
+    grid.lines(c(0, 1), c(top[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
+    grid.lines(c(0, 1), c(btm[i],btm[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
+    grid.lines(c(0, 0), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
+    grid.lines(c(1, 1), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
+  }
+  })
+  dev.off()
+
+# Recalculate MM consortia using MiMiC function -------------------------
+
+  mimic_recalc = plyr::join(df_specab, species_meta, by="Species", type="left")        # subset genome df
+  mimic_recalc = GenomeVector[,which(colnames(GenomeVector) %in% mimic_recalc$BacterialGenome)]
+
+  recalc_PfamTable <- as.data.frame(apply(as.data.frame(MetaGenome),2,
+                                   function(x) scorePfam(as.data.frame(mimic_recalc),
+				    x))) #adding score to pfam table
+  
+  recalc_PfamTable$BacterialGenome <- rownames(recalc_PfamTable)
+
+  mimic_recalc = apply(as.data.frame(MetaGenome),
+                        2,function(x) scoreDiff_weighted(as.data.frame(mimic_recalc),
+                                                         x,Minimal_consortia_size)) # making a list of the mimic calculation
+  
+  mimic_recalc = ldply(mimic_recalc, data.frame) # converting list into dataframe
+  names(mimic_recalc) <- c("MetaGenome","BacterialGenome","Percentage","Score","ScorePerIteration",
+                   "pfamInGenomePerItration","NovelMatchPer","exclusiveMismatchPerIteration") #naming the column in the dataframe
+  
+  mimic_recalc = mimic_recalc %>% 
+    dplyr::group_by(MetaGenome) %>% 
+    dplyr::mutate(Index=1:length(MetaGenome)) # adding the rank of species in metagenome
+  
+  #To plot metagenome coverage  
+  p1 <- ggplot(mimic_recalc,aes(Index,Percentage,group=MetaGenome,color=MetaGenome))+ 
+    geom_point()+ geom_line()+
+    scale_x_continuous(breaks = seq(0, Minimal_consortia_size, by = 2))+
+    theme(legend.position = "none")+
+    ylab("Functions covered (%)") + xlab("Bacterial species index")# plot for pfam coverage of metagenome by minimal consortia
+  
+     ggsave(paste0(opt$root_dir,"/08_mimic_output/pfam_coverage_plot_mmconsortia.tiff"),
+	plot=p1, units="in", width=14, height=8, dpi=300, compression = 'lzw')
+
+  #continue collecting table
+  mimic_recalc = dplyr::left_join(mimic_recalc,total[,c("PfamTotal","genomes")], by=c("BacterialGenome" = "genomes")) %>%
+              dplyr::left_join(.,reshape2::melt(recalc_PfamTable,
+                                      id.vars="BacterialGenome",
+                                      value.name="AbsoluteMatch",variable.name="MetaGenome"),
+				      by=c("BacterialGenome","MetaGenome")) %>%
+               dplyr::rename(NovelMatch = Score, BactTotalPfam = PfamTotal) %>%
+              dplyr::group_by(MetaGenome) %>% 
+            dplyr::mutate(AbsoluteMismatch = BactTotalPfam - AbsoluteMatch,
+                  CumAbsoluteMismatch = cumsum(AbsoluteMismatch),
+                  #AbsoluteRelativeMatch = AbsoluteMatch/AbsoluteMismatch,
+                  CumNovelMatch = cumsum(NovelMatch)
+                  #CumAbsoluteRelativeMatch = cumsum(AbsoluteRelativeMatch),
+                  #CumExclusiveMismatchPerIteration = cumsum(exclusiveMismatchPerIteration),
+                  #CumIterationRelativeMatch = cumsum(RelativeMatchPerIteration),
+    ) %>% ungroup()
+  
+  
+  # add species names
+  mimic_recalc$BacterialGenome = sub('^([^.]+.[^.]+).*', '\\1',mimic_recalc$BacterialGenome)		
+  mimic_recalc= plyr::join(mimic_recalc,species_meta, by="BacterialGenome", type="left") # add species names
+
+  # heatmap plot minimal consortia  -----------------------
+  #Plot the percentage novel functions. 
+  phyl_plot = dplyr::select(mimic_recalc,MetaGenome,Species, NovelMatchPer)
+  phyl_plot = spread(phyl_plot,MetaGenome,NovelMatchPer)
+  phyl_plot = na.zero(phyl_plot) #convert NA to zero
+  plot_height=round((length(rownames(phyl_plot))/10)+2.5,digits=1)
+  plot_width=round((length(colnames(phyl_plot))/12)+2,digits=1)
+    rownames(phyl_plot) = phyl_plot$Species
+  phyl_plot$Species= NULL
+  phyl_plot= as.matrix(phyl_plot)
+  #phyl_plot = t(scale(t(phyl_plot)))   # Scale by row manually
+  cols = colorRampPalette(c("white","#F0E442","darkorange","red2","darkred"))
+  cols = cols(10)
+  
+  # heatmap row annotation
+  myRows <- df_specab$Species                      # rows to highlight
+  row_idx <- which(rownames(phyl_plot) %in% myRows)      # font/styling settings
+  fontsizes <- rep(10, nrow(phyl_plot))
+  fontsizes[row_idx] <- 10
+  fontcolors <- rep('black', nrow(phyl_plot))
+  fontcolors[row_idx] <- 'black'
+  fontfaces <- rep('plain',nrow(phyl_plot))
+  fontfaces[row_idx] <- 'plain'
+  
+  rowAnno <- rowAnnotation(rows=anno_text(rownames(phyl_plot),
+                                          gp=gpar(fontsize=fontsizes, fontface = fontfaces,
+                                                  col = fontcolors)))  # Create text annotation
+  
+  # heatmap boxes around our rows
+  dend <- reorder(as.dendrogram(hclust(dist(phyl_plot))),
+                  -rowMeans(phyl_plot), agglo.FUN = mean) # create our own row dendrogram
+  
+  dend_idx <- which(order.dendrogram(dend) %in% which(
+    rownames(phyl_plot) %in% myRows))                     # Find rows in dendrogram
+  
+  btm <- 1 - (dend_idx / nrow(phyl_plot))                 # save position rows 
+  top <- btm + (1/nrow(phyl_plot))
+  box_col <- 'black'
+  box_width <- 1
+  
+  # draw heatmap
+  tiff(filename = paste0(opt$root_dir,"/08_mimic_output/heatmap_mmconsort_perc_novel_funct.tiff"),
+	units="in", width=plot_width, height=plot_height, compression='lzw', res=300)
+  
+  Heatmap(phyl_plot, name="ht",cluster_rows = dend, 
+          right_annotation = rowAnno,col=cols,
+          show_column_names = FALSE, column_title = "Metagenomic samples",
+          column_title_side = c("bottom"),
+          show_row_names = FALSE)              
+  
+  decorate_heatmap_body("ht", { for (i in 1:length(myRows)) {
+    grid.lines(c(0, 1), c(top[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
+    grid.lines(c(0, 1), c(btm[i],btm[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
+    grid.lines(c(0, 0), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
+    grid.lines(c(1, 1), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
+  }
+  })
+    dev.off()
+
+# prepare environment for Kofam part -----------------------
+  dir.create(paste0(opt$root_dir,"/12_kofam_anno"))
+  dir.create(paste0(opt$root_dir,"/12_kofam_anno/single"))
+  dir.create(paste0(opt$root_dir,"/12_kofam_anno/meta"))
+
+  kofam = list.files(path=paste0(opt$root_dir,"/08_mimic_output/output_faa_single"), 
+                      pattern = ".faa")    			    	# list samples
+
+  write.table(kofam,file=paste0(opt$root_dir,"/12_kofam_anno/single/all_samples.tab"),
+		sep = "\t",row.names=FALSE, quote=FALSE, col.names=F)	# export file
+
+  kofam = list.files(path=paste0(opt$root_dir,"/08_mimic_output/output_faa_meta"), 
+                      pattern = ".faa")    			    	# list samples
+
+  write.table(kofam,file=paste0(opt$root_dir,"/12_kofam_anno/meta/all_samples.tab"),
+		sep = "\t",row.names=FALSE, quote=FALSE, col.names=F)	# export file
+
+
+# Plot relative abundance species annotated -------------------------------------
+ 
+  #Plot species grouped by feed
+  phyl_plot = readRDS(paste(opt$root_dir,"06_taxa_output",
+ 	  "physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/"))
+
+  phyl_plot = prune_taxa(taxa_sums(phyl_plot) > 1, phyl_plot) #remove taxa lacking reads
+
+  #Merge species and genus name
+  tax_table(phyl_plot)[,"Species"] = paste(tax_table(phyl_plot)[,"Genus"],
+                                         tax_table(phyl_plot)[,"Species"],
+                                         sep="") #paste genus name to species one
+
+  tax_table(phyl_plot)[,"Species"] = gsub("g_", "",tax_table(phyl_plot)[,"Species"])
+  tax_table(phyl_plot)[,"Species"] = gsub("s_", " ",tax_table(phyl_plot)[,"Species"]) 
+  taxa_names(phyl_plot) = tax_table(phyl_plot)[,"Species"] # rename identifyer by species
+  
+  #select settings
+  phyl_plot = func_aggregate_top_taxa2(phyl_plot, "Species", top = 12) #reduce to species
+  phyl_plot = microbiome::transform(phyl_plot, "compositional") #compositional
+  mm_taxa = sub("^(\\S*\\s+\\S+).*", "\\1", df_specab$Species)		# extract species MM consortia
+  taxa_names(phyl_plot) = sapply(tax_table(phyl_plot)[,"Species"],
+                                 function(name){ifelse(name %in% mm_taxa,
+								paste0("**",name,"**"),name)},
+                                 USE.NAMES = FALSE) # highlight names MM consortia in bold                  
+
+  order = names(sort(taxa_sums(phyl_plot), F)[1:13]) 
+  order = order[order != "Other"]
+  order = c("Other",order)
+
+  # plotting
+  plot_composition(phyl_plot,
+                sample.sort = NULL,
+                otu.sort=order, x.label="Sample") +
+		   geom_bar(position="stack", stat="identity", color=NA)+
+                theme_bw() + 
+                labs(title="Top 12 abundant bacterial species")+
+                ylab( "Abundance (%)")+ xlab( "Sample ID")+
+                theme(legend.justification  = "top",
+                legend.text = element_markdown(),				# required to highlight MM consortia in bold
+                legend.title = element_blank(),
+                legend.key.size = unit(4.5, 'mm'),
+                plot.title = element_text(size = 12),
+                axis.title.y = element_text(color="Grey1", size=13, face="bold"),
+                axis.text.y =element_text(color="Grey1", size=11),
+                axis.text.x =element_text(color="Grey1", size=9, angle = 90, vjust = 0.5),
+                axis.title.x = element_text(color="Grey1", size = 13, face="bold"),
+                panel.grid.major = element_blank(),
+                axis.ticks = element_blank())
+
+  ggsave(paste(opt$root_dir,"06_taxa_output","abund_kraken_top12_spec_annot.tiff",sep="/"),
+          units="in", width=18, height=8, dpi=300, compression = 'lzw')
+        
+cat("--------------- R script completed ---------------")
+cat("\n\n")
+
+	end_time <- Sys.time()
+  cat("Duration of :")
+	end_time - start_time
+
+cat("\n\n")
+
+cat("--------------- Bacteria of interest ---------------")
+cat("\n\n")
+
+cat(paste0("aimed for minimal consortia size of: ",Minimal_consortia_size))
+cat("\n")
+
+cat(paste0("resulted in consortia size of : ",length(unique(df_specab$Species))))
+cat("\n\n")
+
+print(unique(df_specab$Species))
+cat("\n\n")
+
+cat("Please open the MiMiC_summary-report to interpret the significance") 
+cat("\n")
+