11_mimic_test_mimic.R

#=============================================================================================
#author			Neeraj Kumar et al., 2021; https://github.com/ClavelLab/MiMiC
#date			02-2021
#adjusted by		Paul Stege
#script			Calculate minimal consorta (MiMiC script)
#software		R ; R Core Team 2022
#package		P.J. McMurdie and S. Holmes, 2012 ; phyloseq
#			L. Lahti and S. Shetty, 2017 ; microbiome
#			Raivo Kolde, 2019 ; pheatmap
#=============================================================================================
# description 
  #This script is used to calculate the minimal microbial consrotia for a metagenome based on the reference pfam vector of microbial species
  #minimum number of species covering maximum number of metagenome

# usage
  # Rscript --vanilla 11_mimic_s4_mimic.R -r root_dir -i iterationNumber -e exclude_species
   start_time <- Sys.time()
#=============================================================================================

# install packages --------------------------------------------------------
  
  paks <- c("tidyverse","ggsignif","cowplot","RColorBrewer", "ggcorrplot","vegan",
            "ade4","colorspace","plyr","data.table","ggplot2","inflection","dplyr",
            "readr","optparse", "gridExtra", "ComplexHeatmap", "RColorBrewer", "pheatmap",
	    "ggtext", "phyloseq", "microbiome")

  paks<-lapply(paks, function(x) require(x, character.only = TRUE))
  rm(paks)

  # from Sudarshan A. Shetty, & Leo Lahti. (2020, October). microbiomeutilities: Utilities for Microbiome Analytics
   func_aggregate_top_taxa2 <- function(x, top, level) {
   x <- aggregate_taxa(x, level)

   tops <- top_taxa(x, top)
   tax <- tax_table(x)

   inds <- which(!rownames(tax) %in% tops)

   tax[inds, level] <- "Other"

   tax_table(x) <- tax

   tt <- tax_table(x)[, level]
   tax_table(x) <- tax_table(tt)

   aggregate_taxa(x, level)
   }

  
# set variables ---------------------------------------------------------
  #library("optparse")
  # maiking option using package "optparse"
  option_list = list(
    make_option(c("-r", "--root_dir"), type="character", default="/home/", 
       	          	help="directory for this project, containing all folders for reading and exporting data [default= %default]", metavar="character"),
    make_option(c("-i", "--itNumber"), type="integer", default=10, 
                 	help="iteratioin number [default= %default]", metavar="number"),
    make_option(c("-e", "--ex_spec"), type="character", default="NA", 
       	          	help="a .txt-file containing single column with species to exclude from the minimal consortia. e.g. /spec_exclude.txt [default= %default]", metavar="character"),
    make_option(c("-p", "--ex_pfam"), type="character", default="NA", 
       	          	help="a .txt-file containing single column with pfam. Used in the mismatch penalty of mimic. e.g. /pfam_exclude.txt [default= %default]", metavar="character"))
    
  opt_parser = OptionParser(option_list=option_list)
  opt = parse_args(opt_parser)

  # if (is.null(opt$file)){
  #   print_help(opt_parser)
  #   stop("At least one argument must be supplied (input file).n", call.=FALSE)
  # }

# input  tables------------------------------------------------------------------
  #create DB to convert names to species
  na.zero = function (x) {x[is.na(x)] = 0 ;return(x)  }
  Minimal_consortia_size=opt$itNumber
 
  paste0(opt$root_dir,"/08_mimic_output/MiMiC_final_output_stats_it",opt$itNumber,".tab")
  finalStats = data.table::fread(paste0(opt$root_dir,"/08_mimic_output/MiMiC_final_output_stats_it",opt$itNumber,".tab"),
				   header = TRUE ,stringsAsFactors = F,check.names = F, fill=TRUE, sep = "\t")
  colnames(finalStats)

  #create DB to convert names to species
  species_meta = data.table::fread(paste0(opt$root_dir,"/07_spec_genomes/species_meta.txt"),
				   header = F ,stringsAsFactors = F,check.names = F, fill=TRUE,
				   sep = "\t")
  
  species_meta = str_split_fixed(species_meta$V1, " ", 2)
  species_meta=species_meta[!apply(species_meta == "", 1, all),]                  # remove empty rows
  colnames(species_meta)=c("BacterialGenome","Species")
  species_meta = as.data.frame(species_meta)
  species_meta$BacterialGenome=trimws(species_meta$BacterialGenome, which="both") #remove leading/trailing spaces
  species_meta=species_meta[which(duplicated(species_meta$BacterialGenome)==FALSE),]  # remove duplicates
  species_meta=species_meta[which(duplicated(species_meta$Species)==FALSE),]  # remove duplicates

  species_meta=dplyr::select(species_meta,BacterialGenome, Species)
  species_meta[which(duplicated(species_meta$Species) == T), ]

  MetaGenome <- data.table::fread(paste0(opt$root_dir,"/08_mimic_output/pfam_parsed_table_meta"),
				  sep = "\t",
                                  header = TRUE ,
                                  stringsAsFactors = FALSE,
                                  check.names = FALSE)

  MetaGenome <- as.data.frame(MetaGenome) 
	# option to exclude pfam   # does not mean species are excluded if they contain these genes
  	# instead, these pfams are excluded from those that needed to be present, therefore adding to the mismatch score
  	  if (opt$ex_pfam == "NA") {
	  # if no input has been given for pfam exclusion
	  print("no pfam list detected") 
	   } else {

            # in case a list is supplied with pfams to exclude
              pfam_exclude = data.table::fread(paste0(opt$root_dir,opt$ex_pfam),
			sep = NULL ,header=F,stringsAsFactors=F) %>% as.data.frame()

            MetaGenome=MetaGenome[!MetaGenome$PfamID %in% pfam_exclusion[,1], ]
           } 

  # continue formatting metagenome vector file
  colnames(MetaGenome)=gsub(".faa.*","",colnames(MetaGenome))
  row.names(MetaGenome) <- MetaGenome$PfamID # to make sure if rows name remain same 
  MetaGenome$PfamID <- NULL
  MetaGenome=MetaGenome[,which(colSums(MetaGenome)>1000)]         # remove low variance samples

  # reading genome vector file
  GenomeVector <- data.table::fread(paste0(opt$root_dir,"/08_mimic_output/pfam_parsed_table_single"),
				 sep = "\t",
	                         header = T,
	                         stringsAsFactors = FALSE)

  GenomeVector <- as.data.frame(GenomeVector)
  colnames(GenomeVector)=gsub(".faa.*","",colnames(GenomeVector))

  # option to exclude species
  if (opt$ex_spec == "NA") {
	# if no input has been given for species exclusion
	  print("no species excluded from minimal consortia") 
	} else {

        # in case a list is supplied with species to exclude
          spec_exclude = data.table::fread(paste0(opt$root_dir,opt$ex_spec),
			sep = NULL ,header=F,stringsAsFactors=F) %>% as.data.frame()

          spec_exclude = dplyr::filter(species_meta,grepl(paste(spec_exclude[,1],collapse="|"),
							  Species, ignore.case=TRUE)) # covert species ID
           GenomeVector=GenomeVector[,
                                      -grep(paste(spec_exclude$BacterialGenome,collapse="|"), 
                                      colnames(GenomeVector))]
       } 

  row.names(GenomeVector) <- GenomeVector$PfamID # skip if row names alreay assigned and PfamID is not available
  GenomeVector$PfamID <- NULL
	
# Functions ---------------------------------------------------------------
  na.zero = function (x) {x[is.na(x)] = 0 ;return(x)  }
  
  # MiMiC function to calculate minimal microbial consortia
	#this function compare the genome vector profile to the metagenome vector profile 
		scoreDiff_weighted <- function(genomes, metagenome,Minimal_consortia_size){
	  g <- as.matrix(genomes)
	  y <- as.matrix(metagenome)
	  mscore <- sum(y)
	  mscore1 <- sum(y)
	  res <- NULL
	 
	  for(i in 1:Minimal_consortia_size){
	    matchCount <- apply(g, 2, function(x) sum(y[x >= 1.0])) # ---> only count of Pfam match which present in genome and not in metagenome 
	    totalpfamG <- apply(g,2,function(x) sum(x>=1)) # total number of Pfam present in microbial genome 
	    misMatchCount <- totalpfamG - matchCount # number Of Pfams which are present in genome but not in Metagenome.
	    matchratio <- ((matchCount)/(matchCount+misMatchCount)) # (x)/(x+y) # ---> mimic_B             
	    matchratio[matchratio=="NaN"]<- 0 # because of some values become 0, when 0/0 it gives NaN
	    
	    bestHit <- matchCount[names(matchratio[matchratio==max(matchratio)])] #---> genome with maximum match count and maximum match mismatch ratio
	    SelectedBestHit <- names(bestHit[bestHit==max(bestHit)])[1] #---> names of the genome to be selected
	    
	    print(paste ("number of itreation" ,i,sep="=")) #---> printing number of iteration
	   
	    a<-g[,SelectedBestHit] #--> selected genome vector
	   
	    ascore <- bestHit[bestHit==max(bestHit)][1] # score
	   
	    pfamInGenomePerItration = unname(totalpfamG[SelectedBestHit]) # Total number of pfam used in the comparison 
	    exclusiveMatch <- unname(matchCount[SelectedBestHit]) # number of Pfam match to metagenome
	    exclusiveMismatchPerIteration <- unname(misMatchCount[SelectedBestHit]) # number of Pfam that did not match to the metagenome 
	    
	    print(paste("pfam",pfamInGenomePerItration,"exclusiveMatch",exclusiveMatch,"exclusiveMismatchPerIteration",
	                exclusiveMismatchPerIteration,sep = "=")) # Printing the summary on screen at each iteration
	    #print the genome with the name
	    #print (ascore)
	    #unname(ascore)
	    b <- as.character(SelectedBestHit) # name of the selected genome
	    print(b)
	    g <-g[,-which(colnames(g)==b),drop=F] # exclude selected genome.
	    if(i==1){
	      covered <- sum(y[a>=1])
	      rscore <- sum(y[a>=1])
	      percent <- (covered/mscore)*100
	      percent_nw = (rscore/mscore)*100  # pstege; report novel functions in percentage
	      #uncovered<-(mscore-covered)
	      #uncovered_Percentage <-(uncovered/mscore)
	    } else {
	      rscore <- sum(y[a>=1])
	      covered <- covered+rscore # cumulative Pfam number covered in metagenome
	      percent <- (covered/mscore)*100 # cumulative percentage of Pfam covered in metagenome 
	      percent_nw = (rscore/mscore)*100  # pstege; report novel functions in percentage
	      #uncovered<-(mscore-covered)
	      #uncovered_Percentage <-(uncovered/mscore)
	    }
	    
	    test <- names(y[a>=1,]) #  Pfams IDs common in genome and metagenome
	    #print(g)
	    g[test,] <- 0 # making all Pfam as "zero" in reference genome set ( those function were already taken ) 
	    y[a>=1]<-0 # makes all metagenome matching with highest scoring genome as '0'
	   	mscore1 <- sum(y) #number of pfam in metagenome
	   
	    if(rscore==0){break} # if no pfam match found further, then stop the script
	    #RelativeMatchPerIteration=(rscore / exclusiveMismatchPerIteration)
	    
	    res <- rbind(res,cbind.data.frame(bname=SelectedBestHit,
	                                      percentage=percent,score=rscore,ScorePerIteration=ascore,
	                                      pfamInGenomePerItration=pfamInGenomePerItration,
	                                      NovelFunctPerc=percent_nw,
	                                      #RelativeMatchPerIteration=RelativeMatchPerIteration,
	                                      exclusiveMismatchPerIteration=exclusiveMismatchPerIteration)) # making output dataframe
	    
	  }
	  return(res)
	}

 #compare reference genome to metaegenome
  scorePfam <- function(genomes, metagenome){
    	g <- as.matrix(genomes)
    	y <- metagenome	} 


  # from Sudarshan A. Shetty, & Leo Lahti. (2020, October). microbiomeutilities: Utilities for Microbiome Analytics
   func_aggregate_top_taxa2 <- function(x, top, level) {
   x <- aggregate_taxa(x, level)

   tops <- top_taxa(x, top)
   tax <- tax_table(x)

   inds <- which(!rownames(tax) %in% tops)

   tax[inds, level] <- "Other"

   tax_table(x) <- tax

   tt <- tax_table(x)[, level]
   tax_table(x) <- tax_table(tt)

   aggregate_taxa(x, level)
   }

# Total number of pfam and match to metagenome ----------------------------

  #total number of pfam in genome and metagenome 
  #	GenomeBinary <- GenomeVector
  #	row.names(GenomeBinary) <- GenomeBinary$PfamID # if coulmn name has the pfam ids and rownames does not exists.
  #	GenomeBinary$PfamID <- NULL

  colnames(GenomeVector)
  colnames(MetaGenome)
  total <- cbind(GenomeVector,MetaGenome)
  total <- as.data.frame(colSums(total))  #skip description folder.and sums up all pfam for each genome.

  colnames(total) <- "PfamTotal" #change name of columns # total number of pfams in each genome and metagenome.
  total$genomes <- rownames(total) #adding column name to total score file.
  rownames(total) <- NULL

  temp <- cbind.data.frame(old=c(total$genomes),neu=
                             c(rep("BacterialGenome",ncol(GenomeVector)),
                               rep("MetaGenome",ncol(MetaGenome)))) #taging bacterial genome and metaGenome
  total$name<- as.character(temp[match(total$genomes,temp$old),2]) # number of in all genomes and metagenome


# summary ---------------------------------------------

  # extract part final stats in summary file
  stats_summary = dplyr::select(finalStats,MetaGenome,Species, Percentage, NovelMatch, AbsoluteMatch, CumNovelMatch)
  colnames(stats_summary)=c("MetaGenome","Species","CumuPercentage","NovelMatch","AbsoluteMatch","CumuNovelMatch")


# Minimal consortia final selection ------------------------------------------
  Minimal_consortia_size

  #calc abund as sum of highest val in cum novel match
  df_abund= aggregate(stats_summary$CumuNovelMatch, 
			list(stats_summary$MetaGenome), FUN=max)  # calc pfam per metagenome

  colnames(df_abund)=c("MetaGenome","sample_sum_pfam")

  df_abund=plyr::join(dplyr::select(stats_summary,MetaGenome,Species,NovelMatch),
                      	df_abund,by="MetaGenome",type="left")

  df_abund$rel_score=df_abund$NovelMatch/df_abund$sample_sum_pfam # normalize novel match score by pfam per metagenome
  
  #prev filter
  filter_prev =  table(df_abund$Species) %>% as.data.frame()
  prevThreshold = 0.25* length(unique(df_abund$MetaGenome))          # variable prevalence threshold
  filter_prev = filter_prev[,"Var1"][(filter_prev$Freq >= prevThreshold)] %>% as.character()

  #leave one out
  df_specab = dplyr::select(df_abund,Species,rel_score)   
  df_specab = dplyr::group_by(df_specab,
				  Species) %>% dplyr::summarise_all(median)   # take sum of duplicate values
  df_specab=df_specab[df_specab$Species %in% filter_prev, ]                      # select only prev filtered

  while (length(df_specab$Species) > Minimal_consortia_size) {
    
    lowest_hit = df_specab[which.min(df_specab$rel_score),]
    lowest_hit <- as.character(lowest_hit$Species)                              # name of the selected genome

  
    df_specab=df_specab[!df_specab$Species %in% lowest_hit, ]                  # exclude selected genome.
  }


# heatmap plot -----------------------
#Plot the percentage novel functions per metagenome (x) and genome (y)

  phyl_plot = dplyr::select(finalStats,MetaGenome,Species, NovelFunctPerc)
  phyl_plot = spread(phyl_plot,MetaGenome,NovelFunctPerc)
  phyl_plot = na.zero(phyl_plot) #convert NA to zero
  plot_height=round((length(rownames(phyl_plot))/9)+2.5,digits=1)
  plot_width=round((length(colnames(phyl_plot))/9)+6,digits=1)
  rownames(phyl_plot) = phyl_plot$Species
  phyl_plot$Species= NULL
  phyl_plot= as.matrix(phyl_plot,rownames.value=rownames(phyl_plot))
  #phyl_plot = t(scale(t(phyl_plot)))   # Scale by row manually
  cols = colorRampPalette(c("white","#F0E442","darkorange","red2","darkred"))
  cols = cols(10)
  
  # heatmap without annotation
  p2=pheatmap(phyl_plot, labels_row=rownames(phyl_plot), border_color=F, color=cols,
		show_rownames=TRUE)
  
  ggsave(paste0(opt$root_dir,"/08_mimic_output/heatmap_species_perc_novel_funct.tiff"),
	plot=p2, units="in", width=plot_width, height=plot_height, dpi=300, compression = 'lzw')
    
  # heatmap row annotation
  myRows <- df_specab$Species                      	# rows to highlight
  row_idx <- which(rownames(phyl_plot) %in% myRows)      # font/styling settings
  fontsizes <- rep(10, nrow(phyl_plot))
  fontsizes[row_idx] <- 10
  fontcolors <- rep('black', nrow(phyl_plot))
  fontcolors[row_idx] <- 'black'
  fontfaces <- rep('plain',nrow(phyl_plot))
  fontfaces[row_idx] <- 'bold'
   rowAnno <- rowAnnotation(rows=anno_text(rownames(phyl_plot),
                          gp=gpar(fontsize=fontsizes, fontface = fontfaces,
                                  col = fontcolors)))  # Create text annotation  
  # heatmap boxes around our rows
  dend <- reorder(as.dendrogram(hclust(dist(phyl_plot))),
                  -rowMeans(phyl_plot), agglo.FUN = mean) # create our own row dendrogram
  
  dend_idx <- which(order.dendrogram(dend) %in% which(
    rownames(phyl_plot) %in% myRows))                     # Find rows in dendrogram
  
  btm <- 1 - (dend_idx / nrow(phyl_plot))                 # save position rows 
  top <- btm + (1/nrow(phyl_plot))
  box_col <- 'black'
  box_width <- 2
  
  # draw heatmap
  tiff(filename = paste0(opt$root_dir,"/08_mimic_output/heatmap_species_perc_novel_funct_rowanno.tiff"),
	units="in", width=plot_width, height=plot_height, compression='lzw', res=300)

  Heatmap(phyl_plot, name="ht",cluster_rows = dend, 
          right_annotation = rowAnno,col=cols,
          show_row_names = FALSE)              
  
  decorate_heatmap_body("ht", { for (i in 1:length(myRows)) {
    grid.lines(c(0, 1), c(top[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
    grid.lines(c(0, 1), c(btm[i],btm[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
    grid.lines(c(0, 0), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
    grid.lines(c(1, 1), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
  }
  })
  dev.off()

# Recalculate MM consortia using MiMiC function -------------------------

  mimic_recalc = plyr::join(df_specab, species_meta, by="Species", type="left")        # subset genome df
  mimic_recalc = GenomeVector[,which(colnames(GenomeVector) %in% mimic_recalc$BacterialGenome)]

  recalc_PfamTable <- as.data.frame(apply(as.data.frame(MetaGenome),2,
                                   function(x) scorePfam(as.data.frame(mimic_recalc),
				    x))) #adding score to pfam table
  
  recalc_PfamTable$BacterialGenome <- rownames(recalc_PfamTable)

  mimic_recalc = apply(as.data.frame(MetaGenome),
                        2,function(x) scoreDiff_weighted(as.data.frame(mimic_recalc),
                                                         x,Minimal_consortia_size)) # making a list of the mimic calculation
  
  mimic_recalc = ldply(mimic_recalc, data.frame) # converting list into dataframe
  names(mimic_recalc) <- c("MetaGenome","BacterialGenome","Percentage","Score","ScorePerIteration",
                   "pfamInGenomePerItration","NovelMatchPer","exclusiveMismatchPerIteration") #naming the column in the dataframe
  
  mimic_recalc = mimic_recalc %>% 
    dplyr::group_by(MetaGenome) %>% 
    dplyr::mutate(Index=1:length(MetaGenome)) # adding the rank of species in metagenome
  
  #To plot metagenome coverage  
  p1 <- ggplot(mimic_recalc,aes(Index,Percentage,group=MetaGenome,color=MetaGenome))+ 
    geom_point()+ geom_line()+
    scale_x_continuous(breaks = seq(0, Minimal_consortia_size, by = 2))+
    theme(legend.position = "none")+
    ylab("Functions covered (%)") + xlab("Bacterial species index")# plot for pfam coverage of metagenome by minimal consortia
  
     ggsave(paste0(opt$root_dir,"/08_mimic_output/pfam_coverage_plot_mmconsortia.tiff"),
	plot=p1, units="in", width=14, height=8, dpi=300, compression = 'lzw')

  #continue collecting table
  mimic_recalc = dplyr::left_join(mimic_recalc,total[,c("PfamTotal","genomes")], by=c("BacterialGenome" = "genomes")) %>%
              dplyr::left_join(.,reshape2::melt(recalc_PfamTable,
                                      id.vars="BacterialGenome",
                                      value.name="AbsoluteMatch",variable.name="MetaGenome"),
				      by=c("BacterialGenome","MetaGenome")) %>%
               dplyr::rename(NovelMatch = Score, BactTotalPfam = PfamTotal) %>%
              dplyr::group_by(MetaGenome) %>% 
            dplyr::mutate(AbsoluteMismatch = BactTotalPfam - AbsoluteMatch,
                  CumAbsoluteMismatch = cumsum(AbsoluteMismatch),
                  #AbsoluteRelativeMatch = AbsoluteMatch/AbsoluteMismatch,
                  CumNovelMatch = cumsum(NovelMatch)
                  #CumAbsoluteRelativeMatch = cumsum(AbsoluteRelativeMatch),
                  #CumExclusiveMismatchPerIteration = cumsum(exclusiveMismatchPerIteration),
                  #CumIterationRelativeMatch = cumsum(RelativeMatchPerIteration),
    ) %>% ungroup()
  
  
  # add species names
  mimic_recalc$BacterialGenome = sub('^([^.]+.[^.]+).*', '\\1',mimic_recalc$BacterialGenome)		
  mimic_recalc= plyr::join(mimic_recalc,species_meta, by="BacterialGenome", type="left") # add species names

  # heatmap plot minimal consortia  -----------------------
  #Plot the percentage novel functions. 
  phyl_plot = dplyr::select(mimic_recalc,MetaGenome,Species, NovelMatchPer)
  phyl_plot = spread(phyl_plot,MetaGenome,NovelMatchPer)
  phyl_plot = na.zero(phyl_plot) #convert NA to zero
  plot_height=round((length(rownames(phyl_plot))/10)+2.5,digits=1)
  plot_width=round((length(colnames(phyl_plot))/12)+2,digits=1)
    rownames(phyl_plot) = phyl_plot$Species
  phyl_plot$Species= NULL
  phyl_plot= as.matrix(phyl_plot)
  #phyl_plot = t(scale(t(phyl_plot)))   # Scale by row manually
  cols = colorRampPalette(c("white","#F0E442","darkorange","red2","darkred"))
  cols = cols(10)
  
  # heatmap row annotation
  myRows <- df_specab$Species                      # rows to highlight
  row_idx <- which(rownames(phyl_plot) %in% myRows)      # font/styling settings
  fontsizes <- rep(10, nrow(phyl_plot))
  fontsizes[row_idx] <- 10
  fontcolors <- rep('black', nrow(phyl_plot))
  fontcolors[row_idx] <- 'black'
  fontfaces <- rep('plain',nrow(phyl_plot))
  fontfaces[row_idx] <- 'plain'
  
  rowAnno <- rowAnnotation(rows=anno_text(rownames(phyl_plot),
                                          gp=gpar(fontsize=fontsizes, fontface = fontfaces,
                                                  col = fontcolors)))  # Create text annotation
  
  # heatmap boxes around our rows
  dend <- reorder(as.dendrogram(hclust(dist(phyl_plot))),
                  -rowMeans(phyl_plot), agglo.FUN = mean) # create our own row dendrogram
  
  dend_idx <- which(order.dendrogram(dend) %in% which(
    rownames(phyl_plot) %in% myRows))                     # Find rows in dendrogram
  
  btm <- 1 - (dend_idx / nrow(phyl_plot))                 # save position rows 
  top <- btm + (1/nrow(phyl_plot))
  box_col <- 'black'
  box_width <- 1
  
  # draw heatmap
  tiff(filename = paste0(opt$root_dir,"/08_mimic_output/heatmap_mmconsort_perc_novel_funct.tiff"),
	units="in", width=plot_width, height=plot_height, compression='lzw', res=300)
  
  Heatmap(phyl_plot, name="ht",cluster_rows = dend, 
          right_annotation = rowAnno,col=cols,
          show_column_names = FALSE, column_title = "Metagenomic samples",
          column_title_side = c("bottom"),
          show_row_names = FALSE)              
  
  decorate_heatmap_body("ht", { for (i in 1:length(myRows)) {
    grid.lines(c(0, 1), c(top[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
    grid.lines(c(0, 1), c(btm[i],btm[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
    grid.lines(c(0, 0), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
    grid.lines(c(1, 1), c(btm[i],top[i]), gp = gpar(lty = 1, lwd = box_width, col = box_col))
  }
  })
    dev.off()

# prepare environment for Kofam part -----------------------
  dir.create(paste0(opt$root_dir,"/12_kofam_anno"))
  dir.create(paste0(opt$root_dir,"/12_kofam_anno/single"))
  dir.create(paste0(opt$root_dir,"/12_kofam_anno/meta"))

  kofam = list.files(path=paste0(opt$root_dir,"/08_mimic_output/output_faa_single"), 
                      pattern = ".faa")    			    	# list samples

  write.table(kofam,file=paste0(opt$root_dir,"/12_kofam_anno/single/all_samples.tab"),
		sep = "\t",row.names=FALSE, quote=FALSE, col.names=F)	# export file

  kofam = list.files(path=paste0(opt$root_dir,"/08_mimic_output/output_faa_meta"), 
                      pattern = ".faa")    			    	# list samples

  write.table(kofam,file=paste0(opt$root_dir,"/12_kofam_anno/meta/all_samples.tab"),
		sep = "\t",row.names=FALSE, quote=FALSE, col.names=F)	# export file


# Plot relative abundance species annotated -------------------------------------
 
  #Plot species grouped by feed
  phyl_plot = readRDS(paste(opt$root_dir,"06_taxa_output",
 	  "physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/"))

  phyl_plot = prune_taxa(taxa_sums(phyl_plot) > 1, phyl_plot) #remove taxa lacking reads

  #Merge species and genus name
  tax_table(phyl_plot)[,"Species"] = paste(tax_table(phyl_plot)[,"Genus"],
                                         tax_table(phyl_plot)[,"Species"],
                                         sep="") #paste genus name to species one

  tax_table(phyl_plot)[,"Species"] = gsub("g_", "",tax_table(phyl_plot)[,"Species"])
  tax_table(phyl_plot)[,"Species"] = gsub("s_", " ",tax_table(phyl_plot)[,"Species"]) 
  taxa_names(phyl_plot) = tax_table(phyl_plot)[,"Species"] # rename identifyer by species
  
  #select settings
  phyl_plot = func_aggregate_top_taxa2(phyl_plot, "Species", top = 12) #reduce to species
  phyl_plot = microbiome::transform(phyl_plot, "compositional") #compositional
  mm_taxa = sub("^(\\S*\\s+\\S+).*", "\\1", df_specab$Species)		# extract species MM consortia
  taxa_names(phyl_plot) = sapply(tax_table(phyl_plot)[,"Species"],
                                 function(name){ifelse(name %in% mm_taxa,
								paste0("**",name,"**"),name)},
                                 USE.NAMES = FALSE) # highlight names MM consortia in bold                  

  order = names(sort(taxa_sums(phyl_plot), F)[1:13]) 
  order = order[order != "Other"]
  order = c("Other",order)

  # plotting
  plot_composition(phyl_plot,
                sample.sort = NULL,
                otu.sort=order, x.label="Sample") +
		   geom_bar(position="stack", stat="identity", color=NA)+
                theme_bw() + 
                labs(title="Top 12 abundant bacterial species")+
                ylab( "Abundance (%)")+ xlab( "Sample ID")+
                theme(legend.justification  = "top",
                legend.text = element_markdown(),				# required to highlight MM consortia in bold
                legend.title = element_blank(),
                legend.key.size = unit(4.5, 'mm'),
                plot.title = element_text(size = 12),
                axis.title.y = element_text(color="Grey1", size=13, face="bold"),
                axis.text.y =element_text(color="Grey1", size=11),
                axis.text.x =element_text(color="Grey1", size=9, angle = 90, vjust = 0.5),
                axis.title.x = element_text(color="Grey1", size = 13, face="bold"),
                panel.grid.major = element_blank(),
                axis.ticks = element_blank())

  ggsave(paste(opt$root_dir,"06_taxa_output","abund_kraken_top12_spec_annot.tiff",sep="/"),
          units="in", width=18, height=8, dpi=300, compression = 'lzw')
        
cat("--------------- R script completed ---------------")
cat("\n\n")

	end_time <- Sys.time()
  cat("Duration of :")
	end_time - start_time

cat("\n\n")

cat("--------------- Bacteria of interest ---------------")
cat("\n\n")

cat(paste0("aimed for minimal consortia size of: ",Minimal_consortia_size))
cat("\n")

cat(paste0("resulted in consortia size of : ",length(unique(df_specab$Species))))
cat("\n\n")

print(unique(df_specab$Species))
cat("\n\n")

cat("Please open the MiMiC_summary-report to interpret the significance") 
cat("\n")