Upload New File

dfa93e7b · Stege, Paul · a43a750b · dfa93e7b
Commit dfa93e7b authored 2 years ago by Stege, Paul
--- a/06_R_abund_prev_analysis.R
+++ b/06_R_abund_prev_analysis.R
+#=============================================================================================
+#author			Paul Stege
+#date			11-2022
+#script			Analyse output kraken for abundance and prevalence
+#software		R ; R Core Team 2022
+#package		P.J. McMurdie and S. Holmes, 2012 ; phyloseq
+#			L. Lahti and S. Shetty, 2017 ; microbiome
+#=============================================================================================
+start_time <- Sys.time()
+
+# install packages --------------------------------------------------------
+  
+  paks <- c("dplyr","plyr","tidyverse","data.table", "phyloseq","ggplot2",
+            "microbiome","optparse")
+
+  paks<-lapply(paks, function(x) suppressMessages(require(x, character.only = TRUE)))
+  rm(paks)
+
+
+# load function --------------------------------------------------------
+  # from Sudarshan A. Shetty, & Leo Lahti. (2020, October). microbiomeutilities: Utilities for Microbiome Analytics
+   func_aggregate_top_taxa2 <- function(x, top, level) {
+   x <- aggregate_taxa(x, level)
+
+   tops <- top_taxa(x, top)
+   tax <- tax_table(x)
+
+   inds <- which(!rownames(tax) %in% tops)
+
+   tax[inds, level] <- "Other"
+
+   tax_table(x) <- tax
+
+   tt <- tax_table(x)[, level]
+   tax_table(x) <- tax_table(tt)
+
+   aggregate_taxa(x, level)
+   }
+
+ 
+# Configure scrpt settings ---------------------------------------------------------
+  # script format is Rscript -a abundance_cutoff -p prevelance_cutoff -r root_dir -i input_biom_file
+ 
+  # apply package "optparse"
+  option_list = list(
+    		make_option(c("-a", "--abundance_cutoff"), type="numeric", default=0.5, 
+                	help="abundance cutoff on scale of 0-1, where 0.0001 represents 0.01%", metavar="number"),
+    		make_option(c("-p", "--prevalence_cutoff"), type="numeric", default=0.0001, 
+                	help="prevalencecutoff on scale of 0-1, where 0.5 represents 50%", metavar="number"),
+  		make_option(c("-r", "--root_dir"), type="character", default="/home/", 
+       	          	help="directory for this project, containing all folders for reading and exporting data [default= %default]", metavar="character"),
+  		make_option(c("-i", "--input_biom_file"), type="character", default="*.biom", 
+       	          	help="input file, which is the biom kraken output file [default= %default]", metavar="character")
+    			)
+ 
+  opt_parser = OptionParser(option_list=option_list)
+  opt = parse_args(opt_parser)
+
+### 1 - Load data ------------------------------------------------------------------
+  
+  #import kraken biom file as phyloseq
+  print(paste("importing from ",						  # test if variables set correctly
+	paste(opt$root_dir,"05_kraken_testset",opt$input_biom_file, sep="/"),
+		 sep=""))
+
+  phyl_kraken = import_biom(paste(opt$root_dir,"05_kraken_anno",opt$input_biom_file, sep="/"))
+
+  #adjust otu file
+  phyl_otu = as(otu_table(phyl_kraken),"matrix") %>% as.data.frame()
+  rownames(phyl_otu)=paste("OTU",rownames(phyl_otu), sep="")
+
+  #adjust taxonomy
+  phyl_tax = as(tax_table(phyl_kraken),"matrix") %>% as.data.frame()   # check tax file 
+  colnames(phyl_tax)=c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
+  phyl_tax$OTUID=paste("OTU",rownames(phyl_tax), sep="")
+  phyl_tax = sapply(phyl_tax, function(x) {gsub("__","_", as.character(x))}) %>% as.data.frame()
+  rownames(phyl_tax)=phyl_tax$OTUID
+  phyl_tax$Phylum = gsub('^p_$', "", phyl_tax$Phylum, ignore.case=T)	# replace empty p_
+  phyl_tax$Class = gsub('^c_$', "", phyl_tax$Class, ignore.case=T)	# replace empty c_
+  phyl_tax$Order = gsub('^o_$', "", phyl_tax$Order, ignore.case=T)	# replace empty o_
+  phyl_tax$Family = gsub('^f_$', "", phyl_tax$Family, ignore.case=T)	# replace empty f_
+  phyl_tax$Genus = gsub('^g_$', "", phyl_tax$Genus, ignore.case=T)	# replace empty g_
+  phyl_tax$Species = gsub('^s_$', "", phyl_tax$Species, ignore.case=T)	# replace empty s_
+
+  #build mapping file
+  phyl_met = read.delim(paste(opt$root,"all_samples.tab",sep="/"), header = FALSE)
+  colnames(phyl_met)="sample_ID"
+  rownames(phyl_met)=phyl_met$sample_ID
+
+  #shape phyloseq object
+  phyl_otu = otu_table(phyl_otu, taxa_are_rows = TRUE)                  # otu file
+  phyl_tax = tax_table(as.matrix(phyl_tax))                             # taxonomy file
+  phyl_met = sample_data(phyl_met, errorIfNULL = T)                     # mapping file
+
+  phyl_kraken = phyloseq(phyl_otu, phyl_tax)                            # create test phyloseq object
+  phyl_kraken 
+
+  phyl_kraken <- phyloseq(phyl_otu, phyl_tax, phyl_met)                 # including mapping 
+  phyl_kraken
+
+  #check files and export
+  dir.create(file.path(
+	     paste(opt$root,"06_taxa_output",sep="/"))) 		        # create directory
+
+  write.table(                                                          # write otu file
+	      (as(otu_table(phyl_kraken),"matrix") %>% as.data.frame()),
+	       paste(opt$root,"06_taxa_output","phyl_tax.tab",sep="/"),
+                sep="\t", row.names = T, col.names=T)
+
+  write.table(                                                          # write tax file
+	      (as(tax_table(phyl_kraken),"matrix") %>% as.data.frame()),
+	       paste(opt$root,"06_taxa_output","phyl_tax.tab",sep="/"),
+                sep="\t", row.names = T, col.names=T)
+
+  saveRDS(phyl_kraken,
+	  paste(opt$root,"06_taxa_output",
+ 	  "physeq_kraken_reads_conf03_refseq.rds",sep="/"))     # phyloseq file
+
+  #collapse to BACTERIAL species level tax_glom
+  phyl_kraken = subset_taxa(phyl_kraken, Kingdom %in% "k_Bacteria")     #only bacteria
+  phyl_glom = tax_glom(phyl_kraken, "Species",NArm=T)                   # collapse species, remove unassigned
+  
+  saveRDS(phyl_glom,
+	  paste(opt$root,"06_taxa_output",
+ 	  "physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/")) # phyloseq file
+
+### 2 - Prevalence and abudance cutoff --------------------------------------------
+ 
+  #load data
+  phyl_filter= readRDS(paste(opt$root,"06_taxa_output",
+ 	  "physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/"))
+
+  print(paste("total number of species before cutoffs = ", 		# number of unique species
+		 length(taxa_sums(phyl_filter)),sep=""))
+  #Filter prevalence
+  tax_table = as(tax_table(phyl_filter),"matrix") %>% as.data.frame()
+
+  filter_prev = data.frame(Prevalence =
+                           apply(X = otu_table(phyl_filter),
+                           MARGIN = ifelse(taxa_are_rows(phyl_filter), yes = 1, no = 2),
+                           FUN = function(x){sum(x > 0)}),
+                           tax_table)
+
+  prevThreshold = opt$prevalence_cutoff* nsamples(phyl_filter)          # variable prevalence threshold
+  keepTaxa = rownames(filter_prev)[(filter_prev  >= prevThreshold)]
+  phyl_ancom = prune_taxa(keepTaxa, phyl_filter)
+
+  print(								# number of unique species
+	paste("total number of species after a prevalence cutoff of ",
+	       opt$prevalence_cutoff*100,"%, is: ",
+		 length(taxa_sums(phyl_ancom)),sep=""))
+
+
+  #filter abundance
+  minTotRelAbun = opt$abundance_cutoff                                  # variable abundance threshold
+  sum_taxa = taxa_sums(phyl_ancom)
+  keepTaxa = taxa_names(phyl_ancom)[which((sum_taxa / sum(sum_taxa)) > minTotRelAbun)]
+  phyl_ancom = prune_taxa(keepTaxa, phyl_ancom)
+  spec_list = as(tax_table(phyl_ancom),"matrix")%>% as.data.frame() # check map file 
+  print(								# number of unique species
+	paste("total number of species before after additional abundance cutoff of ",
+	       opt$abundance_cutoff*100,"%, is: ",
+		 length(spec_list$Species),sep=""))
+
+  #Merge species and genus name
+  spec_list[,"Species"] = paste(spec_list[,"Genus"],
+                             spec_list[,"Species"],sep="") #paste genus name to species one
+
+  spec_list[,"Species"] = gsub("g_", "",spec_list[,"Species"])
+  spec_list[,"Species"] = gsub("s_", " ",spec_list[,"Species"]) 
+  spec_list=spec_list$Species
+
+  write.table(spec_list,						# write species list
+	       paste(opt$root,"06_taxa_output","spec_list_core_microbiota.tab",sep="/"),
+                sep="\t", row.names = F, col.names=F,quote = F)
+
+
+### 3 - Plot relative abundance species -------------------------------------
+ 
+  #Plot species grouped by feed
+  phyl_plot = readRDS(paste(opt$root,"06_taxa_output",
+ 	  "physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/"))
+
+  phyl_plot = prune_taxa(taxa_sums(phyl_plot) > 1, phyl_plot) #remove taxa lacking reads
+
+  #Merge species and genus name
+  tax_table(phyl_plot)[,"Species"] = paste(tax_table(phyl_plot)[,"Genus"],
+                                         tax_table(phyl_plot)[,"Species"],
+                                         sep="") #paste genus name to species one
+
+  tax_table(phyl_plot)[,"Species"] = gsub("g_", "",tax_table(phyl_plot)[,"Species"])
+  tax_table(phyl_plot)[,"Species"] = gsub("s_", " ",tax_table(phyl_plot)[,"Species"]) 
+  taxa_names(phyl_plot) = tax_table(phyl_plot)[,"Species"] # rename identifyer by species
+
+  #select settings
+  phyl_plot = func_aggregate_top_taxa2(phyl_plot, "Species", top = 12) #reduce to species
+  phyl_plot = microbiome::transform(phyl_plot, "compositional") #compositional
+  order = names(sort(taxa_sums(phyl_plot), F)[1:13]) 
+  order = order[order != "Other"]
+  order = c("Other",order)
+
+  # plotting
+  plot_composition(phyl_plot,
+                sample.sort = NULL,
+                otu.sort=order, x.label="Sample") +
+		geom_bar(position="stack", stat="identity", color=NA)+
+                theme_bw() + 
+                labs(title="Top 12 abundant bacterial species")+
+                ylab( "Abundance (%)")+ xlab( "Sample ID")+
+                theme(legend.justification  = "top",
+                legend.title = element_blank(),
+                legend.key.size = unit(4.5, 'mm'),
+                plot.title = element_text(size = 12),
+                axis.title.y = element_text(color="Grey1", size=13, face="bold"),
+                axis.text.y =element_text(color="Grey1", size=11),
+                axis.text.x =element_text(color="Grey1", size=9, angle = 90, vjust = 0.5),
+                axis.title.x = element_text(color="Grey1", size = 13, face="bold"),
+                panel.grid.major = element_blank(),
+                axis.ticks = element_blank())
+
+  ggsave(paste(opt$root,"06_taxa_output","abund_kraken_top12_spec.tiff",sep="/"),
+          units="in", width=18, height=8, dpi=300, compression = 'lzw')
+
+########### Ends here    ################################
+	end_time <- Sys.time()
+  print(" Duration of R processing:")
+	end_time - start_time
+	# 
\ No newline at end of file