From dfa93e7b40fb991fa4b241738c46e00bbd3f16fc Mon Sep 17 00:00:00 2001 From: "Stege, Paul" <paul.stege@wur.nl> Date: Thu, 3 Nov 2022 17:03:04 +0000 Subject: [PATCH] Upload New File --- 06_R_abund_prev_analysis.R | 228 +++++++++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 06_R_abund_prev_analysis.R diff --git a/06_R_abund_prev_analysis.R b/06_R_abund_prev_analysis.R new file mode 100644 index 0000000..0fe4635 --- /dev/null +++ b/06_R_abund_prev_analysis.R @@ -0,0 +1,228 @@ +#============================================================================================= +#author Paul Stege +#date 11-2022 +#script Analyse output kraken for abundance and prevalence +#software R ; R Core Team 2022 +#package P.J. McMurdie and S. Holmes, 2012 ; phyloseq +# L. Lahti and S. Shetty, 2017 ; microbiome +#============================================================================================= +start_time <- Sys.time() + +# install packages -------------------------------------------------------- + + paks <- c("dplyr","plyr","tidyverse","data.table", "phyloseq","ggplot2", + "microbiome","optparse") + + paks<-lapply(paks, function(x) suppressMessages(require(x, character.only = TRUE))) + rm(paks) + + +# load function -------------------------------------------------------- + # from Sudarshan A. Shetty, & Leo Lahti. (2020, October). microbiomeutilities: Utilities for Microbiome Analytics + func_aggregate_top_taxa2 <- function(x, top, level) { + x <- aggregate_taxa(x, level) + + tops <- top_taxa(x, top) + tax <- tax_table(x) + + inds <- which(!rownames(tax) %in% tops) + + tax[inds, level] <- "Other" + + tax_table(x) <- tax + + tt <- tax_table(x)[, level] + tax_table(x) <- tax_table(tt) + + aggregate_taxa(x, level) + } + + +# Configure scrpt settings --------------------------------------------------------- + # script format is Rscript -a abundance_cutoff -p prevelance_cutoff -r root_dir -i input_biom_file + + # apply package "optparse" + option_list = list( + make_option(c("-a", "--abundance_cutoff"), type="numeric", default=0.5, + help="abundance cutoff on scale of 0-1, where 0.0001 represents 0.01%", metavar="number"), + make_option(c("-p", "--prevalence_cutoff"), type="numeric", default=0.0001, + help="prevalencecutoff on scale of 0-1, where 0.5 represents 50%", metavar="number"), + make_option(c("-r", "--root_dir"), type="character", default="/home/", + help="directory for this project, containing all folders for reading and exporting data [default= %default]", metavar="character"), + make_option(c("-i", "--input_biom_file"), type="character", default="*.biom", + help="input file, which is the biom kraken output file [default= %default]", metavar="character") + ) + + opt_parser = OptionParser(option_list=option_list) + opt = parse_args(opt_parser) + +### 1 - Load data ------------------------------------------------------------------ + + #import kraken biom file as phyloseq + print(paste("importing from ", # test if variables set correctly + paste(opt$root_dir,"05_kraken_testset",opt$input_biom_file, sep="/"), + sep="")) + + phyl_kraken = import_biom(paste(opt$root_dir,"05_kraken_anno",opt$input_biom_file, sep="/")) + + #adjust otu file + phyl_otu = as(otu_table(phyl_kraken),"matrix") %>% as.data.frame() + rownames(phyl_otu)=paste("OTU",rownames(phyl_otu), sep="") + + #adjust taxonomy + phyl_tax = as(tax_table(phyl_kraken),"matrix") %>% as.data.frame() # check tax file + colnames(phyl_tax)=c("Kingdom","Phylum","Class","Order","Family","Genus","Species") + phyl_tax$OTUID=paste("OTU",rownames(phyl_tax), sep="") + phyl_tax = sapply(phyl_tax, function(x) {gsub("__","_", as.character(x))}) %>% as.data.frame() + rownames(phyl_tax)=phyl_tax$OTUID + phyl_tax$Phylum = gsub('^p_$', "", phyl_tax$Phylum, ignore.case=T) # replace empty p_ + phyl_tax$Class = gsub('^c_$', "", phyl_tax$Class, ignore.case=T) # replace empty c_ + phyl_tax$Order = gsub('^o_$', "", phyl_tax$Order, ignore.case=T) # replace empty o_ + phyl_tax$Family = gsub('^f_$', "", phyl_tax$Family, ignore.case=T) # replace empty f_ + phyl_tax$Genus = gsub('^g_$', "", phyl_tax$Genus, ignore.case=T) # replace empty g_ + phyl_tax$Species = gsub('^s_$', "", phyl_tax$Species, ignore.case=T) # replace empty s_ + + #build mapping file + phyl_met = read.delim(paste(opt$root,"all_samples.tab",sep="/"), header = FALSE) + colnames(phyl_met)="sample_ID" + rownames(phyl_met)=phyl_met$sample_ID + + #shape phyloseq object + phyl_otu = otu_table(phyl_otu, taxa_are_rows = TRUE) # otu file + phyl_tax = tax_table(as.matrix(phyl_tax)) # taxonomy file + phyl_met = sample_data(phyl_met, errorIfNULL = T) # mapping file + + phyl_kraken = phyloseq(phyl_otu, phyl_tax) # create test phyloseq object + phyl_kraken + + phyl_kraken <- phyloseq(phyl_otu, phyl_tax, phyl_met) # including mapping + phyl_kraken + + #check files and export + dir.create(file.path( + paste(opt$root,"06_taxa_output",sep="/"))) # create directory + + write.table( # write otu file + (as(otu_table(phyl_kraken),"matrix") %>% as.data.frame()), + paste(opt$root,"06_taxa_output","phyl_tax.tab",sep="/"), + sep="\t", row.names = T, col.names=T) + + write.table( # write tax file + (as(tax_table(phyl_kraken),"matrix") %>% as.data.frame()), + paste(opt$root,"06_taxa_output","phyl_tax.tab",sep="/"), + sep="\t", row.names = T, col.names=T) + + saveRDS(phyl_kraken, + paste(opt$root,"06_taxa_output", + "physeq_kraken_reads_conf03_refseq.rds",sep="/")) # phyloseq file + + #collapse to BACTERIAL species level tax_glom + phyl_kraken = subset_taxa(phyl_kraken, Kingdom %in% "k_Bacteria") #only bacteria + phyl_glom = tax_glom(phyl_kraken, "Species",NArm=T) # collapse species, remove unassigned + + saveRDS(phyl_glom, + paste(opt$root,"06_taxa_output", + "physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/")) # phyloseq file + +### 2 - Prevalence and abudance cutoff -------------------------------------------- + + #load data + phyl_filter= readRDS(paste(opt$root,"06_taxa_output", + "physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/")) + + print(paste("total number of species before cutoffs = ", # number of unique species + length(taxa_sums(phyl_filter)),sep="")) + #Filter prevalence + tax_table = as(tax_table(phyl_filter),"matrix") %>% as.data.frame() + + filter_prev = data.frame(Prevalence = + apply(X = otu_table(phyl_filter), + MARGIN = ifelse(taxa_are_rows(phyl_filter), yes = 1, no = 2), + FUN = function(x){sum(x > 0)}), + tax_table) + + prevThreshold = opt$prevalence_cutoff* nsamples(phyl_filter) # variable prevalence threshold + keepTaxa = rownames(filter_prev)[(filter_prev >= prevThreshold)] + phyl_ancom = prune_taxa(keepTaxa, phyl_filter) + + print( # number of unique species + paste("total number of species after a prevalence cutoff of ", + opt$prevalence_cutoff*100,"%, is: ", + length(taxa_sums(phyl_ancom)),sep="")) + + + #filter abundance + minTotRelAbun = opt$abundance_cutoff # variable abundance threshold + sum_taxa = taxa_sums(phyl_ancom) + keepTaxa = taxa_names(phyl_ancom)[which((sum_taxa / sum(sum_taxa)) > minTotRelAbun)] + phyl_ancom = prune_taxa(keepTaxa, phyl_ancom) + spec_list = as(tax_table(phyl_ancom),"matrix")%>% as.data.frame() # check map file + print( # number of unique species + paste("total number of species before after additional abundance cutoff of ", + opt$abundance_cutoff*100,"%, is: ", + length(spec_list$Species),sep="")) + + #Merge species and genus name + spec_list[,"Species"] = paste(spec_list[,"Genus"], + spec_list[,"Species"],sep="") #paste genus name to species one + + spec_list[,"Species"] = gsub("g_", "",spec_list[,"Species"]) + spec_list[,"Species"] = gsub("s_", " ",spec_list[,"Species"]) + spec_list=spec_list$Species + + write.table(spec_list, # write species list + paste(opt$root,"06_taxa_output","spec_list_core_microbiota.tab",sep="/"), + sep="\t", row.names = F, col.names=F,quote = F) + + +### 3 - Plot relative abundance species ------------------------------------- + + #Plot species grouped by feed + phyl_plot = readRDS(paste(opt$root,"06_taxa_output", + "physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/")) + + phyl_plot = prune_taxa(taxa_sums(phyl_plot) > 1, phyl_plot) #remove taxa lacking reads + + #Merge species and genus name + tax_table(phyl_plot)[,"Species"] = paste(tax_table(phyl_plot)[,"Genus"], + tax_table(phyl_plot)[,"Species"], + sep="") #paste genus name to species one + + tax_table(phyl_plot)[,"Species"] = gsub("g_", "",tax_table(phyl_plot)[,"Species"]) + tax_table(phyl_plot)[,"Species"] = gsub("s_", " ",tax_table(phyl_plot)[,"Species"]) + taxa_names(phyl_plot) = tax_table(phyl_plot)[,"Species"] # rename identifyer by species + + #select settings + phyl_plot = func_aggregate_top_taxa2(phyl_plot, "Species", top = 12) #reduce to species + phyl_plot = microbiome::transform(phyl_plot, "compositional") #compositional + order = names(sort(taxa_sums(phyl_plot), F)[1:13]) + order = order[order != "Other"] + order = c("Other",order) + + # plotting + plot_composition(phyl_plot, + sample.sort = NULL, + otu.sort=order, x.label="Sample") + + geom_bar(position="stack", stat="identity", color=NA)+ + theme_bw() + + labs(title="Top 12 abundant bacterial species")+ + ylab( "Abundance (%)")+ xlab( "Sample ID")+ + theme(legend.justification = "top", + legend.title = element_blank(), + legend.key.size = unit(4.5, 'mm'), + plot.title = element_text(size = 12), + axis.title.y = element_text(color="Grey1", size=13, face="bold"), + axis.text.y =element_text(color="Grey1", size=11), + axis.text.x =element_text(color="Grey1", size=9, angle = 90, vjust = 0.5), + axis.title.x = element_text(color="Grey1", size = 13, face="bold"), + panel.grid.major = element_blank(), + axis.ticks = element_blank()) + + ggsave(paste(opt$root,"06_taxa_output","abund_kraken_top12_spec.tiff",sep="/"), + units="in", width=18, height=8, dpi=300, compression = 'lzw') + +########### Ends here ################################ + end_time <- Sys.time() + print(" Duration of R processing:") + end_time - start_time + # \ No newline at end of file -- GitLab