Skip to content
Snippets Groups Projects
Commit dfa93e7b authored by Stege, Paul's avatar Stege, Paul
Browse files

Upload New File

parent a43a750b
No related branches found
No related tags found
No related merge requests found
#=============================================================================================
#author Paul Stege
#date 11-2022
#script Analyse output kraken for abundance and prevalence
#software R ; R Core Team 2022
#package P.J. McMurdie and S. Holmes, 2012 ; phyloseq
# L. Lahti and S. Shetty, 2017 ; microbiome
#=============================================================================================
start_time <- Sys.time()
# install packages --------------------------------------------------------
paks <- c("dplyr","plyr","tidyverse","data.table", "phyloseq","ggplot2",
"microbiome","optparse")
paks<-lapply(paks, function(x) suppressMessages(require(x, character.only = TRUE)))
rm(paks)
# load function --------------------------------------------------------
# from Sudarshan A. Shetty, & Leo Lahti. (2020, October). microbiomeutilities: Utilities for Microbiome Analytics
func_aggregate_top_taxa2 <- function(x, top, level) {
x <- aggregate_taxa(x, level)
tops <- top_taxa(x, top)
tax <- tax_table(x)
inds <- which(!rownames(tax) %in% tops)
tax[inds, level] <- "Other"
tax_table(x) <- tax
tt <- tax_table(x)[, level]
tax_table(x) <- tax_table(tt)
aggregate_taxa(x, level)
}
# Configure scrpt settings ---------------------------------------------------------
# script format is Rscript -a abundance_cutoff -p prevelance_cutoff -r root_dir -i input_biom_file
# apply package "optparse"
option_list = list(
make_option(c("-a", "--abundance_cutoff"), type="numeric", default=0.5,
help="abundance cutoff on scale of 0-1, where 0.0001 represents 0.01%", metavar="number"),
make_option(c("-p", "--prevalence_cutoff"), type="numeric", default=0.0001,
help="prevalencecutoff on scale of 0-1, where 0.5 represents 50%", metavar="number"),
make_option(c("-r", "--root_dir"), type="character", default="/home/",
help="directory for this project, containing all folders for reading and exporting data [default= %default]", metavar="character"),
make_option(c("-i", "--input_biom_file"), type="character", default="*.biom",
help="input file, which is the biom kraken output file [default= %default]", metavar="character")
)
opt_parser = OptionParser(option_list=option_list)
opt = parse_args(opt_parser)
### 1 - Load data ------------------------------------------------------------------
#import kraken biom file as phyloseq
print(paste("importing from ", # test if variables set correctly
paste(opt$root_dir,"05_kraken_testset",opt$input_biom_file, sep="/"),
sep=""))
phyl_kraken = import_biom(paste(opt$root_dir,"05_kraken_anno",opt$input_biom_file, sep="/"))
#adjust otu file
phyl_otu = as(otu_table(phyl_kraken),"matrix") %>% as.data.frame()
rownames(phyl_otu)=paste("OTU",rownames(phyl_otu), sep="")
#adjust taxonomy
phyl_tax = as(tax_table(phyl_kraken),"matrix") %>% as.data.frame() # check tax file
colnames(phyl_tax)=c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
phyl_tax$OTUID=paste("OTU",rownames(phyl_tax), sep="")
phyl_tax = sapply(phyl_tax, function(x) {gsub("__","_", as.character(x))}) %>% as.data.frame()
rownames(phyl_tax)=phyl_tax$OTUID
phyl_tax$Phylum = gsub('^p_$', "", phyl_tax$Phylum, ignore.case=T) # replace empty p_
phyl_tax$Class = gsub('^c_$', "", phyl_tax$Class, ignore.case=T) # replace empty c_
phyl_tax$Order = gsub('^o_$', "", phyl_tax$Order, ignore.case=T) # replace empty o_
phyl_tax$Family = gsub('^f_$', "", phyl_tax$Family, ignore.case=T) # replace empty f_
phyl_tax$Genus = gsub('^g_$', "", phyl_tax$Genus, ignore.case=T) # replace empty g_
phyl_tax$Species = gsub('^s_$', "", phyl_tax$Species, ignore.case=T) # replace empty s_
#build mapping file
phyl_met = read.delim(paste(opt$root,"all_samples.tab",sep="/"), header = FALSE)
colnames(phyl_met)="sample_ID"
rownames(phyl_met)=phyl_met$sample_ID
#shape phyloseq object
phyl_otu = otu_table(phyl_otu, taxa_are_rows = TRUE) # otu file
phyl_tax = tax_table(as.matrix(phyl_tax)) # taxonomy file
phyl_met = sample_data(phyl_met, errorIfNULL = T) # mapping file
phyl_kraken = phyloseq(phyl_otu, phyl_tax) # create test phyloseq object
phyl_kraken
phyl_kraken <- phyloseq(phyl_otu, phyl_tax, phyl_met) # including mapping
phyl_kraken
#check files and export
dir.create(file.path(
paste(opt$root,"06_taxa_output",sep="/"))) # create directory
write.table( # write otu file
(as(otu_table(phyl_kraken),"matrix") %>% as.data.frame()),
paste(opt$root,"06_taxa_output","phyl_tax.tab",sep="/"),
sep="\t", row.names = T, col.names=T)
write.table( # write tax file
(as(tax_table(phyl_kraken),"matrix") %>% as.data.frame()),
paste(opt$root,"06_taxa_output","phyl_tax.tab",sep="/"),
sep="\t", row.names = T, col.names=T)
saveRDS(phyl_kraken,
paste(opt$root,"06_taxa_output",
"physeq_kraken_reads_conf03_refseq.rds",sep="/")) # phyloseq file
#collapse to BACTERIAL species level tax_glom
phyl_kraken = subset_taxa(phyl_kraken, Kingdom %in% "k_Bacteria") #only bacteria
phyl_glom = tax_glom(phyl_kraken, "Species",NArm=T) # collapse species, remove unassigned
saveRDS(phyl_glom,
paste(opt$root,"06_taxa_output",
"physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/")) # phyloseq file
### 2 - Prevalence and abudance cutoff --------------------------------------------
#load data
phyl_filter= readRDS(paste(opt$root,"06_taxa_output",
"physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/"))
print(paste("total number of species before cutoffs = ", # number of unique species
length(taxa_sums(phyl_filter)),sep=""))
#Filter prevalence
tax_table = as(tax_table(phyl_filter),"matrix") %>% as.data.frame()
filter_prev = data.frame(Prevalence =
apply(X = otu_table(phyl_filter),
MARGIN = ifelse(taxa_are_rows(phyl_filter), yes = 1, no = 2),
FUN = function(x){sum(x > 0)}),
tax_table)
prevThreshold = opt$prevalence_cutoff* nsamples(phyl_filter) # variable prevalence threshold
keepTaxa = rownames(filter_prev)[(filter_prev >= prevThreshold)]
phyl_ancom = prune_taxa(keepTaxa, phyl_filter)
print( # number of unique species
paste("total number of species after a prevalence cutoff of ",
opt$prevalence_cutoff*100,"%, is: ",
length(taxa_sums(phyl_ancom)),sep=""))
#filter abundance
minTotRelAbun = opt$abundance_cutoff # variable abundance threshold
sum_taxa = taxa_sums(phyl_ancom)
keepTaxa = taxa_names(phyl_ancom)[which((sum_taxa / sum(sum_taxa)) > minTotRelAbun)]
phyl_ancom = prune_taxa(keepTaxa, phyl_ancom)
spec_list = as(tax_table(phyl_ancom),"matrix")%>% as.data.frame() # check map file
print( # number of unique species
paste("total number of species before after additional abundance cutoff of ",
opt$abundance_cutoff*100,"%, is: ",
length(spec_list$Species),sep=""))
#Merge species and genus name
spec_list[,"Species"] = paste(spec_list[,"Genus"],
spec_list[,"Species"],sep="") #paste genus name to species one
spec_list[,"Species"] = gsub("g_", "",spec_list[,"Species"])
spec_list[,"Species"] = gsub("s_", " ",spec_list[,"Species"])
spec_list=spec_list$Species
write.table(spec_list, # write species list
paste(opt$root,"06_taxa_output","spec_list_core_microbiota.tab",sep="/"),
sep="\t", row.names = F, col.names=F,quote = F)
### 3 - Plot relative abundance species -------------------------------------
#Plot species grouped by feed
phyl_plot = readRDS(paste(opt$root,"06_taxa_output",
"physeq_kraken_reads_conf03_refseq_glomspec.rds",sep="/"))
phyl_plot = prune_taxa(taxa_sums(phyl_plot) > 1, phyl_plot) #remove taxa lacking reads
#Merge species and genus name
tax_table(phyl_plot)[,"Species"] = paste(tax_table(phyl_plot)[,"Genus"],
tax_table(phyl_plot)[,"Species"],
sep="") #paste genus name to species one
tax_table(phyl_plot)[,"Species"] = gsub("g_", "",tax_table(phyl_plot)[,"Species"])
tax_table(phyl_plot)[,"Species"] = gsub("s_", " ",tax_table(phyl_plot)[,"Species"])
taxa_names(phyl_plot) = tax_table(phyl_plot)[,"Species"] # rename identifyer by species
#select settings
phyl_plot = func_aggregate_top_taxa2(phyl_plot, "Species", top = 12) #reduce to species
phyl_plot = microbiome::transform(phyl_plot, "compositional") #compositional
order = names(sort(taxa_sums(phyl_plot), F)[1:13])
order = order[order != "Other"]
order = c("Other",order)
# plotting
plot_composition(phyl_plot,
sample.sort = NULL,
otu.sort=order, x.label="Sample") +
geom_bar(position="stack", stat="identity", color=NA)+
theme_bw() +
labs(title="Top 12 abundant bacterial species")+
ylab( "Abundance (%)")+ xlab( "Sample ID")+
theme(legend.justification = "top",
legend.title = element_blank(),
legend.key.size = unit(4.5, 'mm'),
plot.title = element_text(size = 12),
axis.title.y = element_text(color="Grey1", size=13, face="bold"),
axis.text.y =element_text(color="Grey1", size=11),
axis.text.x =element_text(color="Grey1", size=9, angle = 90, vjust = 0.5),
axis.title.x = element_text(color="Grey1", size = 13, face="bold"),
panel.grid.major = element_blank(),
axis.ticks = element_blank())
ggsave(paste(opt$root,"06_taxa_output","abund_kraken_top12_spec.tiff",sep="/"),
units="in", width=18, height=8, dpi=300, compression = 'lzw')
########### Ends here ################################
end_time <- Sys.time()
print(" Duration of R processing:")
end_time - start_time
#
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment