Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
R
Rational_designed_microbiome
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Package registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
WBVR_PathogenOmics
Rational_designed_microbiome
Commits
dfa93e7b
Commit
dfa93e7b
authored
2 years ago
by
Stege, Paul
Browse files
Options
Downloads
Patches
Plain Diff
Upload New File
parent
a43a750b
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
06_R_abund_prev_analysis.R
+228
-0
228 additions, 0 deletions
06_R_abund_prev_analysis.R
with
228 additions
and
0 deletions
06_R_abund_prev_analysis.R
0 → 100644
+
228
−
0
View file @
dfa93e7b
#=============================================================================================
#author Paul Stege
#date 11-2022
#script Analyse output kraken for abundance and prevalence
#software R ; R Core Team 2022
#package P.J. McMurdie and S. Holmes, 2012 ; phyloseq
# L. Lahti and S. Shetty, 2017 ; microbiome
#=============================================================================================
start_time
<-
Sys.time
()
# install packages --------------------------------------------------------
paks
<-
c
(
"dplyr"
,
"plyr"
,
"tidyverse"
,
"data.table"
,
"phyloseq"
,
"ggplot2"
,
"microbiome"
,
"optparse"
)
paks
<-
lapply
(
paks
,
function
(
x
)
suppressMessages
(
require
(
x
,
character.only
=
TRUE
)))
rm
(
paks
)
# load function --------------------------------------------------------
# from Sudarshan A. Shetty, & Leo Lahti. (2020, October). microbiomeutilities: Utilities for Microbiome Analytics
func_aggregate_top_taxa2
<-
function
(
x
,
top
,
level
)
{
x
<-
aggregate_taxa
(
x
,
level
)
tops
<-
top_taxa
(
x
,
top
)
tax
<-
tax_table
(
x
)
inds
<-
which
(
!
rownames
(
tax
)
%in%
tops
)
tax
[
inds
,
level
]
<-
"Other"
tax_table
(
x
)
<-
tax
tt
<-
tax_table
(
x
)[,
level
]
tax_table
(
x
)
<-
tax_table
(
tt
)
aggregate_taxa
(
x
,
level
)
}
# Configure scrpt settings ---------------------------------------------------------
# script format is Rscript -a abundance_cutoff -p prevelance_cutoff -r root_dir -i input_biom_file
# apply package "optparse"
option_list
=
list
(
make_option
(
c
(
"-a"
,
"--abundance_cutoff"
),
type
=
"numeric"
,
default
=
0.5
,
help
=
"abundance cutoff on scale of 0-1, where 0.0001 represents 0.01%"
,
metavar
=
"number"
),
make_option
(
c
(
"-p"
,
"--prevalence_cutoff"
),
type
=
"numeric"
,
default
=
0.0001
,
help
=
"prevalencecutoff on scale of 0-1, where 0.5 represents 50%"
,
metavar
=
"number"
),
make_option
(
c
(
"-r"
,
"--root_dir"
),
type
=
"character"
,
default
=
"/home/"
,
help
=
"directory for this project, containing all folders for reading and exporting data [default= %default]"
,
metavar
=
"character"
),
make_option
(
c
(
"-i"
,
"--input_biom_file"
),
type
=
"character"
,
default
=
"*.biom"
,
help
=
"input file, which is the biom kraken output file [default= %default]"
,
metavar
=
"character"
)
)
opt_parser
=
OptionParser
(
option_list
=
option_list
)
opt
=
parse_args
(
opt_parser
)
### 1 - Load data ------------------------------------------------------------------
#import kraken biom file as phyloseq
print
(
paste
(
"importing from "
,
# test if variables set correctly
paste
(
opt
$
root_dir
,
"05_kraken_testset"
,
opt
$
input_biom_file
,
sep
=
"/"
),
sep
=
""
))
phyl_kraken
=
import_biom
(
paste
(
opt
$
root_dir
,
"05_kraken_anno"
,
opt
$
input_biom_file
,
sep
=
"/"
))
#adjust otu file
phyl_otu
=
as
(
otu_table
(
phyl_kraken
),
"matrix"
)
%>%
as.data.frame
()
rownames
(
phyl_otu
)
=
paste
(
"OTU"
,
rownames
(
phyl_otu
),
sep
=
""
)
#adjust taxonomy
phyl_tax
=
as
(
tax_table
(
phyl_kraken
),
"matrix"
)
%>%
as.data.frame
()
# check tax file
colnames
(
phyl_tax
)
=
c
(
"Kingdom"
,
"Phylum"
,
"Class"
,
"Order"
,
"Family"
,
"Genus"
,
"Species"
)
phyl_tax
$
OTUID
=
paste
(
"OTU"
,
rownames
(
phyl_tax
),
sep
=
""
)
phyl_tax
=
sapply
(
phyl_tax
,
function
(
x
)
{
gsub
(
"__"
,
"_"
,
as.character
(
x
))})
%>%
as.data.frame
()
rownames
(
phyl_tax
)
=
phyl_tax
$
OTUID
phyl_tax
$
Phylum
=
gsub
(
'^p_$'
,
""
,
phyl_tax
$
Phylum
,
ignore.case
=
T
)
# replace empty p_
phyl_tax
$
Class
=
gsub
(
'^c_$'
,
""
,
phyl_tax
$
Class
,
ignore.case
=
T
)
# replace empty c_
phyl_tax
$
Order
=
gsub
(
'^o_$'
,
""
,
phyl_tax
$
Order
,
ignore.case
=
T
)
# replace empty o_
phyl_tax
$
Family
=
gsub
(
'^f_$'
,
""
,
phyl_tax
$
Family
,
ignore.case
=
T
)
# replace empty f_
phyl_tax
$
Genus
=
gsub
(
'^g_$'
,
""
,
phyl_tax
$
Genus
,
ignore.case
=
T
)
# replace empty g_
phyl_tax
$
Species
=
gsub
(
'^s_$'
,
""
,
phyl_tax
$
Species
,
ignore.case
=
T
)
# replace empty s_
#build mapping file
phyl_met
=
read.delim
(
paste
(
opt
$
root
,
"all_samples.tab"
,
sep
=
"/"
),
header
=
FALSE
)
colnames
(
phyl_met
)
=
"sample_ID"
rownames
(
phyl_met
)
=
phyl_met
$
sample_ID
#shape phyloseq object
phyl_otu
=
otu_table
(
phyl_otu
,
taxa_are_rows
=
TRUE
)
# otu file
phyl_tax
=
tax_table
(
as.matrix
(
phyl_tax
))
# taxonomy file
phyl_met
=
sample_data
(
phyl_met
,
errorIfNULL
=
T
)
# mapping file
phyl_kraken
=
phyloseq
(
phyl_otu
,
phyl_tax
)
# create test phyloseq object
phyl_kraken
phyl_kraken
<-
phyloseq
(
phyl_otu
,
phyl_tax
,
phyl_met
)
# including mapping
phyl_kraken
#check files and export
dir.create
(
file.path
(
paste
(
opt
$
root
,
"06_taxa_output"
,
sep
=
"/"
)))
# create directory
write.table
(
# write otu file
(
as
(
otu_table
(
phyl_kraken
),
"matrix"
)
%>%
as.data.frame
()),
paste
(
opt
$
root
,
"06_taxa_output"
,
"phyl_tax.tab"
,
sep
=
"/"
),
sep
=
"\t"
,
row.names
=
T
,
col.names
=
T
)
write.table
(
# write tax file
(
as
(
tax_table
(
phyl_kraken
),
"matrix"
)
%>%
as.data.frame
()),
paste
(
opt
$
root
,
"06_taxa_output"
,
"phyl_tax.tab"
,
sep
=
"/"
),
sep
=
"\t"
,
row.names
=
T
,
col.names
=
T
)
saveRDS
(
phyl_kraken
,
paste
(
opt
$
root
,
"06_taxa_output"
,
"physeq_kraken_reads_conf03_refseq.rds"
,
sep
=
"/"
))
# phyloseq file
#collapse to BACTERIAL species level tax_glom
phyl_kraken
=
subset_taxa
(
phyl_kraken
,
Kingdom
%in%
"k_Bacteria"
)
#only bacteria
phyl_glom
=
tax_glom
(
phyl_kraken
,
"Species"
,
NArm
=
T
)
# collapse species, remove unassigned
saveRDS
(
phyl_glom
,
paste
(
opt
$
root
,
"06_taxa_output"
,
"physeq_kraken_reads_conf03_refseq_glomspec.rds"
,
sep
=
"/"
))
# phyloseq file
### 2 - Prevalence and abudance cutoff --------------------------------------------
#load data
phyl_filter
=
readRDS
(
paste
(
opt
$
root
,
"06_taxa_output"
,
"physeq_kraken_reads_conf03_refseq_glomspec.rds"
,
sep
=
"/"
))
print
(
paste
(
"total number of species before cutoffs = "
,
# number of unique species
length
(
taxa_sums
(
phyl_filter
)),
sep
=
""
))
#Filter prevalence
tax_table
=
as
(
tax_table
(
phyl_filter
),
"matrix"
)
%>%
as.data.frame
()
filter_prev
=
data.frame
(
Prevalence
=
apply
(
X
=
otu_table
(
phyl_filter
),
MARGIN
=
ifelse
(
taxa_are_rows
(
phyl_filter
),
yes
=
1
,
no
=
2
),
FUN
=
function
(
x
){
sum
(
x
>
0
)}),
tax_table
)
prevThreshold
=
opt
$
prevalence_cutoff
*
nsamples
(
phyl_filter
)
# variable prevalence threshold
keepTaxa
=
rownames
(
filter_prev
)[(
filter_prev
>=
prevThreshold
)]
phyl_ancom
=
prune_taxa
(
keepTaxa
,
phyl_filter
)
print
(
# number of unique species
paste
(
"total number of species after a prevalence cutoff of "
,
opt
$
prevalence_cutoff
*
100
,
"%, is: "
,
length
(
taxa_sums
(
phyl_ancom
)),
sep
=
""
))
#filter abundance
minTotRelAbun
=
opt
$
abundance_cutoff
# variable abundance threshold
sum_taxa
=
taxa_sums
(
phyl_ancom
)
keepTaxa
=
taxa_names
(
phyl_ancom
)[
which
((
sum_taxa
/
sum
(
sum_taxa
))
>
minTotRelAbun
)]
phyl_ancom
=
prune_taxa
(
keepTaxa
,
phyl_ancom
)
spec_list
=
as
(
tax_table
(
phyl_ancom
),
"matrix"
)
%>%
as.data.frame
()
# check map file
print
(
# number of unique species
paste
(
"total number of species before after additional abundance cutoff of "
,
opt
$
abundance_cutoff
*
100
,
"%, is: "
,
length
(
spec_list
$
Species
),
sep
=
""
))
#Merge species and genus name
spec_list
[,
"Species"
]
=
paste
(
spec_list
[,
"Genus"
],
spec_list
[,
"Species"
],
sep
=
""
)
#paste genus name to species one
spec_list
[,
"Species"
]
=
gsub
(
"g_"
,
""
,
spec_list
[,
"Species"
])
spec_list
[,
"Species"
]
=
gsub
(
"s_"
,
" "
,
spec_list
[,
"Species"
])
spec_list
=
spec_list
$
Species
write.table
(
spec_list
,
# write species list
paste
(
opt
$
root
,
"06_taxa_output"
,
"spec_list_core_microbiota.tab"
,
sep
=
"/"
),
sep
=
"\t"
,
row.names
=
F
,
col.names
=
F
,
quote
=
F
)
### 3 - Plot relative abundance species -------------------------------------
#Plot species grouped by feed
phyl_plot
=
readRDS
(
paste
(
opt
$
root
,
"06_taxa_output"
,
"physeq_kraken_reads_conf03_refseq_glomspec.rds"
,
sep
=
"/"
))
phyl_plot
=
prune_taxa
(
taxa_sums
(
phyl_plot
)
>
1
,
phyl_plot
)
#remove taxa lacking reads
#Merge species and genus name
tax_table
(
phyl_plot
)[,
"Species"
]
=
paste
(
tax_table
(
phyl_plot
)[,
"Genus"
],
tax_table
(
phyl_plot
)[,
"Species"
],
sep
=
""
)
#paste genus name to species one
tax_table
(
phyl_plot
)[,
"Species"
]
=
gsub
(
"g_"
,
""
,
tax_table
(
phyl_plot
)[,
"Species"
])
tax_table
(
phyl_plot
)[,
"Species"
]
=
gsub
(
"s_"
,
" "
,
tax_table
(
phyl_plot
)[,
"Species"
])
taxa_names
(
phyl_plot
)
=
tax_table
(
phyl_plot
)[,
"Species"
]
# rename identifyer by species
#select settings
phyl_plot
=
func_aggregate_top_taxa2
(
phyl_plot
,
"Species"
,
top
=
12
)
#reduce to species
phyl_plot
=
microbiome
::
transform
(
phyl_plot
,
"compositional"
)
#compositional
order
=
names
(
sort
(
taxa_sums
(
phyl_plot
),
F
)[
1
:
13
])
order
=
order
[
order
!=
"Other"
]
order
=
c
(
"Other"
,
order
)
# plotting
plot_composition
(
phyl_plot
,
sample.sort
=
NULL
,
otu.sort
=
order
,
x.label
=
"Sample"
)
+
geom_bar
(
position
=
"stack"
,
stat
=
"identity"
,
color
=
NA
)
+
theme_bw
()
+
labs
(
title
=
"Top 12 abundant bacterial species"
)
+
ylab
(
"Abundance (%)"
)
+
xlab
(
"Sample ID"
)
+
theme
(
legend.justification
=
"top"
,
legend.title
=
element_blank
(),
legend.key.size
=
unit
(
4.5
,
'mm'
),
plot.title
=
element_text
(
size
=
12
),
axis.title.y
=
element_text
(
color
=
"Grey1"
,
size
=
13
,
face
=
"bold"
),
axis.text.y
=
element_text
(
color
=
"Grey1"
,
size
=
11
),
axis.text.x
=
element_text
(
color
=
"Grey1"
,
size
=
9
,
angle
=
90
,
vjust
=
0.5
),
axis.title.x
=
element_text
(
color
=
"Grey1"
,
size
=
13
,
face
=
"bold"
),
panel.grid.major
=
element_blank
(),
axis.ticks
=
element_blank
())
ggsave
(
paste
(
opt
$
root
,
"06_taxa_output"
,
"abund_kraken_top12_spec.tiff"
,
sep
=
"/"
),
units
=
"in"
,
width
=
18
,
height
=
8
,
dpi
=
300
,
compression
=
'lzw'
)
########### Ends here ################################
end_time
<-
Sys.time
()
print
(
" Duration of R processing:"
)
end_time
-
start_time
#
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment