caretta commit 06eb4b94, authored Oct 24, 2019 by Ninjani
pdb download
Parents: 59879f11, 55708cdd
Changes: 8
LICENSE
0 → 100644
BSD 3-Clause License
Copyright (c) 2019, Janani Durairaj, Mehmet Akdel
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md
...
...
@@ -24,16 +24,11 @@ cd caretta
### Install both the command-line interface and the web-application:
```bash
pip install -e ".[GUI]"
cd bin
chmod +x caretta-cli
chmod +x caretta-app
```
### Install only the command-line interface:
```bash
pip install .
cd bin
chmod +x caretta-cli
```
### Environment variables:
...
...
@@ -47,14 +42,14 @@ export NUMBA_NUM_THREADS=20 # change to required number of threads
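The NUMBA_NUM_THREADS export above caps the number of threads Numba's parallel kernels may use. A minimal sketch of setting the same cap from Python, assuming it runs before numba is first imported (this snippet is not part of the repository; the value 20 is just the README's example):

```python
import os

# Must be set before numba is imported; numba reads the variable at import time.
os.environ["NUMBA_NUM_THREADS"] = "20"

import numba

print(numba.config.NUMBA_NUM_THREADS)  # should report 20
```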
### Command-line Usage
```bash
-./caretta-cli input_pdb_folder
-# e.g. ./caretta-cli ../test_data
+caretta-cli input_pdb_folder
+# e.g. caretta-cli test_data
# caretta-cli -h for more options
```
### Web-application Usage
```bash
-./caretta-app <host-ip> <port>
+caretta-app <host-ip> <port>
# e.g. caretta-app localhost 8091
```
bin/caretta-app
...
...
@@ -474,7 +474,7 @@ def run_server(host="0.0.0.0", port=8888):
    port
        port
    """
-    app.run_server(host=host, port=port, debug=True)
+    app.run_server(host=host, port=port)


if __name__ == '__main__':
...
...
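For context on the run_server change above: in Dash 1.x (the version range pinned in setup.py), `debug=True` enables the reloader and in-browser tracebacks, which is undesirable for a deployed app. A minimal, self-contained sketch of the two call styles (the layout is illustrative, not caretta's actual app):

```python
import dash
import dash_html_components as html

app = dash.Dash(__name__)
app.layout = html.Div("caretta web app placeholder")

if __name__ == "__main__":
    # Development style (what the removed line did): reloader + verbose errors.
    # app.run_server(host="0.0.0.0", port=8888, debug=True)

    # Style after this commit: plain serving, no debug machinery.
    app.run_server(host="0.0.0.0", port=8888)
```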
bin/caretta-app-demo
This diff is collapsed.
caretta/feature_extraction.py
import multiprocessing
import subprocess
from pathlib import Path
import numpy as np
...
...
@@ -112,14 +111,15 @@ def get_features(pdb_file: str, dssp_dir: str, only_dssp=True, force_overwrite=T
"""
pdb_file
=
str
(
pdb_file
)
_
,
name
,
_
=
helper
.
get_file_parts
(
pdb_file
)
protein
=
pd
.
parsePDB
(
pdb_file
).
select
(
"protein"
)
# if Path(pdb_file).suffix != ".pdb":
pdb_file
=
str
(
Path
(
dssp_dir
)
/
f
"
{
name
}
.pdb"
)
pd
.
writePDB
(
pdb_file
,
protein
)
protein
=
pd
.
parsePDB
(
pdb_file
)
if
Path
(
pdb_file
).
suffix
!=
".pdb"
:
pdb_file
=
str
(
Path
(
dssp_dir
)
/
f
"
{
name
}
.pdb"
)
pd
.
writePDB
(
pdb_file
,
protein
)
dssp_file
=
Path
(
dssp_dir
)
/
f
"
{
name
}
.dssp"
if
force_overwrite
or
not
dssp_file
.
exists
():
dssp_file
=
pd
.
execDSSP
(
str
(
pdb_file
),
outputname
=
name
,
outputdir
=
str
(
dssp_dir
))
protein
=
pd
.
parseDSSP
(
dssp
=
dssp_file
,
ag
=
protein
,
parseall
=
True
)
protein
=
pd
.
parseDSSP
(
dssp
=
str
(
dssp_file
)
,
ag
=
protein
,
parseall
=
True
)
data
=
get_dssp_features
(
protein
)
if
only_dssp
:
return
data
...
...
@@ -175,153 +175,3 @@ def get_dssp_features(protein_dssp):
    return data


def get_electrostatics(protein: pd.AtomGroup, pdb_file: str, es_dir: str, overwrite=False):
    """
    Gets born and coulomb electrostatics for given protein

    Parameters
    ----------
    protein
    pdb_file
    es_dir
    overwrite

    Returns
    -------
    dict of born/coulomb _ Energy/x-force/y-force/z-force _ ca/cb/mean/min/max
    """
    es_dir = str(es_dir)
    pdb_file = str(pdb_file)
    data_born = _get_electrostatics(protein, pdb_file, es_dir, es_type="born", overwrite=overwrite)
    data_coulomb = _get_electrostatics(protein, pdb_file, es_dir, es_type="coulomb", overwrite=overwrite)
    return {**data_born, **data_coulomb}
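A hedged usage sketch for the function above (the PDB path and scratch directory are assumptions; the call also requires the pdb2pqr and APBS binaries wrapped further down):

```python
import prody as pd

# Hypothetical inputs: any protein PDB file plus a writable scratch directory.
pdb_file = "test_data/some_protein.pdb"
protein = pd.parsePDB(pdb_file).select("protein")

features = get_electrostatics(protein, pdb_file, es_dir="es_tmp")
# Keys follow "<born|coulomb>_<value type>_<ca|cb|mean>", where the value types
# (Energy, x-force, ...) come from whatever the APBS output files contain.
print(sorted(features)[:5])
```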
def _run_electrostatics(pdb_file, es_dir: str, es_type: str, overwrite=False) -> tuple:
    """
    Run apbs-born / apbs-coulomb on protein
    NOTE: born is 0-indexed and coulomb is 1-indexed

    Parameters
    ----------
    pdb_file
    es_dir
    es_type
    overwrite

    Returns
    -------
    (es_file, pqr_file, add)
    """
    assert es_type == "born" or es_type == "coulomb"
    _, name, _ = helper.get_file_parts(pdb_file)
    pqr_file = Path(es_dir) / f"{name}.pqr"
    if not pqr_file.exists():
        pdb2pqr(pdb_file, pqr_file)
    es_file = Path(es_dir) / f"{name}_{es_type}.txt"
    if es_type == "born":
        add = 0
        if overwrite or not es_file.exists():
            apbs_born(str(pqr_file), str(es_file))
    else:
        add = 1
        if overwrite or not es_file.exists():
            apbs_coulomb(str(pqr_file), str(es_file))
    return es_file, pqr_file, add
def _get_electrostatics(protein: pd.AtomGroup, pdb_file: str, es_dir: str, es_type: str = "born", overwrite=False):
    """
    Run apbs-born / apbs-coulomb on protein
    NOTE: born is 0-indexed and coulomb is 1-indexed

    Parameters
    ----------
    protein
    pdb_file
    es_dir
    es_type
    overwrite

    Returns
    -------
    dict of born/coulomb _ Energy/x-force/y-force/z-force _ ca/cb/mean/ #min/max#
    """
    if not Path(pdb_file).exists():
        pd.writePDB(pdb_file, protein)
    es_file, pqr_file, add = _run_electrostatics(pdb_file, es_dir, es_type, overwrite)
    pqr_protein = pd.parsePQR(str(pqr_file))
    residue_splits = helper.group_indices(pqr_protein.getResindices())
    values, value_types = parse_electrostatics_file(es_file)
    data = {}
    for value_type in value_types:
        data[f"{es_type}_{value_type}_ca"] = np.array([values[index + add][value_type]
                                                       if value_type in values[index + add] else 0
                                                       for index in helper.get_alpha_indices(pqr_protein)])
        data[f"{es_type}_{value_type}_cb"] = np.array([values[index + add][value_type]
                                                       if value_type in values[index + add] else 0
                                                       for index in helper.get_beta_indices(pqr_protein)])
        data[f"{es_type}_{value_type}_mean"] = np.array([np.nanmean([values[x + add][value_type]
                                                                     for x in split
                                                                     if (x + add) in values and value_type in values[x + add]])
                                                         for split in residue_splits])
    return data
def pdb2pqr(pdb_file, pqr_file):
    """
    Convert pdb file to pqr file
    """
    exe = "/mnt/nexenta/durai001/programs/pdb2pqr-2.1.1/pdb2pqr"
    command = f"{exe} --ff=amber --apbs-input {pdb_file} {pqr_file}"
    subprocess.check_call(command, shell=True)
def apbs_born(pqr_file, output_file, epsilon: int = 80):
    """
    Runs born electrostatics
    """
    exe = "/mnt/nexenta/durai001/programs/APBS-1.5/share/apbs/tools/bin/born"
    command = f"{exe} -v -f {epsilon} {pqr_file} > {output_file}"
    subprocess.check_call(command, shell=True)
def apbs_coulomb(pqr_file, output_file):
    """
    Runs coulomb electrostatics
    """
    exe = "/mnt/nexenta/durai001/programs/APBS-1.5/share/apbs/tools/bin/coulomb"
    command = f"{exe} -e -f {pqr_file} > {output_file}"
    subprocess.check_call(command, shell=True)
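The three wrappers above hard-code machine-specific executable paths. A hedged alternative sketch, not part of the repository, that resolves the binaries from an environment variable or PATH instead (the variable names are invented for illustration):

```python
import os
import shutil


def resolve_exe(env_var: str, default_name: str) -> str:
    """Prefer an explicit path from the environment, otherwise search PATH."""
    exe = os.environ.get(env_var) or shutil.which(default_name)
    if exe is None:
        raise FileNotFoundError(f"set {env_var} or put '{default_name}' on PATH")
    return exe


# Hypothetical variable names, mirroring the hard-coded paths above:
# pdb2pqr_exe = resolve_exe("CARETTA_PDB2PQR", "pdb2pqr")
# born_exe = resolve_exe("CARETTA_APBS_BORN", "born")
# coulomb_exe = resolve_exe("CARETTA_APBS_COULOMB", "coulomb")
```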
def parse_electrostatics_file(filename) -> tuple:
    """
    Parse file returned by running apbs_born or apbs_coulomb

    Parameters
    ----------
    filename

    Returns
    -------
    (dict(atom_number: dict(value_type: value)), set(value_types))
    """
    values = {}
    value_types = set()
    with open(filename) as f:
        for i, line in enumerate(f):
            if i < 10:
                continue
            if "Atom" not in line:
                break
            line = line.lstrip().rstrip()
            atom_number = int(line.split(":")[0][5:])
            value = float(line.split("=")[1].split()[0])
            value_type = line.split(":")[1].split("=")[0].lstrip().rstrip()
            value_types.add(value_type)
            if atom_number not in values:
                values[atom_number] = {}
            values[atom_number][value_type] = value
    return values, value_types
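A hedged illustration of the per-atom line the parser above expects; the exact APBS wording is an assumption, only the split logic mirrors the function:

```python
# One "Atom ..." line as parse_electrostatics_file would see it (format assumed).
sample = "Atom 17: Solvation Energy = -1.2345E+01 kJ/mol"

atom_number = int(sample.split(":")[0][5:])               # 17   ("Atom " is 5 characters)
value = float(sample.split("=")[1].split()[0])            # -12.345
value_type = sample.split(":")[1].split("=")[0].strip()   # "Solvation Energy"
print(atom_number, value_type, value)
```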
caretta/msa_numba.py
...
...
@@ -3,7 +3,6 @@ import typing
from dataclasses import dataclass, field
from pathlib import Path
import matplotlib.pyplot as plt
import numba as nb
import numpy as np
import prody as pd
...
...
@@ -166,6 +165,20 @@ class Structure:
        features = feature_extraction.get_features(str(pdb_file), str(dssp_dir), only_dssp=only_dssp,
                                                   force_overwrite=force_overwrite)
        return cls(pdb_name, pdb_file, sequence, helper.secondary_to_array(features["secondary"]), features,
                   coordinates)
    @classmethod
    def from_pdb_id(cls, pdb_name: str, chain: str, dssp_dir="caretta_tmp",
                    extract_all_features=True, force_overwrite=False):
        pdb = pd.parsePDB(pdb_name).select("protein").select(f"chain {chain}")
        pdb_file = pd.writePDB(pdb_name, pdb)
        alpha_indices = helper.get_alpha_indices(pdb)
        sequence = pdb[alpha_indices].getSequence()
        coordinates = pdb[alpha_indices].getCoords().astype(np.float64)
        only_dssp = (not extract_all_features)
        features = feature_extraction.get_features(str(Path(pdb_file)), str(dssp_dir), only_dssp=only_dssp,
                                                   force_overwrite=force_overwrite)
        return cls(pdb_name, pdb_file, sequence, helper.secondary_to_array(features["secondary"]), features,
                   coordinates)
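The new classmethod leans on ProDy fetching a structure from the PDB when `parsePDB` receives a four-letter identifier rather than a file path, which matches the commit message 'pdb download'. A hedged usage sketch (the identifier and chain are arbitrary, and the attribute names assume the dataclass fields follow the constructor arguments):

```python
# Hypothetical example: fetches 1MBA from the PDB, keeps chain A, writes it back out,
# and computes only the DSSP-derived features (extract_all_features=False).
structure = Structure.from_pdb_id("1mba", chain="A", dssp_dir="caretta_tmp",
                                  extract_all_features=False)
print(structure.sequence[:10], structure.coordinates.shape)
```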
@dataclass
class OutputFiles:
...
...
@@ -237,7 +250,7 @@ class StructureMultiple:
        pdb_files = parse_pdb_files(input_pdb)
        if not Path(dssp_dir).exists():
            Path(dssp_dir).mkdir()
-        pdbs = [pd.parsePDB(filename) for filename in pdb_files]
+        pdbs = [pd.parsePDB(filename).select("protein") for filename in pdb_files]
        alpha_indices = [helper.get_alpha_indices(pdb) for pdb in pdbs]
        sequences = [pdbs[i][alpha_indices[i]].getSequence() for i in range(len(pdbs))]
        coordinates = [np.hstack((pdbs[i][alpha_indices[i]].getCoords().astype(np.float64),
                                  np.zeros((len(sequences[i]), 1)) + consensus_weight))
                       for i in range(len(pdbs))]
...
...
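For clarity on the `np.hstack` expression above: each structure's alpha-carbon coordinates get one extra column holding a constant consensus weight. A minimal numpy illustration (the shape and weight value are arbitrary):

```python
import numpy as np

coords = np.random.rand(5, 3)  # five residues, x/y/z
consensus_weight = 1.0

weighted = np.hstack((coords, np.zeros((len(coords), 1)) + consensus_weight))
print(weighted.shape)  # (5, 4): x, y, z, weight column
```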
@@ -548,33 +561,22 @@ class StructureMultiple:
        if not output_pdb_folder.exists():
            output_pdb_folder.mkdir()
        reference_name = self.structures[0].name
-        reference_pdb = pd.parsePDB(str(self.structures[0].pdb_file))
+        reference_pdb = pd.parsePDB(self.structures[0].pdb_file)
        core_indices = np.array([i for i in range(len(alignments[reference_name]))
                                 if '-' not in [alignments[n][i] for n in alignments]])
        aln_ref = helper.aligned_string_to_array(alignments[reference_name])
        ref_coords_core = reference_pdb[helper.get_alpha_indices(reference_pdb)].getCoords().astype(
            np.float64)[np.array([aln_ref[c] for c in core_indices])]
        ref_centroid = rmsd_calculations.nb_mean_axis_0(ref_coords_core)
        ref_coords_core -= ref_centroid
        transformation = pd.Transformation(np.eye(3), -ref_centroid)
        reference_pdb = pd.applyTransformation(transformation, reference_pdb)
        # reference_pdb.setCoords(reference_pdb.getCoords().astype(np.float64) - ref_centroid)
        pd.writePDB(str(output_pdb_folder / f"{reference_name}.pdb"), reference_pdb)
        pdb_ref_2 = pd.parsePDB(str(output_pdb_folder / f"{reference_name}.pdb"))
        plt.scatter(pdb_ref_2.getCoords()[:, 0], pdb_ref_2.getCoords()[:, 1], alpha=0.3)
        # plt.show()
        for i in range(1, len(self.structures)):
            name = self.structures[i].name
-            pdb = pd.parsePDB(str(self.structures[i].pdb_file))
+            pdb = pd.parsePDB(self.structures[i].pdb_file)
            aln_name = helper.aligned_string_to_array(alignments[name])
            common_coords_2 = pdb[helper.get_alpha_indices(pdb)].getCoords().astype(
                np.float64)[np.array([aln_name[c] for c in core_indices])]
            rotation_matrix, translation_matrix = rmsd_calculations.svd_superimpose(ref_coords_core, common_coords_2)
            # coords = rmsd_calculations.apply_rotran(pdb.getCoords().astype(np.float64), rotation_matrix, translation_matrix)
            # pdb.setCoords(coords)
            transformation = pd.Transformation(rotation_matrix.T, translation_matrix)
            pdb = pd.applyTransformation(transformation, pdb)
            pd.writePDB(str(output_pdb_folder / f"{name}.pdb"), pdb)
            pdb_2 = pd.parsePDB(str(output_pdb_folder / f"{name}.pdb"))
            plt.scatter(pdb_2.getCoords()[:, 0], pdb_2.getCoords()[:, 1], alpha=0.3)
            # plt.show()
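`rmsd_calculations.svd_superimpose` itself is not shown in this commit. For orientation, SVD-based (Kabsch) superposition of two coordinate sets generally looks like the following numpy sketch; this is a generic illustration, not the repository's implementation, and its conventions may differ from caretta's:

```python
import numpy as np


def svd_superimpose_sketch(reference: np.ndarray, moving: np.ndarray):
    """Return (rotation, translation) mapping `moving` onto `reference` (both (n, 3))."""
    ref_centroid = reference.mean(axis=0)
    mov_centroid = moving.mean(axis=0)
    correlation = (moving - mov_centroid).T @ (reference - ref_centroid)
    u, _, vt = np.linalg.svd(correlation)
    # Guard against a reflection: flip the last axis if the determinant is negative.
    d = np.sign(np.linalg.det(vt.T @ u.T))
    rotation = vt.T @ np.diag([1.0, 1.0, d]) @ u.T
    translation = ref_centroid - rotation @ mov_centroid
    return rotation, translation


# Usage: aligned = moving @ rotation.T + translation
```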
notebooks/Benchmark_Example_Usage.ipynb deleted 100644 → 0
This diff is collapsed.
setup.py
...
...
@@ -4,10 +4,10 @@ setup(name='caretta',
      version='1.0',
      authors=["Janani Durairaj", "Mehmet Akdel"],
      packages=["caretta"],
-      install_requires=["numpy==1.16.2", "numba==0.43.0", "prody", "biopython", "fire", "pyparsing"],
+      install_requires=["numpy", "numba", "prody", "biopython", "fire", "pyparsing"],
       extras_require={'GUI': ["dash==1.3.1", "dash-bio==0.1.4", "cryptography",
                               "dash-core-components==1.2.1", "dash-html-components==1.0.1",
                               "dash-renderer==1.1.0",
-                              "dash-table==4.3.0", "plotly==3.7.1", "flask"]})
+                              "dash-table==4.3.0", "plotly==3.7.1", "flask"]},
+      scripts=["bin/caretta-app", "bin/caretta-cli"])