Commit 06eb4b94 authored by Ninjani

pdb download

parents 59879f11 55708cdd
BSD 3-Clause License
Copyright (c) 2019, Janani Durairaj, Mehmet Akdel
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -24,16 +24,11 @@ cd caretta
### Install both the command-line interface and the web-application:
```bash
pip install -e ".[GUI]"
cd bin
chmod +x caretta-cli
chmod +x caretta-app
```
### Install only the command-line interface:
```bash
pip install .
cd bin
chmod +x caretta-cli
```
### Environment variables:
@@ -47,14 +42,14 @@ export NUMBA_NUM_THREADS=20 # change to required number of threads
### Command-line Usage
```bash
./caretta-cli input_pdb_folder
# e.g. ./caretta-cli ../test_data
caretta-cli input_pdb_folder
# e.g. caretta-cli test_data
# caretta-cli -h for more options
```
### Web-application Usage
```bash
./caretta-app <host-ip> <port>
caretta-app <host-ip> <port>
# e.g. caretta-app localhost 8091
```
@@ -474,7 +474,7 @@ def run_server(host="0.0.0.0", port=8888):
port
port
"""
app.run_server(host=host, port=port, debug=True)
app.run_server(host=host, port=port)
if __name__ == '__main__':
import multiprocessing
import subprocess
from pathlib import Path
import numpy as np
@@ -112,14 +111,15 @@ def get_features(pdb_file: str, dssp_dir: str, only_dssp=True, force_overwrite=T
"""
pdb_file = str(pdb_file)
_, name, _ = helper.get_file_parts(pdb_file)
protein = pd.parsePDB(pdb_file).select("protein")
# if Path(pdb_file).suffix != ".pdb":
pdb_file = str(Path(dssp_dir) / f"{name}.pdb")
pd.writePDB(pdb_file, protein)
protein = pd.parsePDB(pdb_file)
if Path(pdb_file).suffix != ".pdb":
pdb_file = str(Path(dssp_dir) / f"{name}.pdb")
pd.writePDB(pdb_file, protein)
dssp_file = Path(dssp_dir) / f"{name}.dssp"
if force_overwrite or not dssp_file.exists():
dssp_file = pd.execDSSP(str(pdb_file), outputname=name, outputdir=str(dssp_dir))
protein = pd.parseDSSP(dssp=dssp_file, ag=protein, parseall=True)
protein = pd.parseDSSP(dssp=str(dssp_file), ag=protein, parseall=True)
data = get_dssp_features(protein)
if only_dssp:
return data
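`get_features` writes a protein-only copy of the input PDB into the DSSP directory, runs DSSP on it, and returns a per-residue feature dictionary. A minimal usage sketch, with illustrative file and directory names (not taken from the repository):

```python
# Hypothetical usage of get_features; requires a DSSP executable that ProDy's
# execDSSP can find on the PATH. Paths and file names are placeholders.
from pathlib import Path
from caretta import feature_extraction  # module path assumed from the package layout

dssp_dir = Path("caretta_tmp")
dssp_dir.mkdir(exist_ok=True)

features = feature_extraction.get_features(
    "some_structure.pdb",   # any protein PDB file
    str(dssp_dir),
    only_dssp=True,          # skip the slower electrostatics features
    force_overwrite=False,   # reuse an existing .dssp file if present
)
print(features["secondary"])  # secondary structure, as consumed by the Structure classmethods later in this diff
```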
@@ -175,153 +175,3 @@ def get_dssp_features(protein_dssp):
return data
def get_electrostatics(protein: pd.AtomGroup, pdb_file: str, es_dir: str, overwrite=False):
"""
Gets born and coulomb electrostatics for given protein
Parameters
----------
protein
pdb_file
es_dir
overwrite
Returns
-------
dict of born/coulomb _ Energy/x-force/y-force/z-force _ ca/cb/mean/min/max
"""
es_dir = str(es_dir)
pdb_file = str(pdb_file)
data_born = _get_electrostatics(protein, pdb_file, es_dir, es_type="born", overwrite=overwrite)
data_coulomb = _get_electrostatics(protein, pdb_file, es_dir, es_type="coulomb", overwrite=overwrite)
return {**data_born, **data_coulomb}
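`get_electrostatics` wraps the Born and Coulomb calculations below and merges their per-residue dictionaries. A hedged usage sketch, assuming `pdb2pqr` and the APBS tools are installed (the PDB file name and output directory are illustrative):

```python
# Hypothetical usage; requires pdb2pqr and the APBS tools (note the hard-coded
# executable paths in pdb2pqr/apbs_born/apbs_coulomb further down).
from pathlib import Path
import prody as pd
from caretta import feature_extraction  # module path assumed

Path("es_tmp").mkdir(exist_ok=True)
protein = pd.parsePDB("some_structure.pdb").select("protein")  # illustrative file name
es_data = feature_extraction.get_electrostatics(protein, "some_structure.pdb",
                                                es_dir="es_tmp", overwrite=False)
# Keys follow the docstring pattern f"{es_type}_{value_type}_{ca|cb|mean}",
# e.g. "born_Energy_ca"; the exact value-type names come from the parsed APBS output.
print(sorted(es_data))
```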
def _run_electrostatics(pdb_file, es_dir: str, es_type: str, overwrite=False) -> tuple:
"""
Run apbs-born / apbs-coulomb on protein
NOTE: born is 0-indexed and coulomb is 1-indexed
Parameters
----------
pdb_file
es_dir
es_type
overwrite
Returns
-------
(es_file, pqr_file, add)
"""
assert es_type == "born" or es_type == "coulomb"
_, name, _ = helper.get_file_parts(pdb_file)
pqr_file = Path(es_dir) / f"{name}.pqr"
if not pqr_file.exists():
pdb2pqr(pdb_file, pqr_file)
es_file = Path(es_dir) / f"{name}_{es_type}.txt"
if es_type == "born":
add = 0
if overwrite or not es_file.exists():
apbs_born(str(pqr_file), str(es_file))
else:
add = 1
if overwrite or not es_file.exists():
apbs_coulomb(str(pqr_file), str(es_file))
return es_file, pqr_file, add
def _get_electrostatics(protein: pd.AtomGroup, pdb_file: str, es_dir: str, es_type: str = "born", overwrite=False):
"""
Run apbs-born / apbs-coulomb on protein
NOTE: born is 0-indexed and coulomb is 1-indexed
Parameters
----------
protein
pdb_file
es_dir
es_type
overwrite
Returns
-------
dict of born/coulomb _ Energy/x-force/y-force/z-force _ ca/cb/mean/ #min/max#
"""
if not Path(pdb_file).exists():
pd.writePDB(pdb_file, protein)
es_file, pqr_file, add = _run_electrostatics(pdb_file, es_dir, es_type, overwrite)
pqr_protein = pd.parsePQR(str(pqr_file))
residue_splits = helper.group_indices(pqr_protein.getResindices())
values, value_types = parse_electrostatics_file(es_file)
data = {}
for value_type in value_types:
data[f"{es_type}_{value_type}_ca"] = np.array(
[values[index + add][value_type] if value_type in values[index + add] else 0 for index in
helper.get_alpha_indices(pqr_protein)])
data[f"{es_type}_{value_type}_cb"] = np.array(
[values[index + add][value_type] if value_type in values[index + add] else 0 for index in
helper.get_beta_indices(pqr_protein)])
data[f"{es_type}_{value_type}_mean"] = np.array(
[np.nanmean([values[x + add][value_type] for x in split if (x + add) in values and value_type in values[x + add]]) for split in
residue_splits])
return data
def pdb2pqr(pdb_file, pqr_file):
"""
Convert pdb file to pqr file
"""
exe = "/mnt/nexenta/durai001/programs/pdb2pqr-2.1.1/pdb2pqr"
command = f"{exe} --ff=amber --apbs-input {pdb_file} {pqr_file}"
subprocess.check_call(command, shell=True)
def apbs_born(pqr_file, output_file, epsilon: int = 80):
"""
Runs born electrostatics
"""
exe = "/mnt/nexenta/durai001/programs/APBS-1.5/share/apbs/tools/bin/born"
command = f"{exe} -v -f {epsilon} {pqr_file} > {output_file}"
subprocess.check_call(command, shell=True)
def apbs_coulomb(pqr_file, output_file):
"""
Runs coulomb electrostatics
"""
exe = "/mnt/nexenta/durai001/programs/APBS-1.5/share/apbs/tools/bin/coulomb"
command = f"{exe} -e -f {pqr_file} > {output_file}"
subprocess.check_call(command, shell=True)
def parse_electrostatics_file(filename) -> tuple:
"""
Parse file returned by running apbs_born or apbs_coulomb
Parameters
----------
filename
Returns
-------
(dict(atom_number: dict(value_type: value)), set(value_types))
"""
values = {}
value_types = set()
with open(filename) as f:
for i, line in enumerate(f):
if i < 10:
continue
if "Atom" not in line:
break
line = line.lstrip().rstrip()
atom_number = int(line.split(":")[0][5:])
value = float(line.split("=")[1].split()[0])
value_type = line.split(":")[1].split("=")[0].lstrip().rstrip()
value_types.add(value_type)
if atom_number not in values:
values[atom_number] = {}
values[atom_number][value_type] = value
return values, value_types
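`parse_electrostatics_file` skips the header of the APBS output, then reads the `Atom ...:` lines into a nested dictionary keyed by atom number and value type. A small, hypothetical usage sketch (the file name is illustrative):

```python
# Hypothetical usage; "protein_born.txt" stands in for a file produced by apbs_born above.
values, value_types = parse_electrostatics_file("protein_born.txt")
print(value_types)        # value-type names found in the file (e.g. energy/force terms)
first_atom = min(values)  # born output is 0-indexed, coulomb is 1-indexed (see the note above)
print(values[first_atom])  # {value_type: value} for that atom
```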
@@ -3,7 +3,6 @@ import typing
from dataclasses import dataclass, field
from pathlib import Path
import matplotlib.pyplot as plt
import numba as nb
import numpy as np
import prody as pd
@@ -166,6 +165,20 @@ class Structure:
features = feature_extraction.get_features(str(pdb_file), str(dssp_dir), only_dssp=only_dssp, force_overwrite=force_overwrite)
return cls(pdb_name, pdb_file, sequence, helper.secondary_to_array(features["secondary"]), features, coordinates)
@classmethod
def from_pdb_id(cls, pdb_name: str, chain: str, dssp_dir="caretta_tmp",
extract_all_features=True, force_overwrite=False):
pdb = pd.parsePDB(pdb_name).select("protein").select(f"chain {chain}")
pdb_file = pd.writePDB(pdb_name, pdb)
alpha_indices = helper.get_alpha_indices(pdb)
sequence = pdb[alpha_indices].getSequence()
coordinates = pdb[alpha_indices].getCoords().astype(np.float64)
only_dssp = (not extract_all_features)
features = feature_extraction.get_features(str(Path(pdb_file)), str(dssp_dir), only_dssp=only_dssp,
force_overwrite=force_overwrite)
return cls(pdb_name, pdb_file, sequence, helper.secondary_to_array(features["secondary"]), features,
coordinates)
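The new `from_pdb_id` constructor is the download path this commit adds: ProDy's `parsePDB` fetches the structure from the PDB when given an identifier instead of a file, the requested chain is selected, a local copy is written, and features are extracted as for a local file. A hedged usage sketch (the PDB ID, chain, and import path are illustrative/assumed):

```python
# Hypothetical usage of the new download-based constructor.
from pathlib import Path
from caretta.msa_numba import Structure  # import path assumed; adjust to wherever Structure lives

Path("caretta_tmp").mkdir(exist_ok=True)     # directory for DSSP output
structure = Structure.from_pdb_id(
    "1crn",                     # any 4-letter PDB identifier; ProDy downloads it
    chain="A",
    dssp_dir="caretta_tmp",
    extract_all_features=False,  # DSSP-only features; True also runs the electrostatics pipeline
)
```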
@dataclass
class OutputFiles:
@@ -237,7 +250,7 @@ class StructureMultiple:
pdb_files = parse_pdb_files(input_pdb)
if not Path(dssp_dir).exists():
Path(dssp_dir).mkdir()
pdbs = [pd.parsePDB(filename) for filename in pdb_files]
pdbs = [pd.parsePDB(filename).select("protein") for filename in pdb_files]
alpha_indices = [helper.get_alpha_indices(pdb) for pdb in pdbs]
sequences = [pdbs[i][alpha_indices[i]].getSequence() for i in range(len(pdbs))]
coordinates = [np.hstack((pdbs[i][alpha_indices[i]].getCoords().astype(np.float64), np.zeros((len(sequences[i]), 1)) + consensus_weight))
@@ -548,33 +561,22 @@ class StructureMultiple:
if not output_pdb_folder.exists():
output_pdb_folder.mkdir()
reference_name = self.structures[0].name
reference_pdb = pd.parsePDB(str(self.structures[0].pdb_file))
reference_pdb = pd.parsePDB(self.structures[0].pdb_file)
core_indices = np.array([i for i in range(len(alignments[reference_name])) if '-' not in [alignments[n][i] for n in alignments]])
aln_ref = helper.aligned_string_to_array(alignments[reference_name])
ref_coords_core = reference_pdb[helper.get_alpha_indices(reference_pdb)].getCoords().astype(np.float64)[
np.array([aln_ref[c] for c in core_indices])]
ref_centroid = rmsd_calculations.nb_mean_axis_0(ref_coords_core)
ref_coords_core -= ref_centroid
transformation = pd.Transformation(np.eye(3), -ref_centroid)
reference_pdb = pd.applyTransformation(transformation, reference_pdb)
# reference_pdb.setCoords(reference_pdb.getCoords().astype(np.float64) - ref_centroid)
pd.writePDB(str(output_pdb_folder / f"{reference_name}.pdb"), reference_pdb)
pdb_ref_2 = pd.parsePDB(str(output_pdb_folder / f"{reference_name}.pdb"))
plt.scatter(pdb_ref_2.getCoords()[:, 0], pdb_ref_2.getCoords()[:, 1], alpha=0.3)
# plt.show()
for i in range(1, len(self.structures)):
name = self.structures[i].name
pdb = pd.parsePDB(str(self.structures[i].pdb_file))
pdb = pd.parsePDB(self.structures[i].pdb_file)
aln_name = helper.aligned_string_to_array(alignments[name])
common_coords_2 = pdb[helper.get_alpha_indices(pdb)].getCoords().astype(np.float64)[np.array([aln_name[c] for c in core_indices])]
rotation_matrix, translation_matrix = rmsd_calculations.svd_superimpose(ref_coords_core, common_coords_2)
# coords = rmsd_calculations.apply_rotran(pdb.getCoords().astype(np.float64), rotation_matrix, translation_matrix)
# pdb.setCoords(coords)
transformation = pd.Transformation(rotation_matrix.T, translation_matrix)
pdb = pd.applyTransformation(transformation, pdb)
pd.writePDB(str(output_pdb_folder / f"{name}.pdb"), pdb)
pdb_2 = pd.parsePDB(str(output_pdb_folder / f"{name}.pdb"))
plt.scatter(pdb_2.getCoords()[:, 0], pdb_2.getCoords()[:, 1], alpha=0.3)
# plt.show()
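The loop above superimposes each structure onto the gap-free core of the reference before writing the aligned PDB files. The rotation and translation come from `rmsd_calculations.svd_superimpose`, which is presumably an SVD-based (Kabsch) superposition; a minimal, self-contained sketch of that idea (not the repository's actual implementation):

```python
# Illustrative Kabsch/SVD superposition; rmsd_calculations.svd_superimpose is
# assumed to compute something equivalent.
import numpy as np

def svd_superimpose_sketch(ref: np.ndarray, mob: np.ndarray):
    """Rotation and translation mapping mob onto ref; both are (N, 3) paired coordinates."""
    ref_centroid, mob_centroid = ref.mean(axis=0), mob.mean(axis=0)
    ref_c, mob_c = ref - ref_centroid, mob - mob_centroid
    u, _, vt = np.linalg.svd(mob_c.T @ ref_c)   # SVD of the 3x3 covariance matrix
    d = np.sign(np.linalg.det(u @ vt))          # avoid an improper rotation (reflection)
    rotation = (u * np.array([1.0, 1.0, d])) @ vt
    translation = ref_centroid - mob_centroid @ rotation
    return rotation, translation

# mob @ rotation + translation then overlays ref in the least-RMSD sense.
```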
@@ -4,10 +4,10 @@ setup(name='caretta',
version='1.0',
authors=["Janani Durairaj", "Mehmet Akdel"],
packages=["caretta"],
install_requires=["numpy==1.16.2", "numba==0.43.0", "prody", "biopython", "fire", "pyparsing"],
install_requires=["numpy", "numba", "prody", "biopython", "fire", "pyparsing"],
extras_require={
'GUI': ["dash==1.3.1", "dash-bio==0.1.4", "cryptography",
"dash-core-components==1.2.1", "dash-html-components==1.0.1", "dash-renderer==1.1.0",
"dash-table==4.3.0", "plotly==3.7.1", "flask"]
})
q
"dash-table==4.3.0", "plotly==3.7.1", "flask"]},
scripts=["bin/caretta-app", "bin/caretta-cli"]
)