From 942f7d60f4d10f6b38bd3b9ea30132ef0976bf47 Mon Sep 17 00:00:00 2001 From: sauloal <sauloal@yahoo.com.br> Date: Mon, 2 Mar 2015 19:04:55 +0100 Subject: [PATCH] complete phylogeny --- filler.py | 133 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 96 insertions(+), 37 deletions(-) diff --git a/filler.py b/filler.py index 1e4b1e3..5698278 100755 --- a/filler.py +++ b/filler.py @@ -2,17 +2,19 @@ import os import sys +from collections import defaultdict import parser_SQL_struct +col_names = "species|species subgroup|species group|subgenus|genus|subtribe|tribe|subfamily|family|superfamily|parvorder|infraorder|suborder|order|superorder|infraclass|subclass|class|superclass|subphylum|phylum|subkingdom|kingdom|superkingdom|root" + class csv_holder(object): - def __init__(self, incsv, col_names): + def __init__(self, incsv): self.incsv = incsv - self.col_names = col_names - self.num_levels = len(col_names) self.names = [] self.data = {} + self.col_names = col_names.split("|") self.read_csv() def read_csv(self): @@ -38,10 +40,14 @@ class csv_holder(object): self.names.append( name ) - self.data[ name ] = [None]*self.num_levels + self.data[ name ] = [None]*len(self.col_names) + + #if len(self.data) > 15: + # break def save(self, filename): print "saving to %s" % filename + with open(filename, 'w') as fhd: #print self.col_names fhd.write("name\t" + "\t".join(self.col_names) + "\n") @@ -50,12 +56,17 @@ class csv_holder(object): #print data cols = name + "\t" + "\t".join([str(d) if d is not None else "" for d in data]) fhd.write( cols + "\n" ) - + + class filler(object): def __init__(self, csv, querier): self.csv = csv self.querier = querier + + self.csv.col_names.insert(0, "tax_id" ) + self.csv.col_names.insert(0, "division") + self.get_ids() self.get_taxonomy() @@ -66,20 +77,23 @@ class filler(object): if len(tax_ids) == 0: print "Species '%s' does not exists" % name sys.exit(1) + elif len(tax_ids) > 1: print "More than one instance of %s" % name sys.exit(1) + else: tax_id = tax_ids[0] print "adding species %-30s id %7d" % (name, tax_id) - self.csv.data[name][1] = tax_id - #break - + self.csv.data[name].insert(0, tax_id ) + def get_taxonomy(self): + datas = {} + for name in sorted(self.csv.data): - tax_id = self.csv.data[name][1] + tax_id = self.csv.data[name][0] - print "getting taxonomy of '%s' id %d" % ( name, tax_id ) + print "getting taxonomy of %-30s id %d" % ( "'%s'"%name, tax_id ) node = get_node(tax_id) #print " node", node @@ -87,41 +101,84 @@ class filler(object): division_id = node.division_id division_name = get_division_name(division_id) - print " division_id", division_id, "division_name", division_name + datas[name] = data - + self.csv.data[name].insert(0, division_name) + + for name, data in datas.items(): for d in data: - p_rank = d[0] - p_tax_id = d[1] - p_name = get_name_from_tax_id(p_tax_id)[0] - d[3] = p_name - print " parent rank %-15s id %7d name %-30s" % (p_rank, p_tax_id, p_name) - + p_rank = d[0] + p_tax_id = d[1] + p_name = get_name_from_tax_id(p_tax_id )[0] if p_tax_id is not None else "None" + d[2] = p_name + + p_offspring_rank = d[3] + p_offspring_tax_id = d[4] + p_offspring_name = get_name_from_tax_id(p_offspring_tax_id)[0] if p_offspring_tax_id is not None else "None" + d[5] = p_offspring_name + + p_parent_rank = d[6] + p_parent_tax_id = d[7] + p_parent_name = get_name_from_tax_id(p_parent_tax_id )[0] if p_parent_tax_id is not None else "None" + d[8] = p_parent_name + + print " parent rank %-17s id %s name %-30s off (%-17s %s %-30s) par (%-17s %s %-30s)" % \ + (p_rank , "%7d"%p_tax_id if p_tax_id is not None else " None", p_name, + p_offspring_rank, "%7d"%p_offspring_tax_id if p_offspring_tax_id is not None else " None", p_offspring_name, + p_parent_rank , "%7d"%p_parent_tax_id if p_parent_tax_id is not None else " None", p_parent_name) + + if p_rank not in self.csv.col_names: + print "unknown rank %s" % p_rank + sys.exit(1) + if p_rank in self.csv.col_names: p_pos = self.csv.col_names.index(p_rank) self.csv.data[name][p_pos] = p_name - - self.csv.data[name][0] = division_name - #print self.csv.data[name] - - #break + print + + + def parse_node(node): data = [] - parent = node.parent - data.append( [node.rank, node.tax_id, parent.parent.tax_id, None] ) #print " ", data[-1] - while parent.tax_id != 1: - data.append( [parent.rank, parent.tax_id, parent.parent.tax_id, None] ) - #print " ", data[-1] - parent = parent.parent - - data.append( [parent.rank, parent.tax_id, parent.parent.tax_id, None] ) + + prev_rank = None + current_rank = node.rank + next_rank = node.parent.rank + + prev_tax_id = None + current_tax_id = node.tax_id + next_tax_id = node.parent.tax_id + + current = node + + while current_tax_id != 1: + #print current + if current_rank != "no rank": + while next_rank == "no rank" and next_tax_id != 1: + current = current.parent + next_rank = current.parent.rank + next_tax_id = current.parent.tax_id + + data.append( [current_rank, current_tax_id, None, prev_rank, prev_tax_id, None, next_rank, next_tax_id, None] ) + + prev_rank = current_rank + prev_tax_id = current_tax_id + + current = current.parent + current_rank = current.rank + current_tax_id = current.tax_id + next_rank = current.parent.rank + next_tax_id = current.parent.tax_id + + current_rank = "root" + data[-1][6] = current_rank + data.append( [current_rank, current_tax_id, None, prev_rank, prev_tax_id, None, None, None, None] ) - #print " ", data[-1] return data def get_ranks(): @@ -129,7 +186,7 @@ def get_ranks(): ranks = session.query(field).distinct().all() ranks = [r.rank for r in ranks] ranks.sort() - ranks.remove("no rank") + #ranks.remove("no rank") return ranks def query_name(name): @@ -190,10 +247,12 @@ def main(args): elif cmd == "fill": incsv = args[1] - ranks = get_ranks() - ranks.insert(0, "tax_id" ) - ranks.insert(0, "division") - holder = csv_holder(incsv, ranks) + #ranks = get_ranks() + #ranks = [] + #ranks.insert(0, "tax_id" ) + #ranks.insert(0, "division") + #holder = csv_holder(incsv, ranks) + holder = csv_holder(incsv) fill = filler(holder, get_tax_id_from_name) holder.save(incsv + '.filled.csv') -- GitLab