diff --git a/get.sh b/get.sh index 69c7bd816356a4a85c4b2bdb3a1f2c7a40e91b9c..4638324105d2acad0664be24b4d6bb7a5a288b9d 100644 --- a/get.sh +++ b/get.sh @@ -8,3 +8,6 @@ tar --directory taxdump -xvf taxdump.tar.gz mkdir -p taxcat tar --directory taxcat -xvf taxcat.tar.gz + + +grep '|$' taxdump/citations.dmp diff --git a/parser.py b/parser.py index 3a19eca8a535cd848e88202642c8f8cf5cb1c233..4ffd42dd5a5b7c4036c721872a7469e0d4be53c1 100755 --- a/parser.py +++ b/parser.py @@ -22,10 +22,10 @@ INTERLINK = True # INTERLINK = False DUMP_DB_RAW = True -DUMP_DB_RAW = False +#DUMP_DB_RAW = False DUMP_DB_COMPILED = True -DUMP_DB_COMPILED = False +#DUMP_DB_COMPILED = False MAX_READ_LINES = None # MAX_READ_LINES = 50 @@ -164,8 +164,9 @@ def read_dump(fn, cfg): has_read_header = True print " header", line cols = line.split(sep) + print " header cols B", cols cols = [x.strip("\t").strip("\t|").replace(" ", "_") for x in cols] - print " header cols", cols + print " header cols A", cols cfg["header" ] = cols cfg["convertersA"] = [None]*len(cols) for p in xrange(len(cols)): @@ -200,7 +201,7 @@ def read_dump(fn, cfg): if DEBUG and ln <= DEBUG_LINES: print " line d cols", ln, dcols - cfg["data"].append( dcols ) + cfg["data"].append( tuple(dcols) ) if DEBUG and ln == DEBUG_BREAK: break @@ -369,7 +370,7 @@ def list_of_hashes_to_header_data(cfg): val = cfg["data"][pval] lst = [val[x] if x in val else None for x in keys] # v = placeholder( *lst ) - cfg["data"][pval] = lst + cfg["data"][pval] = tuple(lst) def parse_flag(v): @@ -385,9 +386,12 @@ def linearize(cfg): def read_raw(): + max_filetype = max([len( file_type ) for file_type in DATASET] or [0]) + max_filename = max([len(DATASET[file_type]["filename"]) for file_type in DATASET] or [0]) + for file_type in DATASET: filename = DATASET[file_type]["filename"] - print "file type", file_type, "file name", filename, "...", + print "file type %-*s file name %-*s" % ( max_filetype, file_type, max_filename, filename ), '...', if os.path.exists(filename): print "OK" @@ -746,7 +750,6 @@ 
class DumpHolder(object): c["data" ] = self.data c["header" ] = self.header c["headerI"] = self.headerI - c["data" ] = self.data c["desc" ] = self.desc c["name" ] = self.name c["holders"] = self.holders @@ -847,7 +850,7 @@ class DumpHolder(object): return self.header def _get_item_val(self, item): - val = copy( self.data[item] ) + val = list(copy( self.data[item] )) #might not need copy statement if self.holders is not None: for holder_num in xrange(len(self.holders)): @@ -958,10 +961,11 @@ class DumpHolder(object): print "QFIND: col_name", col_name, "value", value, "res", res else: - if (not DEBUG) and (MAX_READ_LINES is None): - print " col_name", col_name, "value", value, "NOT FOUND" - print " ", index - sys.exit(1) + #if (not DEBUG) and (MAX_READ_LINES is None): + # print " db name", self.name, "col_name", col_name, "value", value, "NOT FOUND" + # print " ", sorted(index.keys()) + # sys.exit(1) + res = None else: col_pos = self.headerI[col_name] @@ -1139,7 +1143,7 @@ def main(): dmp = config[db_name] - print " printing el" + print "db", db_name, "printing el" elc = 0 for el in dmp: print el @@ -1148,7 +1152,7 @@ def main(): break - print " printing el as dict" + print "db", db_name, "printing el as dict" dmp.set_as_dict(True) elc = 0 for el in dmp: @@ -1158,7 +1162,7 @@ def main(): break - print " printing el as list" + print "db", db_name, "printing el as list" dmp.set_as_dict(False) dmp.set_as_list(True) elc = 0 @@ -1169,7 +1173,7 @@ def main(): break - print " printing el as tuple" + print "db", db_name, "printing el as tuple" dmp.set_use_named_tuple(True) dmp.set_as_dict(False) dmp.set_as_list(False) @@ -1181,7 +1185,7 @@ def main(): break - print " printing el as tuple and dict" + print "db", db_name, "printing el as tuple and dict" dmp.set_as_dict(True) elc = 0 for el in dmp: @@ -1191,7 +1195,7 @@ def main(): break - print " printing el as tuple and list" + print "db", db_name, "printing el as tuple and list" dmp.set_as_dict(False) dmp.set_as_list(True) 
elc = 0 @@ -1202,7 +1206,7 @@ def main(): break - print " printing el links" + print "db", db_name, "printing el as links" dmp.set_use_named_tuple(False) dmp.set_as_dict(False) dmp.set_as_list(False) @@ -1222,7 +1226,7 @@ def main(): if ITERATE_MAX is not None and elc > ITERATE_MAX: break - print " FINISHED" + print "db", db_name, "printing el FINISHED"