Commit d62f0de2 authored by Aflitos, Saulo Alves

init

*.vcf.gz
*.vcf
#!/usr/bin/python
##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT /panfs/ANIMAL/group001/minjiumeng/tomato_reseq/SZAXPI008746-45
##samtoolsVersion
#rm fixed/*; for file in *.vcf.gz; do echo $file; gunzip -c $file | ./fixVcf.py sample_names.txt | gzip -1 -c > fixed/$file; done;
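#
# fixVcf.py: prepares the BGI tomato resequencing VCFs for submission to ENA/EBI.
# For every VCF passed on the command line it writes a cleaned copy into the
# output folder (renaming the sample column via the sample-name map, adding a
# ##contig header line after ##samtoolsVersion, and dropping calls on forbidden
# chromosomes), then generates the per-file MD5, analysis XML and submission XML.
# The commented VCF header lines and the shell one-liner above appear to be kept
# as reference examples; the current main() takes the VCF paths as arguments
# rather than reading from stdin.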
import sys
import os
import gzip
import hashlib
import datetime
import traceback
from copy import deepcopy
from pprint import pprint as pp
rw_metadata = True
merge_xml = True
forbiddenChroms = None
samplefile = None
xml_template = None
spp_codes_file = None
chromosomeAlias = None
chromosomeAliasXml = None
outfolder = None
center_name = None
proj_name = None
now_str = datetime.datetime.now().isoformat().replace(':', '_').replace('-', '_').replace('.', '_')
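# Per-project settings: if rpc_chrom_alias.py exists in the working directory it
# is execfile()'d below and overrides the defaults above (sample file, XML template,
# species-codes file, chromosome aliases, forbidden chromosomes, output folder,
# center name and project name).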
if os.path.exists( "rpc_chrom_alias.py" ):
sys.path.insert(0, '.')
execfile( "rpc_chrom_alias.py" )
#sys.stderr.write( str( sampledb ) )
#sys.stderr.write( str( forbiddenChroms ) )
#sys.exit( 0 )
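# Small helpers: getBasename() strips the .vcf / .vcf.gz extension from a path;
# calcMD5() returns the hex MD5 digest of a whole file.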
def getBasename( filename ):
return os.path.basename( filename ).replace( '.vcf.gz', '' ).replace( '.vcf', '' )
def calcMD5( filename ):
return hashlib.md5( open(filename, 'rb').read() ).hexdigest()
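# The sample file maps each VCF back to its original BGI sample id: column 1 is the
# renamed VCF file, column 2 the original sample path (see the example line below).
# sampledb maps original path -> new basename so the sample column of the #CHROM
# header line can be rewritten in main().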
def loadSampleDb( samplefile ):
sampledb = {}
try:
#./RF_001_SZAXPI008746-45.vcf.gz /panfs/ANIMAL/group001/minjiumeng/tomato_reseq/SZAXPI008746-45
with open( samplefile ) as fhd:
for line in fhd:
cols = line.split()
newname = getBasename( cols[0] )
sampledb[ cols[1] ] = newname
except:
print "error parsing sample file", samplefile
sys.exit(1)
#pprint.pprint( sampledb )
return sampledb
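# The species-codes file is tab-separated with a '#'-prefixed header row naming the
# columns; column 1 is the VCF filename used as the key. ATTR_<n>_KEY / ATTR_<n>_VALUE
# column pairs are collected into an ATTRIBUTES list of [tag, value] pairs, and every
# other column is stored under its header name.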
def loadSppCodes( spp_codes_file ):
spp_codes = {}
try:
#RF_001_SZAXPI008746-45.vcf.gz,S.lyc LA2706
with open( spp_codes_file ) as fhd:
lc = 0
colnames = []
for line in fhd:
if len(line) == 0:
continue
if line[0] == "#":
if lc == 0:
colnames = line[1:].strip().split('\t')
print colnames
continue
lc += 1
line = line.strip()
cols = line.split('\t')
infile = cols[0]
vals = {}
if len(colnames) != len(cols):
print "wrong number of columns", len(colnames), len(cols)
sys.exit(1)
for cn in xrange( 1, len(colnames)):
colname = colnames[cn]
val = cols[cn]
if colname.startswith('ATTR_'):
if 'ATTRIBUTES' not in vals:
vals[ 'ATTRIBUTES' ] = []
if colname.endswith('_KEY'):
attrNum = int( colname[ 5: -4 ] )
while len(vals[ 'ATTRIBUTES' ]) < attrNum:
vals[ 'ATTRIBUTES' ].append( [ [], [] ] )
vals[ 'ATTRIBUTES' ][ attrNum - 1 ][ 0 ] = val
elif colname.endswith('_VALUE'):
attrNum = int( colname[ 5: -6 ] )
vals[ 'ATTRIBUTES' ][ attrNum - 1 ][ 1 ] = val
else:
vals[ colname ] = val
#spp_codes[ infile ][ 'ALIAS' ] = spp_codes[ infile ][ 'CODE' ].replace( ' ', '_' ).replace( '.', '_' )
#vals[ 'BASENAME' ] = getBasename( infile )
spp_codes[ infile ] = vals
except:
print "error parsing spp codes file", spp_codes_file
#exc_type, exc_value, exc_traceback = sys.exc_info()
#traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
#formatted_lines = traceback.format_exc().splitlines()
#print repr(traceback.extract_tb(exc_traceback))
traceback.print_exc(file=sys.stdout)
sys.exit(1)
#pp( spp_codes )
#sys.exit(0)
return spp_codes
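# hash_to_dots() flattens a nested dict into dotted keys, which are the {placeholder}
# tokens substituted into the XML template, e.g.
#   vals = {}
#   hash_to_dots( None, { "ANALYSIS": { "FILES": { "FILE": { "filetype": "vcf" } } } }, vals )
#   # vals == { "ANALYSIS.FILES.FILE.filetype": "vcf" }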
def hash_to_dots( name, conf, vals ):
for k in conf:
val = conf[k]
name2 = k
if name is not None:
#print "NAME IS NOT NONE. ADDING K", k
name2 = name + "." + k
#print "K", k, "NAME", name, "NAME2", name2, "TYPE", type(val)
if isinstance(val, dict):
#print "is dict", name2
hash_to_dots( name2, val, vals )
else:
#print "ADDING NAME2", name2, "VAL", val
vals[ name2 ] = val
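# saveMetadata() writes three files next to the cleaned VCF: <name>.md5 (checksum of
# the cleaned file), <name>.xml (the ENA ANALYSIS document built by filling the dotted
# placeholders of the XML template) and <name>.xml.submission.xml (a SUBMISSION document
# with an ADD action for the analysis XML and a HOLD-until-today action).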
def saveMetadata( infile, ofname, conf ):
outmd5 = ofname + '.md5'
outxml = ofname + '.xml'
outsubxml = outxml + '.submission.xml'
conf[ 'BASENAME' ] = infile
redoxml = False
if not os.path.exists( outmd5 ) or not os.path.exists( outxml ) or rw_metadata:
redoxml = True
print " saving md5",outmd5
print " calculating md5"
conf[ 'MD5' ] = calcMD5( ofname )
print " md5 calculated", conf[ 'MD5' ]
open( outmd5, 'w' ).write( "%s %s" % ( conf[ 'MD5' ], infile ) )
if not os.path.exists( outxml ) or rw_metadata or redoxml:
analysis_attributes = ""
if len( conf[ 'ATTRIBUTES' ] ) > 0:
analysis_attributes = " <ANALYSIS_ATTRIBUTES>\n"
for att in conf[ 'ATTRIBUTES' ]:
analysis_attributes += " <ANALYSIS_ATTRIBUTE>\n"
analysis_attributes += " <TAG>%s</TAG>\n" % att[0]
analysis_attributes += " <VALUE>%s</VALUE>\n" % att[1]
analysis_attributes += " </ANALYSIS_ATTRIBUTE>\n"
analysis_attributes += " </ANALYSIS_ATTRIBUTES>\n"
print analysis_attributes
#sys.exit(0)
conf[ '_VARS' ] = {
"ANALYSIS": {
"alias" : proj_name + '_' + conf[ 'SAMPLE_ALIAS' ] + '_' + now_str + '_analysis',
"center_name" : conf[ 'CENTER_NAME' ],
"analysis_center" : conf[ 'CENTER_NAME' ],
#"broker_name" : conf[ 'BROKER_NAME' ],
"date" : conf[ 'ANALYSIS_DATE' ],
#"accession" : conf[ 'ANALYSIS_ID' ], #ERZ016065 analysis id
#"IDENTIFIERS" : {
# "PRIMARY_ID" : conf[ 'ANALYSIS_ID' ], #ERZ016065 analysis id
# "SUBMITTER_ID" : {
# "namespace" : conf[ 'NAMESPACE' ],
# "_val" : conf[ 'SUBMITER_ID' ]
# }
#},
"TITLE" : conf[ 'TITLE' ],
"DESCRIPTION" : conf[ 'DESCRIPTION' ],
"STUDY_REF" : {
"accession" : conf[ 'STUDY_ID' ], # ERP001451 study read/analysis
"refcenter" : conf[ 'REFCENTER' ],
"refname" : conf[ 'STUDY_REF_NAME' ],
#"IDENTIFIERS" : {
# "PRIMARY_ID" : conf[ 'STUDY_ID' ], # ERP001451 study read/analysis
# "SUBMITTER_ID" : {
# "namespace" : conf[ 'NAMESPACE' ],
# "_val" : conf[ 'STUDY_SUBMITER_ID' ],
# }
#}
},
"SAMPLE_REF": {
"accession" : conf[ 'SAMPLE_ID' ], # ERS140602 sample
#"label" : conf[ 'BASENAME' ],
"refcenter" : conf[ 'REFCENTER' ],
"refname" : conf[ 'SAMPLE_REF_NAME' ],
#"IDENTIFIERS" : {
# "PRIMARY_ID" : conf[ 'SAMPLE_ID' ], # ERS140602 sample
# "SUBMITTER_ID" : {
# "namespace" : conf[ 'NAMESPACE' ],
# "_val" : conf[ 'SAMPLE_REF_NAME' ]
# }
#}
},
"FILES": {
"FILE": {
"filename" : os.path.join( conf[ 'EBI_FOLDER'], infile ),
"filetype" : "vcf",
"checksum_method" : "MD5",
"checksum" : conf[ 'MD5' ]
}
},
"ANALYSIS_TYPE": {
"SEQUENCE_VARIATION" : {
"ASSEMBLY" : {
"STANDARD" : {
"accession": conf[ 'ASSEMBLY_ACESSION' ]
}
},
"SEQUENCE" : chromosomeAliasXml.strip(),
"EXPERIMENT_TYPE" : conf[ 'EXPERIMENT_TYPE' ]
}
},
"ANALYSIS_ATTRIBUTES" : analysis_attributes.strip()
#"ANALYSIS_LINKS" : {
# "ANALYSIS_LINK" : {
# "XREF_LINK" : {
# "DB" : conf[ 'ANALYSIS_LINK_DB' ],
# "ID" : conf[ 'ERA' ] # ERA256779 submission (read/analysis)
# }
# }
#}
}
}
pp( conf[ '_VARS' ] )
vals = {}
hash_to_dots( None, conf[ '_VARS' ], vals )
pp( vals )
xmlcontent = "".join( open(xml_template, 'r').readlines() )
#print xmlcontent
for key in vals:
#print "replacing key", key
xmlcontent = xmlcontent.replace( "{%s}" % key, vals[key] )
print xmlcontent
#sys.exit(0)
print " saving xml", outxml
open( outxml, 'w' ).write( xmlcontent )
submission_data = {
"alias" : proj_name + '_' + conf[ 'SAMPLE_ALIAS' ] + '_' + now_str + '_submission',
"center_name": conf[ 'CENTER_NAME' ],
"source_file": os.path.basename( outxml ),
"hold_until" : datetime.date.today().isoformat()
}
#print submission_data
print " saving submission xml", outsubxml
sub_xml_content = """\
<?xml version="1.0" encoding="UTF-8"?>
<SUBMISSION alias="%(alias)s" center_name="%(center_name)s">
<ACTIONS>
<ACTION>
<ADD source="%(source_file)s" schema="analysis"/>
</ACTION>
<ACTION>
<HOLD HoldUntilDate="%(hold_until)s"/>
</ACTION>
</ACTIONS>
</SUBMISSION>""" % submission_data
print sub_xml_content
open( outsubxml, 'w' ).write( sub_xml_content )
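# Open a VCF transparently whether it is gzip-compressed or plain text.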
def getfhd( infile ):
if infile.endswith( '.gz' ):
return gzip.open( infile )
else:
return open(infile, 'r')
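# main(): print the active configuration, load the sample and species-code tables,
# build the <SEQUENCE accession/label> XML from chromosomeAlias, clean each input VCF
# into the output folder (skipping outputs that already exist), write per-file metadata,
# and, when merge_xml is set, concatenate all analysis XMLs into a single application.xml
# with one combined submission XML.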
def main(ifnames):
print "REWRITE METADATA ", rw_metadata
print "MERGE XML ", merge_xml
print "FORBIDDEN CHROMOSOMES", forbiddenChroms
print "SAMPLE FILE ", samplefile
print "XML TEMPLATE FILE ", xml_template
print "SPP CODES FILE ", spp_codes_file
print "OUT FOLDER ", outfolder
print "CENTER NAME ", center_name
print "PROJECT NAME ", proj_name
print "CURRENT TIME ", now_str
print "LOADING SAMPLE DB"
sampledb = loadSampleDb( samplefile )
pp( sampledb )
print "LOADING SPECIES CODES"
spp_codes = loadSppCodes( spp_codes_file )
#pp( spp_codes )
print "LOADING CHROMOSOME ALIAS"
pp( chromosomeAlias )
global chromosomeAliasXml
chromosomeAliasXml = ""
for chrom in sorted( chromosomeAlias ):
#"ANALYSIS.ANALYSIS_TYPE.SEQUENCE_VARIATION.ASSEMBLY.SEQUENCE.accession", "CAJW010010000.1",
#"ANALYSIS.ANALYSIS_TYPE.SEQUENCE_VARIATION.ASSEMBLY.SEQUENCE.label" , "contig_10000"
#<SEQUENCE accession="CAJW010010000.1" label="contig_10000"/>
code = chromosomeAlias[chrom]
chromosomeAliasXml += ' <SEQUENCE accession="%(accession)s" label="%(label)s"/>\n' % { "accession": code, "label": chrom }
print "CLEANING VCF FILES"
ofnames = []
ifc = 0
for ifname in ifnames:
ifc += 1
ofname = os.path.join(outfolder, os.path.basename(ifname) )
infile = os.path.basename( ifname )
ofnames.append( ofname + '.xml' )
print "infile %s %03d/%03d" % ( ifname, ifc, len(ifnames) )
print " out file", ofname
print " basename", infile
if not os.path.exists( ofname ):
print " filtering %s and saving in %s" % ( ifname, ofname )
with gzip.open( ofname, mode='wb' ) as ofhd:
with getfhd( ifname ) as ifhd:
for line in ifhd:
if line[0] == "#":
cols = line.split()
if line[:17] == '##samtoolsVersion':
ofhd.write(line)
ofhd.write('##contig=<ID=13,assembly=2.40,species="Solanum lycopersicum">\n')
elif len( cols ) >= 10 and cols[9] in sampledb:
line = line.replace( cols[9], sampledb[ cols[9] ] )
ofhd.write( line )
else:
ofhd.write( line )
else:
cols = line.split()
if cols[0] in forbiddenChroms:
pass
else:
ofhd.write(line)
else:
print " outfile %s already exists" % ofname
print "SAVING METADATA"
saveMetadata( infile, ofname, spp_codes[ infile ] )
if merge_xml:
outxmlfile = os.path.join( outfolder, 'application.xml' )
outsubxmlfile = outxmlfile + ".submission.xml"
with open( outxmlfile, 'w') as ofhd:
ofhd.write( """<?xml version="1.0" encoding="UTF-8"?>\n<ANALYSIS_SET>\n""" )
for ofname in ofnames:
with open( ofname, 'r' ) as ifhd:
for line in ifhd:
if '<?xml version="1.0" encoding="UTF-8"?>' in line:
continue
elif 'ANALYSIS_SET>' in line:
continue
else:
ofhd.write( line )
ofhd.write( """</ANALYSIS_SET>""" )
submission_data = {
"alias" : proj_name + '_merged_' + now_str + '_submission',
"center_name": center_name,
"source_file": os.path.basename( outxmlfile ),
"hold_until" : datetime.date.today().isoformat()
}
print submission_data
sub_xml_content = """\
<?xml version="1.0" encoding="UTF-8"?>
<SUBMISSION alias="%(alias)s" center_name="%(center_name)s">
<ACTIONS>
<ACTION>
<ADD source="%(source_file)s" schema="analysis"/>
</ACTION>
<ACTION>
<HOLD HoldUntilDate="%(hold_until)s"/>
</ACTION>
</ACTIONS>
</SUBMISSION>
""" % submission_data
print sub_xml_content
open( outsubxmlfile, 'w' ).write( sub_xml_content )
if __name__ == '__main__':
main(sys.argv[1:])
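# The assignments below match the module-level globals of fixVcf.py and are
# presumably the contents of rpc_chrom_alias.py, the per-project configuration
# loaded via execfile() at the top of the script.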
samplefile = "rpc_sample_names.txt"
xml_template = "rpc_template.xml"
spp_codes_file = "rpc_spp_codes.csv"
outfolder = 'fixed'
center_name = 'PRI'
proj_name = os.path.abspath(os.path.curdir).replace(' ', '_').replace('/', '_').replace('\\', '_').replace('___', '_').replace('__', '_').strip('_')
chromosomeAlias = {
"SL2.40ch01": "CM001064.1",
"SL2.40ch02": "CM001065.1",
"SL2.40ch03": "CM001066.1",
"SL2.40ch04": "CM001067.1",
"SL2.40ch05": "CM001068.1",
"SL2.40ch06": "CM001069.1",
"SL2.40ch07": "CM001070.1",
"SL2.40ch08": "CM001071.1",
"SL2.40ch09": "CM001072.1",
"SL2.40ch10": "CM001073.1",
"SL2.40ch11": "CM001074.1",
"SL2.40ch12": "CM001075.1",
}
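# chromosomeAlias maps the SL2.40 chromosome names used in the VCFs to their INSDC
# sequence accessions; each pair becomes a <SEQUENCE accession="..." label="..."/>
# element in the analysis XML. SL2.40ch00 has no accession here and its records are
# dropped via forbiddenChroms below.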
forbiddenChroms = [ 'SL2.40ch00' ]
#xml_conf = {
# "ANALYSIS": {
# "alias" : "morex_haruna_nijo",
# "center_name": "Plant Research International",
# "broker_name": "ENSEMBL GENOMES",
# "accession" : "ERZ016065", # analysis id
# "IDENTIFIERS": {
# "PRIMARY_ID": "ERZ016065",
# "SUBMITTER_ID": {
# "namespace": "IPK-Gatersleben",
# "_val" : "morex_haruna_nijo"
# }
# },
# "TITLE" : "Variations called from whole genome shotgun survey sequencing of Barley cv. Haruna Nijo for de novo assembly of gene-space",
# "DESCRIPTION": "Variations called as described in: A physical, genetic and functional sequence assembly of the barley genome, Nature 2012, 491:711, PMID:23075845",
# "STUDY_REF" : {
# "accession": "ERP001451",
# "refname" : "Haruna Nijo_WGS",
# "refcenter": "IPK-Gatersleben",
# "IDENTIFIERS": {
# "PRIMARY_ID": "ERP001451",
# "SUBMITTER_ID": {
# "namespace": "IPK-Gatersleben",
# "_val" : "Haruna Nijo_WGS"
# }
# }
# },
# "SAMPLE_REF": {
# "accession": "ERS140602",
# "label" : "MAPPING_harunaNijoMOREX_BWAv1.sorted.rmdup.bam",
# "refname" : "Haruna Nijo",
# "refcenter": "IPK-Gatersleben",
# "IDENTIFIERS": {
# "PRIMARY_ID": "ERS140602",
# "SUBMITTER_ID": {
# "namespace": "IPK-Gatersleben",
# "val" : "Haruna Nijo"
# }
# }
# },
# "ANALYSIS_TYPE": {
# "SEQUENCE_VARIATION": {
# "ASSEMBLY": {
# "STANDARD": {
# "accession": "GCA_000188115.1"
# }
# },
# #"SEQUENCE": {
# #"accession": "CAJW010010000.1",
# #"label" :"contig_10000"
# #},
# "EXPERIMENT_TYPE": "Whole genome sequencing"
# }
# },
# "FILES": {
# "FILE": {
# "filename" : "morex:haruna_nijo.final.vcf",
# "filetype" : "vcf",
# "checksum_method": "MD5",
# "checksum" : "5d6017fab64f8a9160e1aac3d7276e48"
# }
# },
# "ANALYSIS_LINKS": {
# "ANALYSIS_LINK": {
# "XREF_LINK": {
# "DB": "ENA-SUBMISSION",
# "ID": "ERA256779"
# }
# }
# }
# }
#}
#pprint.pprint( chromosomeAlias )
#pprint.pprint( xml_conf )
#print chromosomeAlias
#pprint.pprint( xml_conf )
##RF_001 S.lyc LA27061
#sample reference ERS398435
#assembly accession GCA_000188115.1
#assembly link ftp://ftp.solgenomics.net/genomes/Solanum_lycopersicum/assembly/build_2.40/S_lycopersicum_chromosomes.2.40.fa.gz
#pprint.pprint( xml_conf )
#sys.exit(0)
#sys.exit(0)
#sys.exit(0)
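ENA accessions for the samples in study PRJEB6659: one row per sample with its experiment, run, sample and secondary (BioSample) accessions and the sample unique name.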
Study accession  Experiment accession  Run accession  Sample accession  Secondary accession  Sample unique name
PRJEB6659        ERX512709             ERR553640      ERS495775         SAMEA2625653         S_pimp_CGN144983
PRJEB6659        ERX512710             ERR553641      ERS495776         SAMEA2625654         RIL_601_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512711             ERR553642      ERS495777         SAMEA2625655         RIL_603_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512712             ERR553643      ERS495778         SAMEA2625656         RIL_608_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512713             ERR553644      ERS495779         SAMEA2625657         RIL_609_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512714             ERR553645      ERS495780         SAMEA2625658         RIL_610_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512715             ERR553646      ERS495781         SAMEA2625659         RIL_611_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512716             ERR553647      ERS495782         SAMEA2625660         RIL_612_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512717             ERR553648      ERS495783         SAMEA2625661         RIL_614_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512718             ERR553649      ERS495784         SAMEA2625662         RIL_615_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512719             ERR553650      ERS495785         SAMEA2625663         RIL_618_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512720             ERR553651      ERS495786         SAMEA2625664         RIL_619_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512721             ERR553652      ERS495787         SAMEA2625665         RIL_622_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512722             ERR553653      ERS495788         SAMEA2625666         RIL_623_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512723             ERR553654      ERS495789         SAMEA2625667         RIL_624_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512724             ERR553655      ERS495790         SAMEA2625668         RIL_625_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512725             ERR553656      ERS495791         SAMEA2625669         RIL_626_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512726             ERR553657      ERS495792         SAMEA2625670         RIL_630_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512727             ERR553658      ERS495793         SAMEA2625671         RIL_631_S_lyc_LA2463_x_S_pimp_CGN144982
PRJEB6659        ERX512728             ERR553659      ERS495794         SAMEA2625672         RIL_634_S_lyc_LA2463_x_S_pimp_CGN144982