From 398ab7eb7f0bfc415c0da2e5bb22e6a41b3f83ec Mon Sep 17 00:00:00 2001 From: "Aflitos, Saulo Alves" <sauloalves.aflitos@wur.nl> Date: Mon, 1 Jun 2015 18:02:46 +0200 Subject: [PATCH] converter for multicolumn vcf. again --- vcfmerger/gen_makefile.py | 40 ++++++------- vcfmerger/gen_makefile.py.examples | 32 +++++----- vcfmerger/split_multicolumn_vcf.py | 96 ++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 36 deletions(-) create mode 100755 vcfmerger/split_multicolumn_vcf.py diff --git a/vcfmerger/gen_makefile.py b/vcfmerger/gen_makefile.py index a8199fe..c77a35e 100755 --- a/vcfmerger/gen_makefile.py +++ b/vcfmerger/gen_makefile.py @@ -10,62 +10,62 @@ timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') #/home/assembly/tomato150/programs/vcfmerger_ui/data/src/ara/indata -#./vcfmerger/aux/gen_makefile.py --input arabidopsis.csv --infasta TAIR10.fasta --size 50000 --project arabidopsis_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols +#./vcfmerger/gen_makefile.py --input arabidopsis.csv --infasta TAIR10.fasta --size 50000 --project arabidopsis_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols #make -f makefile_arabidopsis_50k # -#./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols +#./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols #make -f makefile_arabidopsis_xianwen_50k # -#./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_50k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton +#./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_50k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton #make -f makefile_arabidopsis_xianwen_50k_sing # -#./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_10k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols +#./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_10k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols #make -f makefile_arabidopsis_xianwen_10k # -#./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_10k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton +#./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_10k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton #make -f makefile_arabidopsis_xianwen_10k_sing # # # #/home/assembly/tomato150/programs/vcfmerger_ui/data/src/tom85 -#./vcfmerger/aux/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 10000 --project tom84_10k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +#./vcfmerger/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 10000 --project tom84_10k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols #make -f makefile_tom84_10k # -#./vcfmerger/aux/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 50000 --project tom84_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +#./vcfmerger/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 50000 --project tom84_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols #make -f makefile_tom84_50k # -#./vcfmerger/aux/gen_makefile.py --input short2.lst --filter-gff ITAG2.3_gene_models.gff3.gene.gff3 --project tom84_genes --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +#./vcfmerger/gen_makefile.py --input short2.lst --filter-gff ITAG2.3_gene_models.gff3.gene.gff3 --project tom84_genes --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols #make -f makefile_tom84_genes # -#./vcfmerger/aux/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000_introgression.gff --project tom84_10k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +#./vcfmerger/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000_introgression.gff --project tom84_10k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols #make -f makefile_tom84_10k_introgression # -#./vcfmerger/aux/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000_introgression.gff --project tom84_50k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +#./vcfmerger/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000_introgression.gff --project tom84_50k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols #make -f makefile_tom84_50k_introgression # # # #/home/assembly/tomato150/programs/vcfmerger_ui/data/src/RIL -#./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +#./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols #make -f makefile_RIL_50k # -#./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --cluster-no-cols +#./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --cluster-no-cols #make -f makefile_RIL_50k_mode_ril # -#./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril_greedy --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --cluster-no-cols +#./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril_greedy --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --cluster-no-cols #make -f makefile_RIL_50k_mode_ril_greedy # -#./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril_delete --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-delete --cluster-no-cols +#./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril_delete --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-delete --cluster-no-cols #make -f makefile_RIL_50k_mode_ril_delete # -#./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril_delete_greedy --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --concat-RIL-delete --cluster-no-cols +#./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril_delete_greedy --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --concat-RIL-delete --cluster-no-cols #make -f makefile_RIL_50k_mode_ril_delete_greedy # -#./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000.gff --project RIL_10k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +#./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000.gff --project RIL_10k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols #make -f makefile_RIL_10k SCRIPT_DIR = 'vcfmerger' -AUX_DIR = os.path.join(SCRIPT_DIR, 'aux') +AUX_DIR = os.path.join(SCRIPT_DIR) merger = os.path.abspath( os.path.join( SCRIPT_DIR, 'vcfmerger.py' ) ) @@ -76,8 +76,8 @@ walk_ram = os.path.abspath( os.path.join( SCRIPT_DIR, 'vcf_walk_ram.py' ) ) walk_sql = os.path.abspath( os.path.join( SCRIPT_DIR, 'vcf_walk_sql.py' ) ) cluster = os.path.abspath( os.path.join( SCRIPT_DIR, 'cluster.py' ) ) topng = os.path.abspath( os.path.join( SCRIPT_DIR, 'newick_to_png.py') ) -fasta_spacer = os.path.abspath( os.path.join( AUX_DIR , 'fasta_spacer.py' ) ) -tree_maker = os.path.abspath( os.path.join( AUX_DIR , 'FastTreeMP' ) ) +fasta_spacer = os.path.abspath( os.path.join( SCRIPT_DIR, 'fasta_spacer.py' ) ) +tree_maker = os.path.abspath( os.path.join( SCRIPT_DIR, 'FastTreeMP' ) ) class makewriter(object): @@ -372,7 +372,7 @@ def main(args): if infasta: - #vcfmerger/aux/fasta_spacer.py GENOME.fa 50000 + #vcfmerger/fasta_spacer.py GENOME.fa 50000 gff_cmd = "%s %s %s" % (fasta_spacer, infasta, size) writer.write( infasta, filter_gff, gff_cmd, nick='gff' ) diff --git a/vcfmerger/gen_makefile.py.examples b/vcfmerger/gen_makefile.py.examples index e0c91fc..7c642ca 100644 --- a/vcfmerger/gen_makefile.py.examples +++ b/vcfmerger/gen_makefile.py.examples @@ -1,56 +1,56 @@ /home/assembly/tomato150/programs/vcfmerger_ui/data/src/ara/indata -./vcfmerger/aux/gen_makefile.py --input arabidopsis.csv --infasta TAIR10.fasta --size 50000 --project arabidopsis_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols +./vcfmerger/gen_makefile.py --input arabidopsis.csv --infasta TAIR10.fasta --size 50000 --project arabidopsis_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols make -f makefile_arabidopsis_50k -./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols +./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols make -f makefile_arabidopsis_xianwen_50k -./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_50k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton +./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_50k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton make -f makefile_arabidopsis_xianwen_50k_sing -./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_10k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols +./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_10k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols make -f makefile_arabidopsis_xianwen_10k -./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_10k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton +./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff --project arabidopsis_xianwen_10k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton make -f makefile_arabidopsis_xianwen_10k_sing /home/assembly/tomato150/programs/vcfmerger_ui/data/src/tom85 -./vcfmerger/aux/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 10000 --project tom84_10k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +./vcfmerger/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 10000 --project tom84_10k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols make -f makefile_tom84_10k -./vcfmerger/aux/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 50000 --project tom84_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +./vcfmerger/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 50000 --project tom84_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols make -f makefile_tom84_50k -./vcfmerger/aux/gen_makefile.py --input short2.lst --filter-gff ITAG2.3_gene_models.gff3.gene.gff3 --project tom84_genes --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +./vcfmerger/gen_makefile.py --input short2.lst --filter-gff ITAG2.3_gene_models.gff3.gene.gff3 --project tom84_genes --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols make -f makefile_tom84_genes -./vcfmerger/aux/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000_introgression.gff --project tom84_10k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +./vcfmerger/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000_introgression.gff --project tom84_10k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols make -f makefile_tom84_10k_introgression -./vcfmerger/aux/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000_introgression.gff --project tom84_50k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +./vcfmerger/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000_introgression.gff --project tom84_50k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols make -f makefile_tom84_50k_introgression /home/assembly/tomato150/programs/vcfmerger_ui/data/src/RIL -./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols make -f makefile_RIL_50k -./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --cluster-no-cols +./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --cluster-no-cols make -f makefile_RIL_50k_mode_ril -./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril_greedy --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --cluster-no-cols +./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril_greedy --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --cluster-no-cols make -f makefile_RIL_50k_mode_ril_greedy -./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril_delete --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-delete --cluster-no-cols +./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril_delete --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-delete --cluster-no-cols make -f makefile_RIL_50k_mode_ril_delete -./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril_delete_greedy --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --concat-RIL-delete --cluster-no-cols +./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff --project RIL_50k_mode_ril_delete_greedy --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --concat-RIL-delete --cluster-no-cols make -f makefile_RIL_50k_mode_ril_delete_greedy -./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000.gff --project RIL_10k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols +./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000.gff --project RIL_10k --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols make -f makefile_RIL_10k diff --git a/vcfmerger/split_multicolumn_vcf.py b/vcfmerger/split_multicolumn_vcf.py new file mode 100755 index 0000000..bbf2be9 --- /dev/null +++ b/vcfmerger/split_multicolumn_vcf.py @@ -0,0 +1,96 @@ +#!/usr/bin/python + +import os +import sys +import string + +ignores = ['0/0', './.'] # reference, nocov + +valid_chars = frozenset("_%s%s" % (string.ascii_letters, string.digits)) +def sanitize(name): + return ''.join(c if c in valid_chars else '_' for c in name) + +def main(): + try: + infile = os.sys.argv[1] + except: + print "no input file given" + print sys.argv[0], "<INPUT MULTICOLUMN CSV>" + sys.exit(1) + + if not os.path.exists( infile ): + print "input file %s does not exists" % infile + sys.exit(1) + + if os.path.isdir( infile ): + print "input file %s is a folder" % infile + sys.exit(1) + + print "splitting %s" % infile + defs = [] + names = [] + outfiles = [] + num_cols = None + with open(infile) as fhd: + for line in fhd: + line = line.strip() + + if len(line) == 0: + continue + + if line.startswith("#"): # header + #print "HEADER", line + + if line.startswith("##"): # definition lines + #print "HEADER :: DEF", line + defs.append( line ) + + else: # column description + #print "HEADER :: COL", line + cols = line.split("\t") + num_cols = len(cols) + shared = cols[:9] #CHROM POS ID REF ALT QUAL FILTER INFO FORMA + names = cols[9:] + #print "HEADER :: COL :: SHARED", shared + #print "HEADER :: COL :: NAMES" , names + outfiles = [None]*len(names) + outlist = open("%s.lst" % infile, 'w') + for np, name in enumerate(names): + nof = ("%s_%0"+str(len("%d"%len(names)))+"d_%s.vcf") % (infile, np+1, sanitize(name)) + print ("creating %"+str(len("%d"%len(names)))+"d %-"+str(max([len(x) for x in names]))+"s to %s") % (np+1, name, nof) + nop = open( nof, 'w' ) + + # skipped valid + outfiles[np] = [name, nof, nop, 0 , 0] + + outlist.write("1\t%s\t%s\n" % (os.path.abspath(nof), name)) + + nop.write("\n".join(defs) + "\n") + nop.write("##Split from: %s column %d\n" % ( os.path.abspath(infile), np + 1) ) + nop.write("\t".join(shared)) + nop.write("\t%s\n" % name) + nop.flush() + + continue + + #print "DATA", line + cols = line.split("\t") + assert len(cols) == num_cols + shared = cols[:9] #CHROM POS ID REF ALT QUAL FILTER INFO FORMA + data = cols[9:] + #print "shared", shared + #print "data" , data + for pos, ndata in enumerate(data): + #outfiles[np] = [name, nof, 0, 0, nop] + if any([ndata.startswith(x) for x in ignores]): + outfiles[pos][3] += 1 # skipped + continue + outfiles[pos][4] += 1 # valid + outfiles[pos][2].write("\t".join(shared) + "\t%s\n" % ndata) + + for nop, ndata in enumerate(outfiles): + ndata[2].close() + print ("closing %"+str(len("%d"%len(outfiles)))+"d %-"+str(max([len(x[0]) for x in outfiles]))+"s :: %-"+str(max([len(x[1]) for x in outfiles]))+"s :: skipped %6d exported %6d total %7d") % (nop+1, ndata[0], ndata[1], ndata[3], ndata[4], ndata[3] + ndata[4]) + +if __name__ == '__main__': + main() -- GitLab