From 398ab7eb7f0bfc415c0da2e5bb22e6a41b3f83ec Mon Sep 17 00:00:00 2001
From: "Aflitos, Saulo Alves" <sauloalves.aflitos@wur.nl>
Date: Mon, 1 Jun 2015 18:02:46 +0200
Subject: [PATCH] converter for multicolumn vcf. again

---
 vcfmerger/gen_makefile.py          | 40 ++++++-------
 vcfmerger/gen_makefile.py.examples | 32 +++++-----
 vcfmerger/split_multicolumn_vcf.py | 96 ++++++++++++++++++++++++++++++
 3 files changed, 132 insertions(+), 36 deletions(-)
 create mode 100755 vcfmerger/split_multicolumn_vcf.py

diff --git a/vcfmerger/gen_makefile.py b/vcfmerger/gen_makefile.py
index a8199fe..c77a35e 100755
--- a/vcfmerger/gen_makefile.py
+++ b/vcfmerger/gen_makefile.py
@@ -10,62 +10,62 @@ timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
 
 
 #/home/assembly/tomato150/programs/vcfmerger_ui/data/src/ara/indata
-#./vcfmerger/aux/gen_makefile.py --input arabidopsis.csv         --infasta TAIR10.fasta                                     --size 50000 --project arabidopsis_50k              --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input arabidopsis.csv         --infasta TAIR10.fasta                                     --size 50000 --project arabidopsis_50k              --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols
 #make -f makefile_arabidopsis_50k
 #
-#./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_50k      --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_50k      --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols
 #make -f makefile_arabidopsis_xianwen_50k
 #
-#./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_50k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton
+#./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_50k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton
 #make -f makefile_arabidopsis_xianwen_50k_sing
 #
-#./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_10k      --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_10k      --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols
 #make -f makefile_arabidopsis_xianwen_10k
 #
-#./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_10k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton
+#./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_10k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton
 #make -f makefile_arabidopsis_xianwen_10k_sing
 #
 #
 #
 #/home/assembly/tomato150/programs/vcfmerger_ui/data/src/tom85
-#./vcfmerger/aux/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 10000               --project tom84_10k               --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 10000               --project tom84_10k               --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 #make -f makefile_tom84_10k
 #
-#./vcfmerger/aux/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 50000               --project tom84_50k               --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 50000               --project tom84_50k               --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 #make -f makefile_tom84_50k
 #
-#./vcfmerger/aux/gen_makefile.py --input short2.lst --filter-gff ITAG2.3_gene_models.gff3.gene.gff3                         --project tom84_genes             --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input short2.lst --filter-gff ITAG2.3_gene_models.gff3.gene.gff3                         --project tom84_genes             --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 #make -f makefile_tom84_genes
 #
-#./vcfmerger/aux/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000_introgression.gff --project tom84_10k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000_introgression.gff --project tom84_10k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 #make -f makefile_tom84_10k_introgression
 #
-#./vcfmerger/aux/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000_introgression.gff --project tom84_50k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000_introgression.gff --project tom84_50k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 #make -f makefile_tom84_50k_introgression
 #
 #
 #
 #/home/assembly/tomato150/programs/vcfmerger_ui/data/src/RIL
-#./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k                        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k                        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 #make -f makefile_RIL_50k
 #
-#./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril               --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril               --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --cluster-no-cols
 #make -f makefile_RIL_50k_mode_ril
 #
-#./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril_greedy        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril_greedy        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --cluster-no-cols
 #make -f makefile_RIL_50k_mode_ril_greedy
 #
-#./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril_delete        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-delete --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril_delete        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-delete --cluster-no-cols
 #make -f makefile_RIL_50k_mode_ril_delete
 #
-#./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril_delete_greedy --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --concat-RIL-delete --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril_delete_greedy --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --concat-RIL-delete --cluster-no-cols
 #make -f makefile_RIL_50k_mode_ril_delete_greedy
 #
-#./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000.gff   --project RIL_10k                        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+#./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000.gff   --project RIL_10k                        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 #make -f makefile_RIL_10k
 
 SCRIPT_DIR   = 'vcfmerger'
-AUX_DIR      = os.path.join(SCRIPT_DIR, 'aux')
+AUX_DIR      = os.path.join(SCRIPT_DIR)
 
 
 merger       = os.path.abspath( os.path.join( SCRIPT_DIR, 'vcfmerger.py'    ) )
@@ -76,8 +76,8 @@ walk_ram     = os.path.abspath( os.path.join( SCRIPT_DIR, 'vcf_walk_ram.py' ) )
 walk_sql     = os.path.abspath( os.path.join( SCRIPT_DIR, 'vcf_walk_sql.py' ) )
 cluster      = os.path.abspath( os.path.join( SCRIPT_DIR, 'cluster.py'      ) )
 topng        = os.path.abspath( os.path.join( SCRIPT_DIR, 'newick_to_png.py') )
-fasta_spacer = os.path.abspath( os.path.join( AUX_DIR   , 'fasta_spacer.py' ) )
-tree_maker   = os.path.abspath( os.path.join( AUX_DIR   , 'FastTreeMP'      ) )
+fasta_spacer = os.path.abspath( os.path.join( SCRIPT_DIR, 'fasta_spacer.py' ) )
+tree_maker   = os.path.abspath( os.path.join( SCRIPT_DIR, 'FastTreeMP'      ) )
 
 
 class makewriter(object):
@@ -372,7 +372,7 @@ def main(args):
 
 
     if infasta:
-        #vcfmerger/aux/fasta_spacer.py GENOME.fa 50000
+        #vcfmerger/fasta_spacer.py GENOME.fa 50000
         gff_cmd = "%s %s %s"  % (fasta_spacer, infasta, size)
         writer.write( infasta, filter_gff, gff_cmd, nick='gff' )
 
diff --git a/vcfmerger/gen_makefile.py.examples b/vcfmerger/gen_makefile.py.examples
index e0c91fc..7c642ca 100644
--- a/vcfmerger/gen_makefile.py.examples
+++ b/vcfmerger/gen_makefile.py.examples
@@ -1,56 +1,56 @@
 /home/assembly/tomato150/programs/vcfmerger_ui/data/src/ara/indata
-./vcfmerger/aux/gen_makefile.py --input arabidopsis.csv         --infasta TAIR10.fasta                                     --size 50000 --project arabidopsis_50k              --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols
+./vcfmerger/gen_makefile.py --input arabidopsis.csv         --infasta TAIR10.fasta                                     --size 50000 --project arabidopsis_50k              --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols
 make -f makefile_arabidopsis_50k
 
-./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_50k      --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols
+./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_50k      --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols
 make -f makefile_arabidopsis_xianwen_50k
 
-./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_50k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton
+./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_50000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_50k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton
 make -f makefile_arabidopsis_xianwen_50k_sing
 
-./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_10k      --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols
+./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_10k      --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols
 make -f makefile_arabidopsis_xianwen_10k
 
-./vcfmerger/aux/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_10k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton
+./vcfmerger/gen_makefile.py --input arabidopsis_xianwen.csv --filter-gff TAIR10.fasta_10000.gff.Chr4.gff.inversion.gff              --project arabidopsis_xianwen_10k_sing --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --excluded-chrom chloroplast --excluded-chrom mitochondria --cluster-no-cols --simplify-include-singleton
 make -f makefile_arabidopsis_xianwen_10k_sing
 
 
 
 /home/assembly/tomato150/programs/vcfmerger_ui/data/src/tom85
-./vcfmerger/aux/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 10000               --project tom84_10k               --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+./vcfmerger/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 10000               --project tom84_10k               --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 make -f makefile_tom84_10k
 
-./vcfmerger/aux/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 50000               --project tom84_50k               --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+./vcfmerger/gen_makefile.py --input short2.lst --infasta S_lycopersicum_chromosomes.2.40.fa --size 50000               --project tom84_50k               --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 make -f makefile_tom84_50k
 
-./vcfmerger/aux/gen_makefile.py --input short2.lst --filter-gff ITAG2.3_gene_models.gff3.gene.gff3                         --project tom84_genes             --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+./vcfmerger/gen_makefile.py --input short2.lst --filter-gff ITAG2.3_gene_models.gff3.gene.gff3                         --project tom84_genes             --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 make -f makefile_tom84_genes
 
-./vcfmerger/aux/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000_introgression.gff --project tom84_10k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+./vcfmerger/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000_introgression.gff --project tom84_10k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 make -f makefile_tom84_10k_introgression
 
-./vcfmerger/aux/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000_introgression.gff --project tom84_50k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+./vcfmerger/gen_makefile.py --input short2.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000_introgression.gff --project tom84_50k_introgression --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 make -f makefile_tom84_50k_introgression
 
 
 
 /home/assembly/tomato150/programs/vcfmerger_ui/data/src/RIL
-./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k                        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k                        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 make -f makefile_RIL_50k
 
-./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril               --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --cluster-no-cols
+./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril               --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --cluster-no-cols
 make -f makefile_RIL_50k_mode_ril
 
-./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril_greedy        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --cluster-no-cols
+./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril_greedy        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --cluster-no-cols
 make -f makefile_RIL_50k_mode_ril_greedy
 
-./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril_delete        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-delete --cluster-no-cols
+./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril_delete        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-delete --cluster-no-cols
 make -f makefile_RIL_50k_mode_ril_delete
 
-./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril_delete_greedy --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --concat-RIL-delete --cluster-no-cols
+./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_50000.gff   --project RIL_50k_mode_ril_delete_greedy --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --concat-RIL --concat-RIL-greedy --concat-RIL-delete --cluster-no-cols
 make -f makefile_RIL_50k_mode_ril_delete_greedy
 
-./vcfmerger/aux/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000.gff   --project RIL_10k                        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
+./vcfmerger/gen_makefile.py --input RIL.lst --filter-gff S_lycopersicum_chromosomes.2.40.fa_10000.gff   --project RIL_10k                        --no-pickle --cluster-no-svg --smart_threads 25 --cluster-threads 5 --cluster-no-cols
 make -f makefile_RIL_10k
 
 
diff --git a/vcfmerger/split_multicolumn_vcf.py b/vcfmerger/split_multicolumn_vcf.py
new file mode 100755
index 0000000..bbf2be9
--- /dev/null
+++ b/vcfmerger/split_multicolumn_vcf.py
@@ -0,0 +1,96 @@
+#!/usr/bin/python
+
+import os
+import sys
+import string
+
+ignores = ['0/0', './.'] # genotype prefixes that main() counts as skipped instead of exporting: reference, nocov
+
+valid_chars = frozenset("_%s%s" % (string.ascii_letters, string.digits)) # characters sanitize() keeps unchanged
+def sanitize(name):  # return `name` with every character outside valid_chars replaced by '_'
+    return ''.join(map(lambda ch: ch if ch in valid_chars else '_', name))
+
+def main():  # split the multi-sample VCF named in argv[1] into one single-sample VCF per sample column
+    try:
+        infile = sys.argv[1]
+    except IndexError:  # only a missing argument means "no input"; anything else should surface
+        print "no input file given"
+        print sys.argv[0], "<INPUT MULTICOLUMN VCF>"
+        sys.exit(1)
+
+    if not os.path.exists( infile ):
+        print "input file %s does not exists" % infile
+        sys.exit(1)
+
+    if os.path.isdir( infile ):
+        print "input file %s is a folder" % infile
+        sys.exit(1)
+
+    print "splitting %s" % infile
+    defs     = []   # "##" definition lines, copied to the top of every output file
+    names    = []   # sample names: header columns from the 10th onwards
+    outfiles = []   # per sample: [name, output path, file handle, skipped count, exported count]
+    num_cols = None # column count of the column-description line; every data row must match it
+    with open(infile) as fhd:
+        for line in fhd:
+            line = line.strip() # NOTE(review): also strips leading/trailing tabs, i.e. empty edge columns
+
+            if len(line) == 0:
+                continue
+
+            if line.startswith("#"): # header
+                # "##" lines are collected; the single-"#" column line opens the outputs
+
+                if line.startswith("##"): # definition lines
+                    # only definitions seen before the column line reach the output files
+                    defs.append( line )
+
+                else: # column description
+                    # one output VCF per sample, plus "<infile>.lst" listing all of them
+                    cols     = line.split("\t")
+                    num_cols = len(cols)
+                    shared   = cols[:9] #CHROM    POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT
+                    names    = cols[9:]
+                    # output names embed the 1-based sample index, zero-padded to the
+                    # width of the sample count, followed by the sanitized sample name
+                    outfiles = [None]*len(names)
+                    outlist  = open("%s.lst" % infile, 'w')
+                    for np, name in enumerate(names):
+                        nof = ("%s_%0"+str(len("%d"%len(names)))+"d_%s.vcf") % (infile, np+1, sanitize(name))
+                        print ("creating %"+str(len("%d"%len(names)))+"d %-"+str(max([len(x) for x in names]))+"s to %s") % (np+1, name, nof)
+                        nop = open( nof, 'w' )
+
+                        #                               skipped valid
+                        outfiles[np] = [name, nof, nop, 0     , 0]
+
+                        outlist.write("1\t%s\t%s\n" % (os.path.abspath(nof), name))
+
+                        nop.write("\n".join(defs) + "\n")
+                        nop.write("##Split from: %s column %d\n" % ( os.path.abspath(infile), np + 1) )
+                        nop.write("\t".join(shared))
+                        nop.write("\t%s\n" % name)
+                        nop.flush()
+                    outlist.close() # the list file is complete; do not leave the handle open
+                continue
+
+            cols   = line.split("\t")
+            if len(cols) != num_cols: # explicit check: an assert would be stripped under "python -O"
+                raise ValueError("expected %s columns, found %d: %s" % (num_cols, len(cols), line))
+            shared = cols[:9] #CHROM    POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT
+            data   = cols[9:]
+            # each sample column is appended to its own file unless its value
+            # starts with one of the `ignores` prefixes, in which case it is only counted
+            for pos, ndata in enumerate(data):
+                # outfiles[pos] = [name, nof, nop, skipped, valid]
+                if any([ndata.startswith(x) for x in ignores]):
+                    outfiles[pos][3] += 1 # skipped
+                    continue
+                outfiles[pos][4] += 1 # valid
+                outfiles[pos][2].write("\t".join(shared) + "\t%s\n" % ndata)
+
+    for nop, ndata in enumerate(outfiles):
+        ndata[2].close()
+        print ("closing %"+str(len("%d"%len(outfiles)))+"d %-"+str(max([len(x[0]) for x in outfiles]))+"s :: %-"+str(max([len(x[1]) for x in outfiles]))+"s :: skipped %6d exported %6d total %7d") % (nop+1, ndata[0], ndata[1], ndata[3], ndata[4], ndata[3] + ndata[4])
+
+if __name__ == '__main__':
+    main()
-- 
GitLab