Commit c2ccc304 authored by Motazedi, Ehsan

Integrative pipeline for simulation, haplotyping and haplotyping evaluation of (polyploid) genomes using NGS reads
#!/usr/bin/env python
"""Converts the allele coding in SDhaP output, i.e. 1,2,3..., into the conventional VCF coding \
0,1,2,... In addition, variant id's are read from the corresponding vcf file and added as a column\
to the output. Written by Ehsan Motazedi, 19-08-2015, Wageningen UR"""
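# Illustrative example (documentation only, not part of the program's logic), assuming,
# as the code below does, that SDhaP numbers the variants from 0: for a tetraploid, an
# SDhaP output line such as
#     0   2 1 3 1
# (variant index 0 followed by the four homologue alleles in SDhaP's 1,2,3,... coding)
# is written out as
#     1   <VCF position of variant 1>   1   0   2   0
# i.e. the 1-based variant number, its position taken from the VCF file, and the alleles
# recoded to the conventional 0,1,2,... VCF coding.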
import sys
import getopt
import os
import os.path
import tempfile
import re
from shutil import move
class CustomException(Exception):
    def __init__(self, value):
        self.parameter = value
    def __str__(self):
        return repr(self.parameter)

def getinput(argv):
    SDhaPphasefile = ''
    outputfile = ''
    vcffile = ''
    try:
        opts, args = getopt.getopt(argv, "hp:o:v:", ["help", "phase=", "output=", "vcf="])
        if not (opts or args):
            opts = [('-h', '')]
        elif args:
            raise getopt.GetoptError("Input arguments not in the proper format!", None)
        else:
            pass
    except getopt.GetoptError as err:
        print str(err)
        print 'Command not correct! Try -h, --help!'
        sys.exit(2)
    else:
        for opt, arg in opts:
            if opt in ('-h', '--help'):
                print 'Convert the 1,2,3,... allele coding in the SDhaP phased solution to 0,1,2,...'
                print "In addition, variant id's are read from the corresponding vcf file and added as a column to the output."
                print 'Usage: ConvertAllelesSDhaP.py -p, --phase <SDhaP phased solution> -o, --output <output file> -v, --vcf <vcf file>'
                sys.exit()
            elif opt in ("-p", "--phase"):
                SDhaPphasefile = arg
            elif opt in ("-o", "--output"):
                outputfile = arg
            elif opt in ("-v", "--vcf"):
                vcffile = arg
        return (SDhaPphasefile, outputfile, vcffile)

if __name__ == "__main__":
    args = getinput(sys.argv[1:])
    try:
        outfile = None  # Output file stream
        infile = None   # Input file stream
        vcffile = None
        ploidy = 0
        for i in args:
            if i == '':
                print 'Usage: ConvertAllelesSDhaP.py -p, --phase <SDhaP phased solution> -o, --output <output file> -v, --vcf <vcf file>'
                raise IOError("output and/or input and/or vcf file not specified!")
            else:
                pass
        SDhaPphasefile = args[0]
        outputfile = args[1]
        vcffile = args[2]
        if not os.path.isfile(SDhaPphasefile):
            try:
                raise IOError(2, 'No such file or directory', SDhaPphasefile)
            except IOError as err:
                if not err.args:
                    err.args = ('',)
                err.args = err.args + ("Could not find SDhaP's original phasing solution!",)
                raise
        elif not os.path.isfile(vcffile):
            try:
                raise IOError(2, 'No such file or directory', vcffile)
            except IOError as err:
                if not err.args:
                    err.args = ('',)
                err.args = err.args + ("Could not find the vcf file!",)
                raise
        else:
            varpos_dic = dict()
            ploidy = -1
            # The VCF file is first filtered to be compatible with extractHAIRS, which produced the
            # SDhaP fragment file; myvcf should be the same VCF file that was given to extractHAIRS.
            with tempfile.NamedTemporaryFile(delete=False) as tmpvcf, open(vcffile, 'rU') as myvcf:
                for _line in myvcf:
                    if '#' not in _line:
                        genotype = re.split('/|\|', _line.rstrip().split()[9].split(':')[0])
                        if ploidy == -1 and '.' not in set(genotype):  # The ploidy is first determined from the VCF file, as it is needed later to remove probable homozygous variants.
                            ploidy = len(genotype)
                        if '.' not in set(genotype) and len(set(genotype)) > 1:  # Homozygous variants are omitted
                            tmpvcf.write(_line)
            if ploidy < 2:
                raise CustomException("ERROR: Unexpected genotype detected in the VCF file or no called genotypes detected! Provide the correct format!")
            with open(tmpvcf.name, 'rU') as tmpvcf2:  # The variant position is linked to the variant number
                _i = 0
                for _line in tmpvcf2:
                    _i += 1
                    varpos_dic[str(_i)] = _line.rstrip().split()[1]
            os.remove(tmpvcf.name)
            tmpvcf, _i = None, None
            # A temporary output is made first and the results are moved to the final output only if
            # everything goes smoothly; this protects any already existing file in case of error.
            with tempfile.NamedTemporaryFile(delete=False) as outfile, open(SDhaPphasefile, 'rU') as infile:
                for line in infile:
                    if line == "\n":
                        outfile.write(line)  # Keep blank lines between haplotype blocks
                    else:
                        columns = line.rstrip().split()
                        if columns[0] == "Block":
                            outfile.write(line)
                        else:
                            l_col = len(columns)
                            if l_col >= 3:
                                try:
                                    var_num = str(int(columns[0]) + 1)  # Get the variant number
                                    if ploidy > 2:  # Do NOT convert alleles for diploids, as the SDhaP diploid output is already 0,1,...
                                        for col_num in range(1, l_col):  # Change allele coding 1,2,3,... to 0,1,2,... for polyploid output
                                            srr = str(int(columns[col_num]) - 1)  # Temporary string...
                                            if int(srr) < 0:
                                                raise ValueError
                                            else:
                                                columns[col_num] = srr
                                    columns = (var_num,) + (varpos_dic.get(var_num),) + tuple(columns[1:])  # Add the corresponding VCF position to the variant number and dosage
                                    outfile.write('\t'.join(columns) + '\n')
                                except ValueError as err:
                                    if not err.args:
                                        err.args = ('',)
                                    err.args = err.args + ("Error in the index file! Allele coding should be 1,2,3,...\nZero or invalid allele values observed!",)
                                    raise
                            else:
                                raise CustomException("The SDhaP phase file is corrupt!")
    except IOError as e:
        print >> sys.stderr, "I/O error({0}): {1} {2}".format(e.errno, e.strerror, e.filename)
        print >> sys.stderr, str(e.args[-1])
        if outfile:
            os.remove(outfile.name)
        sys.exit(2)
    except ValueError as e:
        print >> sys.stderr, "ERROR: " + str(type(e)) + '\n' + '\n'.join(e.args)
        if outfile:
            os.remove(outfile.name)
        sys.exit(2)
    except CustomException as instance:
        print >> sys.stderr, instance
        if outfile:
            os.remove(outfile.name)
        sys.exit(2)
    except:
        print >> sys.stderr, "Unexpected Error: %s" % sys.exc_info()[0]
        if outfile:
            os.remove(outfile.name)
        raise
    else:
        move(outfile.name, outputfile)
        print "Successfully converted the alleles from 1,2,3,... to 0,1,2,... into {0}".format(outputfile)
        sys.exit()
#!/usr/bin/env python
"""Converts the HapCut input, i.e. the fargment matrix produced by extractHAIRS from .bam and .vcf files,
into HapTree or SDhaP input formats. Written by Ehsan Motazedi, 19-08-2015, Wageningen UR """
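# Illustrative example (documentation only, not part of the program's logic): a
# HapCut/extractHAIRS fragment line such as
#     2 read_001 3 010 9 1 II#I
# (two sub-fragments: alleles 0,1,0 starting at variant 3 and allele 1 at variant 9,
# with the base qualities in the last column) becomes, in HapTree format, the dictionary
#     {2: 0, 3: 1, 4: 0, 8: 1}
# (0-based variant indices mapped to VCF-coded alleles; key order may differ), while in
# SDhaP format the matrix layout is kept with the alleles recoded to 1,2,...:
#     2 read_001 3 121 9 2 II#I
# and the total numbers of reads and SNPs are prepended as the first two lines of the file.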
import sys
import getopt
import re
import os
import os.path
import tempfile
from shutil import move
class CustomException(Exception):
    def __init__(self, value):
        self.parameter = value
    def __str__(self):
        return repr(self.parameter)

def getinput(argv):
    fragmentfile = ''
    outputfile = ''
    outformat = ''
    try:
        opts, args = getopt.getopt(argv, "hf:o:x:", ["help", "fragment=", "output=", "outformat="])
        if args:
            raise getopt.GetoptError("Input arguments not in the proper format!", None)
        else:
            pass
    except getopt.GetoptError as err:
        print str(err)
        print 'Command not correct! Try -h, --help!'
        sys.exit(2)
    else:
        for opt, arg in opts:
            if opt in ('-h', '--help'):
                print 'FragmentPoly.py -f, --fragment <fragment> -o, --output <outputfile>, -x, --outformat <outputformat: SDhaP or HapTree>'
                sys.exit()
            elif opt in ("-f", "--fragment"):
                fragmentfile = arg
            elif opt in ("-o", "--output"):
                outputfile = arg
            elif opt in ("-x", "--outformat"):
                outformat = arg
        return (fragmentfile, outputfile, outformat)

if __name__ == "__main__":  # <<The __main__ module starts here!>>
    args = getinput(sys.argv[1:])
    try:
        infile = None     # Input stream for the fragment matrix
        outfile = None    # Output stream for the HapTree/SDhaP fragment file
        writefile = None
        for i in args:
            if i == '':
                print 'All parameters must be specified!\nUsage: FragmentPoly.py -f, --fragment <fragment> -o, --output <outputfile>, -x, --outformat <outputformat> (SDhaP or HapTree)'
                raise CustomException("output and/or input and/or format not specified!")
            else:
                pass
        fragmentfile = args[0]
        outputfile = args[1]
        outformat = args[2]
        if not (outformat == 'SDhaP' or outformat == 'HapTree'):
            print 'Output format must be either SDhaP or HapTree!'
            raise CustomException("Output format not recognized!")
        else:
            pass
        num_reads = 0
        num_snps = 0
        # A temporary output is made first and the results are written to the specified output only
        # if everything goes smoothly; this protects any already existing file in case of error.
        with tempfile.NamedTemporaryFile(delete=False) as outfile, open(fragmentfile, 'rU') as infile:
            for line in infile:
                columns = re.split('\s*|\t', line.rstrip())
                l_col = len(columns) - 1
                if (l_col >= 4) and (l_col % 2 == 0):
                    reads_start = columns[2:l_col:2]
                    hetro_sites = columns[3:l_col:2]
                    hetro_dict = {}
                    for i in range(0, len(reads_start)):
                        start_pos = int(reads_start[i])
                        for j in range(0, len(hetro_sites[i])):
                            hetro_dict[start_pos - 1] = int(hetro_sites[i][j])
                            start_pos += 1
                    if outformat == 'HapTree':
                        if len(hetro_dict) > 1:
                            num_reads += 1
                            num_snps = max(num_snps, max(hetro_dict.keys()))
                            outfile.write(str(hetro_dict))
                            outfile.write("\n")
                        else:
                            pass
                    else:  # if the format is SDhaP
                        if len(hetro_dict) > 1:
                            num_reads += 1
                            num_snps = max(num_snps, max(hetro_dict.keys()))
                            for col in range(3, l_col, 2):  # change allele coding 0,1,2,... to 1,2,3,...
                                srr = ""  # temporary string to convert VCF coding to SDhaP coding
                                for allele in range(0, len(columns[col])):
                                    srr += str(int(columns[col][allele]) + 1)
                                columns[col] = srr
                            outfile.write(" ".join(columns))
                            outfile.write("\n")
                        else:
                            pass
                else:
                    raise CustomException("The fragment matrix is corrupt!")
        if os.path.getsize(fragmentfile) > 0:
            num_snps += 1
        else:
            pass
        num_snps = str(num_snps)
        num_reads = str(num_reads)
        if outformat == 'SDhaP':
            seqs = ""
            with open(outfile.name, 'rU') as readfile, tempfile.NamedTemporaryFile(delete=False) as writefile:
                writefile.write(num_reads + "\n" + num_snps + "\n")
                for seqs in readfile:
                    writefile.write(seqs)
        else:
            pass
    except IOError as e:
        print >> sys.stderr, "I/O error({0}): {1} {2}".format(e.errno, e.strerror, e.filename)
        if outfile:
            os.remove(outfile.name)
        if writefile:
            os.remove(writefile.name)
        sys.exit(2)
    except ValueError:
        print >> sys.stderr, "Error in the fragment matrix! Invalid values observed!"
        if outfile:
            os.remove(outfile.name)
        if writefile:
            os.remove(writefile.name)
        sys.exit(2)
    except CustomException as instance:
        print >> sys.stderr, instance
        if outfile:
            os.remove(outfile.name)
        if writefile:
            os.remove(writefile.name)
        sys.exit(2)
    except:
        print >> sys.stderr, "Unexpected Error: %s" % sys.exc_info()[0]
        if outfile:
            os.remove(outfile.name)
        if writefile:
            os.remove(writefile.name)
        raise
    else:
        if outformat == 'HapTree':
            move(outfile.name, outputfile)
        else:
            os.remove(outfile.name)
            move(writefile.name, outputfile)
        print "%s SNPs written from %s reads." % (num_snps, num_reads)
        print "Output successfully written to {0} fragment file.".format(outputfile)
        sys.exit()
HAPLOSIM, version 1.5, Ehsan Motazedi (ehsan.motazedi@gmail.com), last modified: 06 09 2017
Introduction
============
**Haplosim** \(*haplosim.py*\) is a simulation pipeline to evaluate the performance of single-individual haplotyping algorithms that make use of (polyploid) next generation sequencing (NGS) reads. Given a reference DNA sequence in fasta format, the HaploGenerator software induces mutations at the single-nucleotide level, according to the chosen stochastic model, to simulate individual genomes with the desired ploidy level. NGS reads are then generated in silico for the chosen technology, Illumina (GA1, GA2, MiSeq, HiSeq 2000, HiSeq 2500) or PacBio (CCS and CLR), and haplotypes are reconstructed from those reads by three algorithms: HapCompass (Aguiar and Istrail 2013), HapTree (Berger *et al.* 2014) and SDhaP (Das and Bansal 2015). The accuracy of the estimated haplotypes is assessed against the original haplotypes using several measures, e.g. the Vector Error Rate (VER), the Phasing Accuracy Rate (PAR) and the proportion of missing variants. Optionally, the CPU time and memory consumption of each algorithm can also be obtained for each simulated genome. The quality evaluations for each algorithm are reported in that algorithm's DAT file, and the time/memory analysis is reported in a separate "timem" DAT file.
The pipeline is compatible with Python 2.7 and later 2.x versions, but NOT with Python 3, and makes use of several BASH commands. It is therefore designed for POSIX-oriented platforms, e.g. UNIX and Mac OS.
Requirements:
=============
To run *Haplosim*, you need the following input file(s):
1. A reference genome in fasta format from which artificially modified genomes are produced. The name of the target contig and the length and coordinates of the modified genomes can be specified as input parameters. In the random mode, genomic regions of the desired length are selected at random sites to generate the mutants; in this case, the number of randomly selected regions and the number of mutants for each region may be specified.
In addition, you need the following software (and its requirements) installed on your system:
1. ART to simulate Illumina reads (NOTE: *Haplosim* currently supports only the **ChocolateCherryCake** version!):
http://www.niehs.nih.gov/research/resources/software/biostatistics/art/
At least one of the following aligners is required:
2. Blasr (for PacBio)
https://github.com/PacificBiosciences/blasr
3. Bowtie 2 (for Illumina):
http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
4. BWA (Burrows-Wheeler Aligner) (for both Illumina and PacBio):
http://bio-bwa.sourceforge.net/
5. ExtractHAIRS from HapCUT to generate the fragment matrix from the BAM and VCF files:
https://github.com/vibansal/hapcut
6. FreeBayes to call the variants from the alignment:
https://github.com/ekg/freebayes
7. HapCompass_v0.8.2 to run HapCompass algorithm:
http://www.brown.edu/Research/Istrail_Lab/hapcompass.php
8. HapTree_v0.1 to run HapTree algorithm (wrapped by the python script HapTree_multiblock.py, see HapTree_multiblock --help):
http://groups.csail.mit.edu/cb/haptree/
9. PBSIM to simulate PacBio reads:
https://code.google.com/p/pbsim/
10. Picardtools (AddOrReplaceReadGroups.jar):
http://broadinstitute.github.io/picard/
11. Samtools for indexing, sorting and duplicate removal (assumes version 1.4):
https://github.com/samtools/samtools
12. SDhaP to run SDhaP algorithm (wrapped by the bash script hap2, see hap2 --help):
http://sourceforge.net/projects/sdhap/
Java must also be installed on the system, as it is necessary for running HapCompass and Picardtools.
Format of the output files:
================================
1. Three DAT files with the following fields; each line reports the values for one simulation, i.e. one region-mutant combination:
**CORR_ALL_DOSAGES**: Allelic correlation score between the estimated and original haplotypes, comparing all the variants common between the original and the estimate. This measure is equal to the probability that alleles at the same position are the same between the two haplotypes (a toy computation of this measure and of the pairwise-phasing rate is sketched after this list).
**PSR_ALL_DOSAGES**: Pairwise-phasing Accuracy Rate of the estimated haplotypes, comparing all the variants common between the original and estimates. This is the proportion of correct pairwise-phasings among the phasings of all possible pairs of SNPs.
**VER**: Vector Error Rate (Berger et al. 2014) of the estimated haplotypes, including the common variants whose dosage is correctly estimated, normalized by the number of these variants and the ploidy.
**PSR_CORRECT_DOSAGES**: like PSR_ALL_DOSAGES, excluding variants with wrongly estimated dosages.
**CORR_CORRECT_DOSAGES**: like CORR_ALL_DOSAGES, excluding variants with wrongly estimated dosages.
**MISSING_ESTIMATE**: Proportion of original SNPs missing in the estimate.
**SPURIOUS_ESTIMATE**: Ratio of spurious (false) variants in the estimate to the original variants.
**WRONG_DOSAGES_COMMON_SNPS**: Proportion of the common SNPs with wrong dosage in the estimate.
**PPV_ESTIMATE**: Positive predictive value of the output SNPs.
**NUM_GAPS**: Number of gaps introduced during the estimation of the haplotypes, i.e. the number of output haplotype blocks minus one, normalized by the number of SNPs.
**CORRECT_HOMOLOGUE_RATE**: The proportion of the correctly estimated homologues in the simulated population.
**ID_REGION**: ID of the reference genomic region used to simulate the mutant genome.
**ID_MUTATION**: Individual IDs of the simulated genomes. This was NOT given in version 1.0.
Also, in the --report-aln mode:
2. The fasta reference, fastq reads, aligned BAM and SAM files, and the VCF file for each simulation, named my{ref, reads, bam, sam, vcf}xy.{fa, fastq, bam, sam, vcf} for selected reference region number x and mutant number y. The VCF file contains the variants and genotypes of the single individual that is to be phased; make sure that it conforms to the standard VCF format, since HapCUT does not check this.
3. Log file containing all the generated and estimated haplotypes. By definition, the haplotypes include only the variant sites. The variant position (the same as in the VCF file) and the reference and alternative alleles are specified in the haplotypes. Each line represents a variant and its allelic dosage in the haplotypes.
In the verbose mode, the number of failures for each algorithm, as well as the error and warning messages and the total CPU time are written to the log file. If --report-aln is off, only problematic haplotypes are reported in the log.
Also, in the verbose mode:
4. A separate log file for the read generator, including the output, error and warning messages from ART and PBSIM.
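For illustration, the allelic-correlation and pairwise-phasing measures above can be computed for a toy example with the following stand-alone sketch. It only illustrates the definitions and is not the code used by *hapcompare.py*, whose exact conventions (e.g. the matching of homologues between the two phasings and the handling of wrong dosages) may differ; here the homologues are assumed to be already matched in the best order.

    from itertools import combinations

    def allelic_correlation(orig, est):
        """Fraction of (homologue, site) allele calls identical between two phasings,
        each given as a list of homologues (lists of alleles) of equal dimensions."""
        total = sum(len(h) for h in orig)
        same = sum(a == b for ho, he in zip(orig, est) for a, b in zip(ho, he))
        return float(same) / total

    def pairwise_phasing_rate(orig, est):
        """Fraction of SNP pairs whose joint phasing (the multiset of allele pairs
        over all homologues) is identical in the two phasings."""
        correct, pairs = 0, 0
        for i, j in combinations(range(len(orig[0])), 2):
            pairs += 1
            if sorted((h[i], h[j]) for h in orig) == sorted((h[i], h[j]) for h in est):
                correct += 1
        return float(correct) / pairs

    # toy tetraploid phasing over 3 SNPs, alleles in the 0,1,... VCF coding
    original = [[0, 0, 1], [0, 1, 1], [1, 0, 0], [1, 1, 0]]
    estimate = [[0, 0, 1], [0, 1, 1], [1, 0, 0], [1, 1, 1]]
    print(allelic_correlation(original, estimate))    # 11/12 ~ 0.92
    print(pairwise_phasing_rate(original, estimate))  # 1/3 ~ 0.33

Here the two tetraploid phasings differ in a single allele of the fourth homologue, which leaves 11 of the 12 allele calls identical but breaks two of the three pairwise phasings.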
Running the program:
=====================
Before running the program, the paths to the required programs must be set in the OS environment, e.g. by exporting the following variables:
ARTSPath, blasrPath, bowtiePath, bwaPath, FreebayesPath, HapCompassPath, HapCutPath, HapTreePath, PBSIMPath, PicardtoolsPath, samtoolsPath, SDhaPPath.
By default, each path points to the $HOME directory and a default sub-directory expected to contain the executable files; note that this default folder name may differ from the actual folder on your system. It is also possible to add the paths to the executable files to $PATH in the OS environment, in which case the variables above are not needed (except for *Picardtools* and *HapCompass*, which are run through java and whose correct paths must be exported to *Haplosim*).
For parallel execution of *HapTree\_multiblock*, the NCORES variable must be set in the OS environment to the desired number of cores.
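For example, assuming the tools have been unpacked under $HOME (the folder names below are only illustrative placeholders and must be adapted to your own installation):
export HapCompassPath="$HOME/hapcompass"
export PicardtoolsPath="$HOME/picard-tools"
export SDhaPPath="$HOME/SDhaP"
export NCORES=4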
Input parameters:
=====================
For the input parameters, see the help of the software:
./haplosim.py --help, -h
***Examples:***
Simulate tetraploid genomes (-p 4) from chromosome 5 of the PGSC\_DM\_v4.03 draft potato genome (Sharma *et al.* 2011), using a lognormal model with the mean and variance of the log distance between neighboring SNPs set to 3 and 1, respectively. 25 regions of 20kb length (-l 20e03) are randomly chosen from this reference, and from each of them 20 genomes are generated by random mutation (-r 25 20) according to the lognormal model. Paired-end reads are generated for the MiSeq technology with insert size 600 and a coverage of 15x per homologue. The proportions of simplex, duplex, triplex and tetraplex alleles are specified by --dosage. Results will be stored in the potato_15_MS_600_lognormal_20e03_HapCompass.dat, potato_15_MS_600_lognormal_20e03_HapTree.dat, potato_15_MS_600_lognormal_20e03_SDhaP.dat and
potato_15_MS_600_lognormal_20e03.log files.
haplosim.py -x 15 $HOME/PGSC_v4.03_resources/PGSC_DM_v4.03_CHR05.fasta potato_15_MS_600_lognormal_20e03 MS PE --insert-size 600 -p 4 -l 20e03 -r 25 20 --lognormal 3 1 --dosage 0.5 0.23 0.14 0.13
Change the coverage to 10x per homologue, the stochastic model to mixture Poisson with main mutation rate 0.04, and the technology to PacBio long reads (CLR). The aligner is set to bwa-mem (the default aligner for PacBio is blasr) and the time-out to 1200 seconds (default 500s). The --no-missing option makes the software keep all of the estimated haplotypes in the report. The dosage distribution is left at its default, i.e. equal probabilities for the alternative allele counts simplex, duplex, triplex and tetraplex.
haplosim.py -x 10 $HOME/PGSC_v4.03_resources/PGSC_DM_v4.03_CHR05.fasta potato_10_PacBio_poisson_20e03 CLR -p 4 -l 20e03 -r 15 20 -m 0.04 -t 1200 --no-missing --bwa-mem
A limit can also be set on the total number of haplotypes generated from each reference, using the --maxhap option. In that case, as many random haplotypes are generated as the number passed to --maxhap, and the homologues of each genome are selected by sampling with replacement from this pool.
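A minimal sketch of this sampling scheme, only to illustrate the idea (the actual selection is performed inside *haplosim.py* and may differ in detail):

    import random

    def assign_homologues(haplotype_pool, ploidy, n_genomes):
        """Build each simulated genome by drawing its homologues with replacement
        from a fixed pool of generated haplotypes (the --maxhap pool)."""
        return [[random.choice(haplotype_pool) for _ in range(ploidy)]
                for _ in range(n_genomes)]

    # e.g. 20 tetraploid genomes drawn from a pool of 8 haplotypes (represented by indices)
    genomes = assign_homologues(list(range(8)), 4, 20)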
**HaploGenerator** (*haplogenerator.py*) can be used independently to generate random genomes from a reference, by inducing mutations and indels under several stochastic models. To produce allopolyploid genomes, sub-genomes can be generated independently from the same reference genome by HaploGenerator and then combined with **allowrapper**. The output consists of a fasta file (and its index) for each homologue, as well as a text file containing the generated haplotypes.
***Examples:***
Generate a hexaploid genome from tetraploid and diploid sub-genomes, producing the files genHexa\_hap1, ..., genHexa\_hap6 and genHexa\_varianthaplos.txt:
haplogenerator.py -f $HOME/myref.fa -o genA --model poisson -s "[0.04,0,0]" -m "{'A':'C','G':'A','C':'T','T':'G'}" -p 4
haplogenerator.py -f $HOME/myref.fa -o genB --model lognormal -s "[3,9,0]" -m "{'A':'T','G':'C','C':'A','T':'GAC'}" -p 2 --sdlog "[1,1,0]"
allowrapper -p 6 -f $HOME/myref.fa genA genB genHexa
A wrapper script is provided for HapTree \(*HapTree_multiblock.py*\) to allow multiple haplotype blocks. A wrapper is also provided for SDhaP \(*hap2*\) to reformat its output so that it is comparable to the outputs of the other haplotyping software and of HaploGenerator.
**HapCompare** \(*hapcompare.py*\) can be used independently to compare haplotype files in the format of the outputs of *HaploGenerator*, *HapTree\_multiblock*, *hap2* and *HapCompass*.
Download:
=====================
The software is available at gitlab, Wageningen UR:
git clone git@git.wageningenur.nl:motaz001/Haplosim.git --recursive
Copyright and License:
=====================
Copyright (c) 2016, Ehsan Motazedi, Wageningen UR.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files \(the "Software"\), to deal in the Software without restriction, including without limitation the rights to use, copy, share and modify the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. This permission is only valid for academic and personal use, i.e. not for commercial purposes.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#!/bin/bash
# Wrapper to combine several haplogenerator outputs to generate a hybrid genome.
# Written by Ehsan Motazedi, Wageningen UR, 22-01-2016.
# Last modified: 06-09-2017.
usage(){
    echo 'Usage: allowrapper [-h, --help] -p, --ploidy int+ <ploidy level> -f, --ref <name of the reference in fasta format> infiles... <root name of subgenome files> outputfile <root name for output files>'
}
display_help(){
    echo 'Wrapper to allow simulation of allopolyploid genomes and haplotypes from haplogenerator output for the subgenomes. For example, with ploidy equal to "p", haplogenerator.py'
    echo 'should have been run "p/2" times to produce the diploid subgenomes. Polyploid subgenomes are also possible. Calling this wrapper combines the haplotypes and'
    echo 'renames the subgenome fasta files to produce the same output as if from a single run of haplogenerator. The "root names" must be given for the input and output files'
    echo 'and the ploidy of the allopolyploid must be specified in the input. With this method, different models might be considered for each subgenome. Note that the sum'
    echo 'of the ploidy levels of the subgenomes must be equal to the specified ploidy level of the allopolyploid. The new variant haplotype file will'
    echo 'code the alleles by 0, 1, 2, ... with zero being the nucleotide present in the given reference.'
    echo
    usage
    echo
    echo 'where:'
    echo '    infiles...          the root names of the ancestral subgenomes produced by haplogenerator'
    echo '    output              the root name for the files of the allopolyploid genome'
    echo '    -h, --help          show this help text'
    echo '    -f, --ref           the full name of the fasta file containing the common reference sequence for all of the subgenomes.'
    echo '    -p, --ploidy int+   the ploidy level of the allopolyploid genome'
}
contains() {
    string="$1"
    substring="$2"
    if test "${string#*$substring}" != "$string"; then
        return 0    # $substring is in $string
    else
        return 1    # $substring is not in $string
    fi
}
if [ "$1" = "" ]; then
    >&2 echo 'ERROR: no input has been given!'
    1>&2 usage
    exit 1
else
    declare -a FILES    # FILES=()
    while [ "$1" != "" ]; do
        if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
            display_help
            exit 0
        elif [ "$1" = "-p" ] || [ "$1" = "--ploidy" ]; then
            shift
            ploidy=$1
            if [ -z $ploidy ]; then
                printf "ERROR: you must specify a ploidy level!\n" 1>&2
                1>&2 usage
                exit 1
            fi
            test $ploidy -eq $ploidy 2> /dev/null
            if [ $? != 0 ]; then
                printf "ERROR: invalid argument %s passed to -p, --ploidy.\n" $1 1>&2
                1>&2 usage
                exit 1
            fi
            shift
        elif [ "$1" = "-f" ] || [ "$1" = "--ref" ]; then
            shift
            fastafile=$1
            if [ ! -f $fastafile ]; then
                printf "ERROR: the given reference %s does not exist!\n" $fastafile 1>&2
                1>&2 usage
                exit 1
            fi
            shift
        else
            echo $1 | grep -q "^-"
            if [ $? = 0 ]; then
                printf "ERROR: argument %s is not valid!\n" $1 1>&2
                1>&2 usage
                exit 1
            fi
            FILES+=("$1")
            shift
        fi
    done
fi
if [ "$(expr "$ploidy" : '^[0-9]\+$')" = "0" ] || [ "$ploidy" = "0" ]; then
    >&2 echo 'ERROR: invalid ploidy level specified. Ploidy must be a positive integer!'
    1>&2 usage
    exit 1
fi
#if [ -z ${fastafile+x} ]; then    # This will NOT report $fastafile="" as unset, while the next line DOES report $fastafile="" as unset
if [ -z $fastafile ]; then
    >&2 echo 'ERROR: no common reference has been given!'
    1>&2 usage
    exit 1
fi
declare -i nargs=${#FILES[@]}
if [ $nargs -eq 0 ]; then
    >&2 echo 'ERROR: no subgenome has been given!'
    1>&2 usage
    exit 1
elif [ $nargs -lt 2 ]; then
    >&2 echo 'ERROR: no output name has been specified!'
    1>&2 usage
    exit 1
else
    outname=${FILES[-1]}
    unset FILES[${#FILES[@]}-1]
fi
declare -i nhapsfatsa=0
declare -i nhapsindx=0
for name in ${FILES[*]}; do
    if contains "$name" "/"; then
        DIR="${name%/*}"
    else
        DIR=$(pwd)
    fi
    BASE="${name##*/}"
    if [ ! -d "$DIR" ]; then
        >&2 echo 'ERROR: the given directory does not exist for subgenome '$name\!
        exit 1
    fi
    if ! [ -f "$name"_varianthaplos.txt ]; then
        >&2 echo 'ERROR: haplotype file missing for subgenome '$name\!
        exit 1
    fi