# PolyHaplotyper

# A haploblock (or block) is a set of bi-allelic markers (SNPs) that are assumed
# to have no recombinations withing the population studied. A good example is
# (the set of SNPs within) a contig.

# A haplotype is an allele of a haploblock: a unique combination of SNP alleles.

# PolyHaplotyper aims to infer the genetic composition of individuals in
# terms of haplotype dosages for a haploblock, based on SNP dosages, for any
# ploidy level.

# PolyHaplotyper should not be confused with PediHaplotyper, which is aimed
# at diploid individuals that are all part of a pedigree, and where the
# (bi- or multi-allelic) marker alleles have been phased before.
# In contrast, PolyHaplotyper can handle any ploidy level and uses unphased
# bi-allelic marker genotypes (i.e. marker allele dosages). It can work with
# unrelated individuals and/or F1 populations, but (so far) does not use
# segregation ratios.

#'@importFrom utils combn

#the following line serves to stop devtools::check complaining about
#some identifiers that do exist but it doesn't find because they are
#loaded from RData files and/or created in the global evironment:
utils::globalVariables(names=c("ahclist", "ahccompletelist", "ahcinfo"),
                       add=TRUE)

# General approach (not using segregation ratios)

# 1. List all the possible haplotypes (4 for 2 SNPs, 8 for 3 SNPs etc.)
# 2. For each of the dosage combinations, list the min and max nr of
#    each possible haplotype
# 3. Are there any (sets of) haplotypes that CAN BE assumed not to occur
#    given all observed dosages for all SNPs in the haploblock over all
#    individuals?
# 3a. Assuming all observed dosages are correct
# 3b. Assuming errors (occasional errors and/or shifts)
# 4. Assume that the smallest set of haplotypes is the correct one.
# 5. Assign the haplotype genotypes based on this set -
#    is there always a unique solution?

# Approach with F1 segregation ratios:
# 1. List all the possible haplotypes (4 for 2 SNPs, 8 for 3 SNPs etc.)
# 2. List all possible segregation ratios for these multiple haplotypes
#    (assuming polysomic, disomic and/or mixed inheritance)
# 3. Calculate the SNP dosage combinations and their expected frequencies
#    for all possible segregation ratios
# 4. Check which of these are close to the observed dosage segregations
#    and matches with the parental dosages
# 5. (Also if there are multiple F1's) select the acceptable solution with
#    the minimum number of haplotypes

# If there are 1 or more F1 populations and unstructured material:
# 1. Using the second approach, list the acceptable solutions for the F1's
#    (i.e. stop after step 4)
# 2. Over all samples, try each of these solutions (i.e. for each of the
#    possible F1 solutions, make sure that the min nr of each haplotype for
#    each dosage combination (step 2 of the general approach) is set to that
#    corresponding to the F1 solution); then the F1 haplotypes are certain to
#    be included.
# 3. Continue then from step 3 of the general approach (still for each of
#    the F1 solutions) and select the minimal solution.
# 4. Between those solutions based on the different F1 solutions, select the
#    best one (smallest set of haplotypes, least errors assumed, ...)

#'@title get all haplotypes for the given SNPs
#'@description Given a set of SNP (bi-allelic marker) names, generate all
#'possible haplotypes
#'@usage allHaplotypes(SNPnames)
#'@param SNPnames the names of the SNPs (or other bi-allelic markers) in the
#'haploblock (contig)
#'@return a matrix with SNPs in columns and all possible (2^nSNP) haplotypes
#'in rows (1: haplotype contains the dosage-counted SNP allele, 0: haplotype
#'contains the other SNP allele). The colnames are the SNPnames.
allHaplotypes <- function(SNPnames) {
  lst <- list()
  for (s in 1:length(SNPnames)) {
    lst[[s]] <- 0:1
  }
  eg <- as.matrix(expand.grid(lst, stringsAsFactors=FALSE))
  eg <- eg[, ncol(eg):1, drop=FALSE]
  colnames(eg) <- SNPnames
  eg
} #allHaplotypes

loadAllHaploCombList <- function(ploidy) {
  # Function is called only by allHaploComb.
  # Load ahclist and ahccompletelist into GlobalEnv if needed.
  # Checks variable ahcploidy in GlobalEnv. If that doesn't exist or
  # is not equal to ploidy, attempts to load ahclist into the GlobalEnv
  # from a file 'ahclist_nx.RData' in the current working directory (where n is
  # the ploidy). If that is successful, ploidy is stored in variable ahcploidy
  # in the GlobalEnv, else neither ahcploidy not ahclist is created or changed.
  # Return value is TRUE if the ahclist was already available or is loaded
  # succesfully, else FALSE.
  if (!exists("ahcinfo", envir=.GlobalEnv, inherits==FALSE)) {
    newinfo <- list()
    newinfo$ploidy <- 0 #to ensure loading a new ahclist
    newinfo$unsaved <- FALSE
    newinfo$complete_nSNP <- 0
    assign("ahcinfo", value=newinfo, envir=.GlobalEnv, inherits=FALSE)
  }
  if (ahcinfo$ploidy != ploidy) {
    #save any unsaved ahclist:
    if (ahcinfo$unsaved) {
      #the currently loaded ahclist is for a different ploidy and unsaved;
      #we save it before loading the ahclist for the current ploidy:
      save(ahclist, file=paste0("ahclist_", ahcinfo$ploidy,"x.RData"))
    }
    #load the ahccompletelist:
    x <- tryCatch(load(paste0("ahccompletelist_", ploidy,"x.RData"),
                       envir=.GlobalEnv),
                  error=function(e) "error")
    if (identical(x, "error")) {
      ahcinfo$complete_nSNP <- 0
    } else ahcinfo$complete_nSNP <- length(ahccompletelist)
    #load the ahclist:
    x <- tryCatch(load(paste0("ahclist_", ploidy,"x.RData"),
                       envir=.GlobalEnv),
                  error=function(e) "error")
    if (identical(x, "error"))
      assign("ahclist", value=list(), envir=.GlobalEnv, inherits=FALSE)
    #update ahcinfo:
    ahcinfo$ploidy <- ploidy
    ahcinfo$unsaved <- FALSE
  }
} #loadAllHaploCombList

#'@title get all haplotype combinations that result in the given SNP dosages
#'@description get all haplotype combinations that result in the given SNP dosages
#'@usage allHaploComb(SNPdosages=NULL, did=NULL, allhap, ploidy,
#'writeFile=TRUE, progress=FALSE)
#'@param SNPdosages an integer vector with the dosages of each SNP in one
#'individual, with the SNPs in the same order as in the columns of allhap.
#'Either SNPdosages or did must be specified, if both are specified they
#'should match but this is not checked
#'@param did dosage ID: in combination with ploidy specifying the SNPdosages.
#'Either SNPdosages or did must be specified, if both are specified they
#'should match but this is not checked
#'@param allhap a matrix as returned by allHaplotypes, for the same SNPs in the
#'same order as SNPdosages
#'@param ploidy the ploidy of the individual; SNPdosages should all be in
#'0:ploidy
#'@param writeFile The calculated haplotype combis are stored in list
#'ahclist in GlobalEnv. Default writeFile TRUE ensures that this list is written
#'to file each time a new SNPdid is calculated. With writeFile FALSE the list is
#'not written to file but a variable ahcunsaved with value FALSE is
#'created in GlobalEnv.
#'@param progress whether to print a message when starting calculations
#'for a did; default FALSE
#'@details Each column of the return value represents one combination of
#'haplotypes that yields the observed SNP dosages. If any of the SNP dosages are
#'NA or not in 0:ploidy, a matrix with 0 columns is returned.\cr
#'This function makes use of precalculated lists that are read from files
#'in the current working directory: ahccompletelist_nx.RData and
#'ahclist_nx.RData (where n is the ploidy). These lists ahccompletelist and
#'ahclist are stored in the GlobalEnv, as well as another list ahcinfo. Liust
#'ahcinfo may be extended with newly calculated results.
#'@return an integer matrix with one row per haplotype (corresponding to allhap)
#'and one column per combination of haplotypes, with the number of copies
#'of each haplotype in each combination (the dosages of the haplotypes)
allHaploComb <- function(SNPdosages=NULL, did=NULL, allhap, ploidy,
                         writeFile=TRUE, progress=FALSE) {
  # TODO: implement parallel computing of combinations for new did combinations,
  # https://artax.karlin.mff.cuni.cz/r-help/library/Rdsm/html/Rdsm-package.html
  # seems useful (using shared memory) but may need to delay saving ahclist
  # until all threads are finished?
  # Alternative: package parallel, but then no writing to common variables
  # like ahclist until all threads (foreach) are finished

  #initial checks:
  nSNP <- ncol(allhap)
  if (!missing(SNPdosages) && !is.null(SNPdosages) &&
      (length(SNPdosages) != nSNP || !all(SNPdosages %in% 0:ploidy)))
    stop("allHaploComb: invalid SNPdosages")
  if (missing(did) || is.null(did)) {
    if (missing(SNPdosages) || is.null(SNPdosages)) {
      stop("allHaploComb: SNPdosages or did must be specified")
    } else did <- SNPdidfun(SNPdosages, ploidy=ploidy)
  }

  loadAllHaploCombList(ploidy) #does nothing if list already in GlobalEnv
  # first check if this nSNP is in ahccompletelist:
  if (ahcinfo$complete_nSNP >= nSNP) {
    # in the ahccompletelist, for each nSNP the entire list is precalculated
    # by completeAllHaploComb, and the elements are directly indexed by the did:
    return(ahccompletelist[[nSNP]][[did]])
  }
  # else look it up in ahclist, and if needed calculate it:
  if (length(ahclist) >= nSNP)
    didix <- which(names(ahclist[[nSNP]]) == did)
  if (length(didix) == 1) {
    #the haplotype combinations for this SNPdid were already calculated earlier:
    return(ahclist[[nSNP]][[didix]])
  }
  # did was not present in ahclist so we add it:
  #we copy the list to a local tmplist, modify that,
  #store it in .GlobalEnv and save it, and return the matrix with haplotype
  #combinations.
  if (progress) cat(paste0("allHaploComb: calculation for SNPdid=", did, "\n"))
  tmplist <- ahclist
  if (length(tmplist) < nSNP) tmplist[[nSNP]] <- list()
  #calculate new matrix with haplotype combinations
  if (missing(SNPdosages)) SNPdosages <- dosFromSNPdid(did, nSNP, ploidy)
  allmat1 <- matrix(NA_integer_, nrow=ploidy, ncol=0) #matrix with haplotype sets in columns
  allset <- rep(nrow(allhap), ploidy) #current haplotype set: a full set has length ploidy
  while(length(allset) > 0) {
    # if the last allele is 0, change the next:
    if (length(allset) > 0 && allset[length(allset)] <= 0) {
      if (length(allset) == 1) {
        allset <- integer(0)
      } else allset <- allset[1:(length(allset) - 1)]
      if (length(allset) > 0)
        allset[length(allset)] <- allset[length(allset)] - 1
    } else if (all(colSums(allhap[allset,, drop=FALSE]) == SNPdosages)) {
      #solution found; fill up with nr 1 (dosage 0 for all SNPs) and add to allmat1
      allset <- c(allset, rep(1, ploidy-length(allset)))
      allmat1 <- cbind(allmat1, allset)
      allset[ploidy] <- 0 #make sure to change the next level
    } else {
      if (length(allset) < ploidy  &&
          !any(colSums(allhap[allset,, drop=FALSE]) > SNPdosages) &&
          allset[length(allset)] > 1) {
        # only add the next allele if none of the dosages too high;
        # adding allele 1 (all zeroes) won't lead to a solution
        allset <- c(allset, allset[length(allset)]) #allele nrs never increase
      } else {
        # change last allele
        allset[length(allset)] <- allset[length(allset)] - 1
      }
    }
  }
  if (ncol(allmat1) == 0) stop("allHaploComb: 0 solutions found")
  #convert allmat1 to a matrix with the frequencies of all haplotypes
  allmat <- matrix(NA_integer_, nrow=nrow(allhap), ncol=ncol(allmat1))
  for (col in seq_along(allmat1[1,]))
    allmat[,col] <- tabulate(allmat1[,col], nbins=nrow(allhap))

  #add allmat to tmplist and save as ahclist:
  didix <- length(tmplist[[nSNP]]) + 1
  tmplist[[nSNP]][[didix]] <- allmat
  names(tmplist[[nSNP]])[didix] <- did
  assign("ahclist", value=tmplist, envir=.GlobalEnv, inherits=FALSE)
  if (writeFile) {
    save(ahclist, file=paste0("ahclist_", ploidy,"x.RData"))
  } else ahcinfo$ahcunsaved <- TRUE
  allmat
} #allHaploComb

getHapcombCount_1SNP <- function(SNPdosage, ploidy, nhap) {
  #nhap = total nr of different haplotypes for nSNP SNPs = 2^nSNP
  #returns the number of haplotype combinations for a given dosage d at one SNP
  #currently not used
  d <- SNPdosage
  h <- nhap %/% 2 #as.integer(2 ^ (nSNP-1) + 0.001)
  if (d == ploidy) C0 <- 1 else
    C0 <- cumprod(seq(h, h+ploidy-d-1))[ploidy-d] / cumprod(seq_len(ploidy-d))[ploidy-d]
  if (d == 0) C1 <- 1 else
    C1 <- cumprod(seq(h, h+d-1))[d] / cumprod(seq_len(d))[d]
  C0 * C1
} #getHapcombCount_1SNP

totHapcombCount <- function(ploidy, nhap) {
  #nhap = total nr of different haplotypes for nSNP SNPs = 2^nSNP
  #returns the total number of haplotype combinations
  #currently not used
  cumprod(seq(nhap, nhap+ploidy-1))[ploidy] /
      cumprod(seq_len(ploidy))[ploidy]
} #totHapcombCount

#'@title generate all haplotype combinations
#'@description generate a list which contains for each SNP dosage combination
#'at a given ploidy all matching haplotytpe combinations
#'@usage completeAllHaploComb(ploidy, nSNP, savesec=1800, printsec=60,
#'fname=NULL)
#'@param ploidy ploidy (may be even or odd)
#'@param nSNP number of SNPs in the haploblock
#'@param savesec number of seconds between successive saves of the intermediate
#'results
#'@param printsec number of seconds between printout of the currect set of
#'haplotypes (the last two are always equal at the time of printing)
#'@param fname filename (to which the extension .RData will be added) to store
#'the results. Intermediate saves will go to fname + the current set of
#'haplotypes; these intermediate files are temporary and will be deleted.\cr
#'If NULL (default), fname will be set to e.g. ahc_4x_nSNP6, where 4 is the
#'ploidy and nSNP = 6
#'@return a list with for each SNPdid (each combination of SNP dosages) a
#'matrix with one row per haplotype and one column per haplotype combination,
#'containing the dosages of the haplotypes in each haplotype combination
#'@export
completeAllHaploComb <- function(ploidy, nSNP, savesec=1800, printsec=60,
                                 fname=NULL) {
  # this will generate a complete sub-list of an ahclist (see allHaploComb)
  # for the given ploidy and number of SNPs
  # and save it to a file called eg ahc_4x_nSNP7.RData
  # every savesec seconds a temporary file will be saved with the total nr of
  # SNPdids done so far appended to the filename

  if (is.null(fname)) fname <- paste0("ahc_", ploidy, "x_nSNP", nSNP)

  ndids <- as.integer((ploidy + 1) ^ nSNP + 0.001)
  ahc <- list()
  for (did in ndids:1) ahc[[did]] <- matrix(NA_integer_, nrow=ploidy, ncol=1)
  lasthap <- rep(0, ndids) # last stored haplotype column for each did
  allhap <- allHaplotypes(paste0("m", 1:nSNP))
  print(allhap)
  lastfile <- ""
  lastsave <- proc.time()
  lastprint <- lastsave - printsec - 1
  #new approach:
  #instead of taking all SNPdosages in turn and checking which hapcomb
  #matches them, generate all hapcombs in turn and calculate their SNPdids
  nhap <- nrow(allhap)
  allset <- rep(nhap, ploidy) #current haplotype set: a full set has length ploidy
  while(TRUE) {
    SNPdos <- colSums(allhap[allset,, drop=FALSE])
    did <- SNPdidfun(SNPdos, ploidy=ploidy)
    lasthap[did] <- lasthap[did] + 1
    if ((nc<-ncol(ahc[[did]])) < lasthap[did]) {
      #we don't want to reallocate for each new column but increase size
      #when needed by 50%
      extmat <- matrix(NA_integer_, nrow=ploidy, ncol= nc %/% 2 + 2)
      ahc[[did]] <- cbind(ahc[[did]], extmat)
    }
    ahc[[did]][,lasthap[did]] <- allset
    # get the next allset:
    len <- ploidy
    allset[len] <- allset[len] - 1
    while (len > 0 && allset[len] <= 0) {
      len <- len - 1
      if (len > 0) allset[len] <- allset[len] - 1
    }
    if (len <= 0) break
    if (len < ploidy) {
      allset[(len+1):ploidy] <- allset[len] #nhap
      #progress and save: only check when len < ploidy
      if ((proc.time()-lastprint)[3] > printsec) {
        cat(paste0("ploidy: ", ploidy, " nSNP: ", nSNP,
                   " allset: ", paste(allset, collapse=" "), "\n"))
        lastprint <- proc.time()
        if ((lastsave-lastprint)[3] > savesec) {
          save(ahc, file=paste0(fname, "_", paste(allset, collapse="-"),
                                ".RData"))
          if (file.exists(lastfile)) file.remove(lastfile)
          lastfile <- fname
          lastsave <- proc.time()
        }
      }
    } # len < ploidy
  }
  # all elements of ahc are now matrices with one solution per column,
  # in the form of sets like 1-1-2-4 (i.e. all occurring haplotypes listed).
  # now we convert that to columns with the dosages of each haplotype:
  #
  for (did in seq_along(ahc)) {
    #ncomb <- ncol(ahc[[did]])
    allmat <- matrix(NA_integer_, nrow=nhap, ncol=lasthap[did])
    for (col in seq_len(lasthap[did]))
      allmat[,col] <- tabulate(ahc[[did]][,col], nbins=nhap)
    ahc[[did]] <- allmat
  }
  save(ahc, file=paste0(fname, ".RData"))
  if (file.exists(lastfile)) file.remove(lastfile)
  invisible(ahc)
} #completeAllHaploComb

#'@title calculate the SNP dosages resulting from haplotype combinations
#'@description calculate the SNP dosages resulting from haplotype combinations
#'@usage haplocomb2SNPdosages(haplocomb, allhap)
#'@param haplocomb a matrix with one column per combination of haplotypes
#'and one row for each possible haplotype (corresponding to the rows of allhap)
#'with dosage of the haplotypes in each combination. A vector is interpreted
#'as a one-column matrix; all columns must sum to ploidy
#'@param allhap a matrix as returned by allHaplotypes
#'@return a matrix with columns corresponding to the columns of haplocomb
#'and one row for each SNP, with the dosages of each SNP in each combination;
#'colnames are the SNPdids (SNP dosage IDs), rownames are the SNP names taken
#'from allhap
#'@details if haplocomb contains NA values, all values in the corresponding
#'column of the result will also be NA
haplocomb2SNPdosages <- function(haplocomb, allhap) {
  if (is.vector(haplocomb)) haplocomb <- as.matrix(haplocomb)
  if (nrow(haplocomb) != nrow(allhap)) stop("haplocomb2SNPdosages: input error")
  res <- t(t(haplocomb) %*% allhap)
  colnames(res) <- SNPdid(res, ploidy=sum(haplocomb[, 1]), check=FALSE)
  res
} #haplocomb2SNPdosages

#'@title get the min and max dosage of each haplotype over all
#'haplotype combinations
#'@description get the min and max dosage of each haplotype over all
#'haplotype combinations
#'@usage haplofrqMinMax(haplocomb)
#'@param haplocomb a matrix as returned by allHaploComb (haplotypes in rows,
#'each column a different combination of haplotypes that matches the
#'SNP dosages, with the number of times each haplotype occurs in that
#'combination); a vector is interpreted as a one-column matrix
#'@return a matrix with haplotypes in rows, and columns min and max with the
#'minimum and maximum times each haplotype occurs over all different
#'haplotype combinations
haplofrqMinMax <- function(haplocomb) {
  if (is.vector(haplocomb)) haplocomb <- as.matrix(haplocomb)
  minmax <- matrix(NA_integer_, nrow=nrow(haplocomb), ncol=2)
  colnames(minmax) <- c("min", "max")
  minmax[,1] <- apply(haplocomb, 1, min)
  minmax[,2] <- apply(haplocomb, 1, max)
  minmax
} #haplofrqMinMax

#'@title check a SNP dosages matrix or data.frame
#'@description check a SNP dosages matrix or data.frame, select columns,
#'convert to matrix
#'@usage checkSNPdosages(SNPdosages, indiv=NULL, ploidy)
#'@param SNPdosages matrix or data.frame. SNPs are in rows, individuals in
#'columns, each cell has a SNP dosage. Names of individuals are the column
#'names, SNP names are the row names or (if a data.frame) in a column named
#'MarkerNames. All SNP dosages must be in 0:ploidy or NA.
#'@param indiv NULL (default) or a character vector with names of individuals
#'to be selected. If NULL, all columns are selected except, if SNPdosages is
#'a data.frame, the column MarkerNames.
#'@param ploidy all dosages are checked to be in 0:ploidy or NA
#'@return a matrix with the selected columns in the order of indiv, with
#'names of individuals as column names, SNP names as row names
checkSNPdosages <- function(SNPdosages, indiv=NULL, ploidy) {
  message <- ""
  if (is.vector(SNPdosages)) {
    # 1 col; names -> rownames
    dosmat <- as.matrix(SNPdosages)
    colnames(dosmat) <- "1" #name of one individual
  }
  if (is.data.frame(SNPdosages)) {
    mrkcol <- which(tolower(substr(names(SNPdosages), 1, 10)) == "markername")
    if (length(mrkcol) == 1) {
      rownames(SNPdosages) <- as.character(SNPdosages[, mrkcol])
      SNPdosages <- SNPdosages[, -mrkcol]
    }
    if (is.null(names(SNPdosages)))
      names(SNPdosages) <- seq_len(length(SNPdosages))
    if (is.null(indiv)) indiv <- names(SNPdosages)
    indcol <- match(indiv, names(SNPdosages))
    if (anyNA(indcol)) stop("Not all indiv occur in SNPdosages")
    numeric <- sapply(SNPdosages[,indiv], is.numeric)
    logical <- sapply(SNPdosages[,indiv], is.logical)
    if (!all(numeric | logical))
      stop("Not all selected columns of SNPdosages are numeric")
    dosmat <- as.matrix(SNPdosages[,indcol])
  } else if (!is.matrix(SNPdosages) ||
             !(is.numeric(SNPdosages) || is.logical(SNPdosages))) {
    stop("SNPdid: not all selected columns if SNPdosages are numeric")
  } else {
    #SNPdosages is already a numeric matrix
    if (is.null(colnames(SNPdosages)))
      colnames(SNPdosages) <- seq_len(ncol(SNPdosages))
    if (is.null(indiv)) indiv <- colnames(SNPdosages)
    indcol <- match(indiv, colnames(SNPdosages))
    if (anyNA(indcol)) stop("Not all indiv occur in SNPdosages")
    dosmat <- SNPdosages[, indcol]
  }
  #now dosmat is a numeric matrix with colnames
  if (!all(dosmat %in% c(NA, 0:ploidy)))
    stop("Selected columns of SNPdosages contain values not in 0:ploidy")
  if (is.null(rownames(dosmat)))
    rownames(dosmat) <- paste0("mrk", padded(seq_len(nrow(dosmat))))
  dosmat
} #checkSNPdosages

#We will not pre-calculate all haplotype combinations for all possible
#SNP dosage combinations (as there might be very many, depending of nr of SNPs
#and ploidy), but only those that actually occur (possibly including those
#with some missing SNP dosages, not implemented yet)
#We need a function that calculates a SNP dosage combination ID:

SNPdidfun <- function(dos, ploidy) {
  #for use within SNPdid, also called by allHaplocomb
  id <- 0
  fac <- 1
  for (i in length(dos):1) {
    id <- id + fac * dos[i]
    fac <- fac * (ploidy + 1)
  }
  id +1
} #SNPdidfun within SNPdid


#'@title get SNP dosage IDs from SNP dosages
#'@description get SNP dosage IDs (SNPdid)from SNP dosages
#'@usage SNPdid(SNPdosages, indiv=NULL, ploidy, check=TRUE)
#'@param SNPdosages matrix or data.frame. SNPs are in rows, individuals in
#'columns, each cell has a SNP dosage. Names of individuals are the column
#'names, SNP names are the row names or (if a data.frame) in a column named
#'MarkerNames. All SNP dosages must be in 0:ploidy or NA. If a data.frame,
#'additional columns may be present.
#'@param indiv NULL (default) or a character vector with names of individuals
#'to be selected. If NULL, all columns are selected;
#'if SNPdosages is a data.frame, that is probably not what is intended.
#'@param ploidy all SNP dosages are checked to be in 0:ploidy or NA
#'@param check if TRUE (default) checkSNPdosages is called. If FALSE it is
#'assumed that SNPdosages is a matrix (not a data.frame) and it is not checked.
#'@return a vector of SNP dosage IDs, one for each column of SNPdosages:
#'each a number in 1:((ploidy+1)^nrow(SNPdosages)), NA for each column in
#'dosages where any of the dosages are NA
#'@export
SNPdid <- function(SNPdosages, indiv=NULL, ploidy, check=TRUE) {
  if (check) {
    SNPdosages <- checkSNPdosages(SNPdosages, indiv, ploidy)
    if (is.character(SNPdosages)) stop(SNPdosages)
  }
  apply(SNPdosages, MARGIN=2, FUN=SNPdidfun, ploidy=ploidy)
} #SNPdid

#'@title get the SNP dosages from SNPdids (SNP dosage IDs)
#'@description get the SNP dosages from SNPdids (SNP dosage IDs)
#'@usage dosFromSNPdid(dosageIDs, nSNP, ploidy)
#'@param dosageIDs vector of SNP-dosage-combination-IDs (SNPdid)
#'@param nSNP nr of SNPs in the haploblock (contig)
#'@param ploidy the ploidy level, a single positive integer
#'@return a matrix with in columns the SNP dosages corresponding to the
#'SNP dosageIDs, with these SNPdids as colnames, and one row per SNP, with
#'SNP names as rownames
#'@export
dosFromSNPdid <- function(dosageIDs, nSNP, ploidy) {
  SNPdos <- matrix(NA_integer_, nrow=nSNP, ncol=length(dosageIDs))
  colnames(SNPdos) <- dosageIDs # no problem with NA
  if (is.character(dosageIDs)) dosageIDs <- as.integer(dosageIDs)
  pp1 <- ploidy + 1
  for (i in seq_along(dosageIDs)) if (!is.na(dosageIDs[i])) {
    di <- dosageIDs[i] - 1
    fac <- as.integer(round(pp1 ^ (nSNP-1), 0))
    snp <- 1
    while (snp <= nSNP) {
      SNPdos[snp, i] <- di %/% fac
      di <- di %% fac
      fac <- fac %/% pp1
      snp <- snp + 1
    }
  }
  SNPdos
} #dosFromSNPdid

#'@title infer haplotypes from SNP dosages
#'@description infer haplotypes from SNP dosages, treating all individuals
#'as unrelated
#'@usage inferHaplotypes(SNPdosages, indiv, ploidy, minfrac=c(0.1, 0.01),
#'sel_hap=integer(0), progress=TRUE)
#'@param SNPdosages matrix or data.frame. SNPs are in rows, individuals in
#'columns, each cell has a SNP dosage. Names of individuals are the column
#'names, SNP names are the row names or (if a data.frame) in a column named
#'MarkerNames. All SNP dosages must be in 0:ploidy or NA. If a data.frame,
#'additional columns may be present.
#'@param indiv NULL (default) or a character vector with names of individuals
#'to be selected. If NULL, all columns are selected;
#'if SNPdosages is a data.frame, that is probably not what is intended.
#'@param ploidy all SNP dosages should be in 0:ploidy or NA
#'@param minfrac a vector of 1 or 2 fractions, the second smaller than the
#'first. A haplotype is considered to be certainly present if it occurs
#'in at least a fraction minfrac[1] of all individuals; default 0.1. For the
#'meaning of the optional second value in minfrac see Details.
#'@param sel_hap selected haplotypes (haplotypes that must be present according
#'to prior inference or knowledge, numbers refer to rows of matrix produced by
#'allHaplotypes); default none (integer(0))
#'@param progress if TRUE, and new haplotype combinations need to be calculated,
#'and the number of SNPs and the ploidy are both >= 6, progress is indicated
#'by printed messages
#'@details The returned list contains some general calculations and statistics
#'based on SNPdosages and ploidy, and some parts that are the result of the
#'inference. The primary of these is hclist: this contains for each SNPdid in
#'the population the most likely combination of haplotypes (sometimes
#'more than one, if several are equally likely = all have the maximum dosage
#'of haplotypes inferred to be present).\cr
#'Principle: first the haplotypes are derived that must be present in at least
#'a fraction minfrac of the individuals (because those haplotypes occur in all
#'possible haplotype configurations that result in the observed marker dosages).
#'In subsequent iterations, for any individuals that has multiple possible
#'haplotype configurations, the one(s) that has/have the highest total
#'dosage of those haplotypes are assumed to be the correct ones. This may lead
#'to additional haplotypes that must be present. The iterations continue
#'until no new haplotypes are added to the must-be-present list.\cr
#'During this process there may be additional haplotypes assigned to some
#'SNPdids, that are not frequent enough to reach the minfrac[1] criterion. If
#'there is a second fraction in minfrac, in a final round we also consider
#'those extra haplotypes as must-be-present, provided they occur in at least
#'a fraction minfrac[2] of all individuals (minimum 2 individuals). If that
#'leads to more SNPdids having a unique haplotype combination AND not having
#'other SNPdids getting more combination the new assignments are used.\cr
#'#'If sel_hap in the function call already contains haplotypes, these are
#'considered to be certainly present, even if they don't meet the minfrac
#'criterion. This may help to reduce the number of haplotype configurations
#'for some individuals, but may also increase them.
#'@return a list with elements:
#'\itemize{
#'\item{nPresent: a vector with for each haplotype the number of
#'individuals in which it must occur}
#'\item{nAbsent: a vector with for each haplotype the number of
#'individuals in which it cannot occur}
#'\item{minfrac: same as parameter minfrac}
#'\item{allhap: a matrix as returned by allHaplotypes}
#'\item{dosmat: a  matrix with SNP dosages for all individuals in indiv
#'(produced by calling checkSNPdosages on SNPdosages)}
#'\item{SNPdids: a vector of the SNPdid (SNP dosage ID) for each individual in
#'dosmat (each combination of SNP dosages has its own ID; if any of the SNPs
#'has an NA dosage the corresponding SNPdid is also NA)}
#'\item{SNPdidsTable: a table of the counts of all non-NA SNPdids in the
#'population}
#'\item{hclist: a list in which each element has the name of one of the SNPdids
#'in SNPdidsTable, and contains a matrix with one column for each of the
#'remaining haplotype combinations: (a subset of the columns of) the matrix
#'returned by allHaploComb for that SNPdid}
#'}
#'@export
inferHaplotypes <- function(SNPdosages, indiv, ploidy, minfrac=c(0.1, 0.01),
                               sel_hap=integer(0), progress=TRUE) {
  dosmat <- checkSNPdosages(SNPdosages, indiv, ploidy)
  if (is.character(dosmat)) stop(dosmat)
  orig_hap <- sel_hap
  SNPdids <- SNPdid(SNPdosages=dosmat, ploidy=ploidy, check=FALSE)
  SNPdidsTable <- table(SNPdids)
  allhap <- allHaplotypes(rownames(dosmat))
  calcdids <- loadAllHaploCombList(ploidy)
  calcdids <- calcdids && length(ahclist) >= nrow(dosmat)
  if (calcdids) {
    calcdids <- which(!(names(SNPdidsTable) %in%
                          names(ahclist[[nrow(dosmat)]])))
  } else calcdids <- seq_along(SNPdidsTable)
  progress <- progress && length(calcdids) > 0 &&
    nrow(dosmat) >= 6 && ploidy >= 6
  if (length(calcdids) > 0 && nrow(dosmat) + ploidy >= 12) {
    cat(paste("inferHaplotypes:", length(calcdids),
              "haplotype combinations need to be calculated;\n"))
    cat("this may take quite a long time\n")
  }
  nind <- sum(SNPdidsTable) #includes only individuals with no missing SNP data
  cycle <- 0
  while (TRUE) {
    cycle <- cycle + 1
    res <- single_cycle_infer(allhap=allhap, SNPdidsTable=SNPdidsTable,
                              ploidy=ploidy, sel_hap=sel_hap, progress=progress)
    sel_new <- sort(union(orig_hap, which(res$nPresent >= minfrac[1] * nind)))
    haploLost <- setdiff(sel_hap, sel_new)
    if (length(haploLost) > 0) {
      #within these cycles haplotypes can only be gained, not lost, (except
      #the orig_hap which need not be supported by the current data, but
      #these have been added back to sel_new)
      #a warning here means something (the theory?) went wrong
      warning(paste0("inferHaplotypes: lost in cycle ", cycle, ": ",
                    paste(haploLost, collapse=" ")))
      # it seems that it is possible to lose previous must-have haplotypes,
      # but is does not happen often.
      # We must prevent infinite looping, so we store the sel_hap and
      # if the situation occurs again with the same sel_hap we break with
      # an empty sel_hap
      if (exists("prev_sel_hap")) {
        if (any(apply(prev_sel_hap, 1, identical, sel_hap))) {
          sel_hap <- orig_hap
          break
        }
        rbind(prev_sel_hap, sel_hap)
      } else {
        prev_sel_hap <- matrix(sel_hap, nrow=1)
      }
    }
    #haploGained <- setdiff(sel_new, sel_hap)
    #if (length(haploGained) == 0) break;
    if (identical(sel_new, sel_hap)) break #new, check
    sel_hap <- sel_new
  }
  #Now we have haplotype combinations assignments for all SNPdids that
  #have a unique remaining hapcomb, based on the final sel_hap set of
  #haplotypes that must be present.
  #There may be additional haplotypes assigned to some SNPdids, that did not
  #make it into sel_hap because they did not reach the minfrac criterion.
  #Finally we may consider also those extra haplotypes as must-be-present
  #(or some of them, above a certain threshold: occurring in at least 2 and
  #at least 1% of the individuals) and see if that leads to more SNPdids
  #having a unique hapcomb (and not having other SNPdids getting additional
  #hapcombs)
  if (length(minfrac) > 1 && (minfrac[2] < minfrac[1])) {
    SNPdidcolnum <- vapply(res$hclist[names(SNPdidsTable)], FUN=ncol, FUN.VALUE=0)
    onecomb <- SNPdidcolnum == 1
    if (any(!onecomb)) {
      hapcomb <- do.call(cbind, res$hclist[names(SNPdidsTable)[onecomb]])
      #calculate the nr of individuals in which each haplotype occurs:
      #select the SNPdids for which only 1 haplotype combination is possible,
      #and count the nr of indiv in which they occur:
      hapfrq <- colSums(as.vector(SNPdidsTable[onecomb]) * t(hapcomb > 0))
      final_hap <-
        sort(union(orig_hap,
                   which (hapfrq >= 2 & hapfrq > minfrac[2]*sum(SNPdidsTable))))
      resfinal <- single_cycle_infer(allhap, SNPdidsTable, ploidy, final_hap)
      SNPdidcolnumfinal <-
        vapply(resfinal$hclist[names(SNPdidsTable)], FUN=ncol, FUN.VALUE=0)
      #keep resfinal instead of res only if all SNPdids that had only one
      #haplotype combi still have only one (and no dids have more columns
      #than before)
      if (all(SNPdidcolnumfinal <= SNPdidcolnum)) {
        res <- resfinal
        if (FALSE) { # set to TRUE for checking this
          #print comparison:
          lesscol <- SNPdidcolnumfinal < SNPdidcolnum
          m <- matrix(NA_integer_, nrow=2, ncol=sum(lesscol))
          colnames(m) <- names(SNPdidsTable)[lesscol]
          rownames(m) <- c("before", "after")
          m[1,] <- SNPdidcolnum[lesscol]
          m[2,] <- SNPdidcolnumfinal[lesscol]
          print(m)
        }
        # perhaps now there are more SNPdids with a unique haplotype combination
        # (which is the reason for this final test). If so, these may again have
        # extra haplotypes so we could do the final step again. We will not do
        # that because these extra haplotypes are selected on a weak basis
        # (weaker than minfrac[1])
      }
    }
  }

  #finally we add some extra data to the output list:
  res$minfrac <- minfrac
  res$allhap <- allhap
  res$dosmat <- dosmat
  res$SNPdids <- SNPdids
  res$SNPdidsTable <- SNPdidsTable
  res$ploidy <- ploidy
  res
} #inferHaplotypes

single_cycle_infer <- function(allhap, SNPdidsTable, ploidy, sel_hap, progress) {
  #function used only by inferHaplotypes
  abpr <- array(NA_integer_, dim=c(length(SNPdidsTable), 2, nrow(allhap)),
                dimnames=list(names(SNPdidsTable), c("nabs", "npres"),
                              1:nrow(allhap)))
  #       abpr stands for absent-present; contains for each SNPdid the min and
  #       max dosages of each haplotype, taken over all (remaining) haplotype
  #       combinations
  hclist <- list()
  for (i in seq_along(SNPdidsTable)) {
    #   for the current SNPdid, gets dosages of all SNPs
    ahc <- allHaploComb(did=names(SNPdidsTable)[i], #SNPdosages may be missing!
                        allhap=allhap, ploidy=ploidy, progress=progress)
    #   ... and get all possible haplotype combis that produce this SNPdid
    #   (could also be passed as parameter to avoid recalculating)
    if (length(sel_hap) > 0) {
      #if there are haplotypes known to occur, select the haplotype combination
      #that has the hightes total dosage of these known haplotypes
      sel_ahc <- ahc[sel_hap,, drop=FALSE]
      sel_dos <- colSums(sel_ahc) #for each haplotype comb the total dosage of selected haplotypes
      sel_col <- sel_dos == max(sel_dos) #the combination(s) with the maximum dosage
      ahc <- ahc[, sel_col, drop=FALSE] #keep only that/those combination(s)
    }
    hclist[[length(hclist) + 1]] <- ahc
    mima <- haplofrqMinMax(haplocomb=ahc)
    abpr[i,1,] <- SNPdidsTable[i] * (mima[,2] == 0) #nr of ind with this SNPdid
    #             that cannot have the haplotypes
    abpr[i,2,] <- SNPdidsTable[i] * (mima[,1] > 0) # nr of ind with this SNPdid
    #             that must have the haplotypes
  }
  nPresent <- apply(abpr[,2,,drop=FALSE], MARGIN=3, FUN=sum)
  nAbsent <- apply(abpr[,1,,drop=FALSE], MARGIN=3, FUN=sum)
  names(hclist) <- names(SNPdidsTable)
  list(nPresent=nPresent,
       nAbsent=nAbsent,
       hclist=hclist)
} #single_cycle_infer

#'@title pad an integer (prefix with zeroes to a fixed length)
#'@description pad an integer (prefix with zeroes to a fixed length)
#'@usage padded(x, maxx=0)
#'@param x vector of non-negative integers
#'@param maxx a single integer to whose nchar all x elements will be padded;
#'if 0 (default) the largest value in x will be used
#'@return a character vector representing the values of x left-padded with 0's
#'to the length of integer maxx or of max(x)
#'@export
padded <- function(x, maxx=0) {
  formatC(x, width=nchar(max(x, maxx, na.rm=TRUE)), flag="0")
} #padded

#'@title Assign haplotype dosages based on output of inferHaplotypes
#'@description Assign haplotype dosages based on output of inferHaplotypes
#'@usage haplotypeDosages(haploblockname, ihl, partial=FALSE, dropUnused=TRUE)
#'@param haploblockname Prefix to which the (zero-padded) haplotype numbers
#'are added. Any separator (like '_') should be part of haploblockname
#'@param ihl A list as returned by inferHaplotypes
#'@param partial Whether to assign partial partial haplotype combinations,
#'default FALSE. For individuals with multiple fitting haplotype combinations,
#'some of the haplotype dosages may be identical in all these combinations. If
#'partial is TRUE, those haplotype dosages are assigned and the  dosages of the
#'other haplotypes are NA. If FALSE, all haplotype dosages are NA when
#'multiple combinations are possible.
#'@param dropUnused TRUE (default) if the returned matrix should only contain
#'rows for haplotypes that are present; if FALSE matrix contains rows for all
#'possible haplotypes
#'@details The results of inferHaplotypes, as passed in parameter ihl, determine
#'the haplotype combinations that match each SNPdid. This function translates
#'these results in a set of haplotype dosages for each individual.
#'@return A matrix with one row for each haplotype and one column
#'for each individual, with the dosages of all haplotypes per individual. If
#'allrows is FALSE (default) only the rows for the haplotypes with a dosage > 0
#'in any individual are returned, else all rows.
#'@export
haplotypeDosages <- function(haploblockname, ihl,
                             partial=FALSE, dropUnused=TRUE) {
  haplotypeNames <- paste0(haploblockname, padded(1:nrow(ihl$allhap)))
  nind <- sum(ihl$SNPdidsTable)
  #sel_hap <- which(ihl$nPresent >= ihl$threshold * nind)
  result <- matrix(NA_integer_, nrow=length(haplotypeNames),
                   ncol=length(ihl$SNPdids))
  rownames(result) <- haplotypeNames
  colnames(result) <- names(ihl$SNPdids) #individuals
  for (i in seq_along(ihl$SNPdidsTable)) {
    SNPdidind <- which(ihl$SNPdids == as.integer(names(ihl$SNPdidsTable)[i]))
    if (ncol(ihl$hclist[[i]]) == 1) {
      #note that we don't assign haplotype dosages if there is more than
      #one combination possible)
      result[, SNPdidind] <- ihl$hclist[[i]][, 1]
    } else if (partial) {
      #with multiple remaining haplotype combinations, see if there are any
      #that have the same dosage (e.g. 0) over all combinations
      mindos <- apply(ihl$hclist[[i]], MARGIN=1, FUN=min)
      maxdos <- apply(ihl$hclist[[i]], MARGIN=1, FUN=max)
      oneval <- mindos == maxdos
      result[oneval, SNPdidind] <- mindos[oneval]
    }
  }
  #omit haplotypes that don't occur:
  if (dropUnused)
    result <- result[rowSums(result, na.rm=TRUE) > 0,, drop=FALSE] # could have 0 rows
  print(paste("preNAcount =", length(ihl$SNPdids) - sum(ihl$SNPdidsTable))) #number of
  #     individuals with a missing SNPdid due to missing SNP dosages
  print(paste("postNAcount =", sum(is.na(colSums(result)))))
  result
} #haplotypeDosages

#infer dosages based on an F1 population:
# 1. get F1SNPdidsTable for entire F1, don't check yet which / how many
#    haplotype combinations can make up each SNPdid
# 2. for both parents, get all possible haplotype combinations
#    (P1combs, P2combs). To do this most efficiently, a prior round of
#    inferHaplotypes should be run)
# 3. for all combinations from P1combs x P2combs, check how many F1 indiv
#    have a SNPdid that cannot be generated
# 4. select all combinations that reject less that F1frac F1 indivs.
# 5. Each of those combinations gives a (possibly different) set of haplotypes
#    that must be present: for each of these starting sets run
#    inferHaplotypes over the entire sample set (note that the F1s can only
#    have the parental haplotypes, no need to calculate them all)
# 6. Select the best solution; criteria could be:
#    - smallest nr of haplotypes needed to explain (almost) all indiv
#    - largest nr of indiv with a unique explanation
# 7. Return the haplotype combinations for all individuals that are now
#    uniquely defined (and possibly presence/absence/NA for each indiv where
#    there are multiple combinations of haplotypes possible)

# special cases:
# - no F1 population: just do inferHaplotypes and do step 6
# - multiple F1 populations: repeat steps 1-4 for each F1 population, each time
#   using the set of haplotypes that must be present from the previous F1s
#   in the initial inferHaplotypes call.




#'@title infer haplotypes using F1 populations
#'@description infer haplotypes for all individuals, using F1 populations
#'(with parents) to infer haplotypes for non-F1 material as well
#'@usage haplotypeDosagesF1(dosmat, ploidy, haploblockname,
#'PF1, F1, minfrac=c(0.1, 0.01), F1frac=0.05, dropUnused=TRUE,
#'maxparcombs=150000)
#'@param dosmat dosage matrix, SNPs are in rows, individuals in
#'columns, each cell has a SNP dosage (0:ploidy) or NA.\cr
#'All indivs that are not in PF1 or F1 are considered
#'unrelated, i.e. we have no implementation for pedigrees (yet).
#'@param ploidy all SNP dosages should be in 0:ploidy or NA
#'@param haploblockname prefix to which the (zero-padded) haplotype numbers
#'are added. Any separator (like '_') should be part of haploblockname
#'@param PF1 a matrix with 2 columns and one row per F1 population, listing
#'the parents of each population (Note: only one sample per parent; if there
#'are more, use mergeReplicates to merge them)
#'@param F1 a list of character vectors of sample names for the F1 populations
#'in the same order as PF1
#'@param minfrac vector of two fractions, default 0.1 and 0.01. A haplotype is
#'considered to be certainly present if it occurs in at least a fraction
#'minfrac[1] of all individuals; in the final stage for the "other"
#'individuals (those that do not belong to the F1 or its parents) this fraction
#'is lowered to minfrac[2]; see also inferHaplotypes
#'@param F1frac the fraction of F1 individuals for which it is accepted
#'that they are not explained by a parental haplotype combination; default 0.05
#'(allowing for a few percent of Double Reduction)
#'@param dropUnused TRUE (default) if the returned matrix should only contain
#'rows for haplotypes that are present; if FALSE matrix contains rows for all
#'possible haplotypes
#'@param maxparcombs Parent 1 and 2 both may have multiple possible haplotype
#'combinations. For each pair of haplotype combinations (one from P1 and one
#'from P2) the expected F1 segregation must be checked against the observed.
#'This may take a long time if many such combinations need to be checked.
#'This parameter sets a limit to the number of allowed combinations; default
#'150000 takes about 45 min.
#'@details Initially the possible haplotype configurations of the parents are
#'determined. From that, all their possible gametes (assuming polysomic
#'inheritance and no Double Reduction) are calculated and all possible F1
#'haplotype configurations. Comparing this with the observed F1 marker dosages
#'the most likely parental and F1 configurations are found. The inferred
#'parental haplotypes are used for inferring the haplotypes in all "other"
#'samples. If no F1 solutions are found or if the F1 analysis is skipped
#'because of too many parental combinations, the result is based on
#'inferHaplotypes (see there).
#'@return a list with two elements:\cr
#'message ("" if all is well)\cr
#'hapdos: a matrix with one row for each
#'haplotype (dropping the unused ones if dropUnused is TRUE) and one column
#'for each individual, with the inferred dosages of all haplotypes per
#'individual.
#'If dropUnused is TRUE the matrix has 0 rows if no unique haplotype
#'combinations can be inferred for any individual
#'@export
haplotypeDosagesF1 <- function(dosmat, ploidy, haploblockname,
                               PF1, F1, minfrac=c(0.1, 0.01), F1frac=0.05,
                               dropUnused=TRUE, maxparcombs=150000) {

  indiv <- colnames(dosmat); nind <- length(indiv)
  #calculate ihl without using F1 info:
  ihl <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy,
                         minfrac=minfrac[1]) #without final inference
  nhap <- nrow(ihl$allhap)
  F1sizes <- integer(length(F1))
  for (f in seq_along(F1))
    F1sizes[f] <- sum(!is.na(ihl$SNPdids[names(ihl$SNPdids) %in% F1[[f]]]))
  largeF1s <- F1sizes > 6 * ploidy

  had <- matrix(NA_integer_, nrow=nrow(ihl$allhap), ncol=nind)
  #had is matrix of haplotype dosages
  colnames(had) <- indiv
  rownames(had) <- paste0(haploblockname, padded(1:nrow(ihl$allhap)))
  F1sDone <- !largeF1s; changes <- TRUE
  oldhap <- which(ihl$nPresent >= minfrac[1] * nind)
  while (!all(F1sDone) && changes) {
    #continue until all large F1s done or none of the remaining ones
    #can be done
    changes <- FALSE
    for (f in seq_along(F1)) if (!F1sDone[f]) {
      # check the current F1 using the current set of known haplotypes
      # if (success) {
      #   F1sDone[f] <- TRUE
      #   if (new haplotypes added) changes <- TRUE
      ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy,
                      minfrac=minfrac[1], sel_hap=oldhap)
      sof <- solveOneF1(dosmat=dosmat, ihl=ih, F1=F1[f], P=PF1[f,],
                        Phad=had[, PF1[f,]], maxparcombs=maxparcombs,
                        ploidy=ploidy, minfrac=minfrac, F1frac=F1frac)
      if (nrow(sof$parcombok) == 1) {
        pSOF <- processSOF(sof, had, oldhap, changes, F1=F1[[f]], P=PF1[f,],
                           ihl=ih)
        had <- pSOF$had; oldhap <- pSOF$oldhap; changes <- pSOF$changes
        F1sDone[f] <- TRUE
      }
    }
  }

  F1sDone[!largeF1s & F1sizes > 0] <- FALSE; changes <- TRUE
  while (!all(F1sDone) && changes) {
    #continue until all F1s (including the large F1s) done or none of the
    #remaining ones can be done; exactly as for largeF1s
    changes <- FALSE
    for (f in seq_along(F1)) if (!F1sDone[f]) {
      # check the current F1 using the current set of known haplotypes
      # if (success) {
      #   F1sDone[f] <- TRUE
      #   if (new haplotypes added) changes <- TRUE
      ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy,
                            minfrac=minfrac[1], sel_hap=oldhap)
      sof <- solveOneF1(dosmat=dosmat, ihl=ih, F1=F1[f], P=PF1[f,],
                        Phad=had[, PF1[f,]], maxparcombs=maxparcombs,
                        ploidy=ploidy, minfrac=minfrac, F1frac=F1frac)
      if (nrow(sof$parcombok) == 1) {
        pSOF <- processSOF(sof, had, oldhap, changes, F1=F1[f], P=PF1[f,],
                           ihl=ih)
        had <- pSOF$had; oldhap <- pSOF$oldhap; changes <- pSOF$changes
        F1sDone[f] <- TRUE
      }
    }
  }

  #oldhap now contains all haplotype nrs occurring in any of the
  #parents of F1's that have been solved

  #inferHaplotypes for all non-solved F1s and their parents, and panel
  rest <- setdiff(indiv, c(do.call(c, F1[F1sDone]), PF1[F1sDone,]))
  rest_ihl <- inferHaplotypes(SNPdosages=dosmat, indiv=rest, ploidy=ploidy,
                              minfrac=minfrac, sel_hap=oldhap)
  for (rSNPdid in names(rest_ihl$SNPdidsTable)) {
    rcombs <- rest_ihl$hclist[[rSNPdid]]
    if (ncol(rcombs) == 1) {
      rSNPdidind <- names(rest_ihl$SNPdids)[rest_ihl$SNPdids == rSNPdid]
      had[, colnames(had) %in% rSNPdidind] <- rcombs
    }
  }
  #for all indiv with a unique combination of haplotypes we now have
  #stored the solution (haplotype combination) in had
  if (dropUnused) {
    had <- had[rowSums(had, na.rm=TRUE) > 0, , drop=FALSE]
    #has 0 rows if no indiv has a unique solution
  }
  list(message="", hapdos=had)

  # (if rest_ihl with two minfrac values results in additional haplotypes,
  # see if these can help solve the remaining F1s? And then re-iterate?
  # For later!)

  #Done!
} #haplotypeDosagesF1

processSOF <- function(sof, had, oldhap, changes, F1, P, ihl) {
  #used only by haplotypeDosagesF1
  #sof = list returned by solveOneF1, sof$parhapcomp has 1 row
  #had = matrix of haplotype dosages
  #oldhap = vector of haplotype nrs known to be present
  #changes indicates if the set of oldhap was modified in the current iteration

  #assign hads for P and F1:
  parhapcomb <- cbind(sof$P1combs[, sof$parcombok[1,1]],
                      sof$P2combs[, sof$parcombok[1,2]])
  had[, colnames(had) == P[1]] <- parhapcomb[,1]
  had[, colnames(had) == P[2]] <- parhapcomb[,2]
  F1combs <- getF1combs(parhapcomb)
  F1SNPdos <- haplocomb2SNPdosages(F1combs, allhap=ihl$allhap)
  F1SNPdids <- colnames(F1SNPdos) #SNPdids may not be unique,
  #                  several haplotype combs may give the same SNP dosages
  for (f1SNPdid in unique(F1SNPdids)) {
    if (sum(F1SNPdids == f1SNPdid) == 1) {
      #f1SNPdid unique, only one haplotype comb in F1 matches f1SNPdid
      SNPdidcomb <- F1combs[, F1SNPdids == f1SNPdid, drop=FALSE]
      SNPdidind <- names(ihl$SNPdids[ihl$SNPdids == f1SNPdid]) #all indiv with this f1SNPdid
      SNPdidind <- intersect(F1, SNPdidind)
      had[, colnames(had) %in% SNPdidind] <- SNPdidcomb
    }
  }
  #check if parhapcomb contains new haplotypes:
  parhap <- which(rowSums(parhapcomb) > 0)
  m <- match(parhap, oldhap)
  if (anyNA(m)) {
    #new haplotypes found in parents that earlier were not marked as present
    changes <- changes || anyNA(m)
    oldhap <- sort(union(oldhap, parhap))
  }
  list(had=had, oldhap=oldhap, changes=changes)
} #processSOF


#'@title find the optimal combination of parental haplotype combinations
#'@description For one F1 population and one haploblock, find the optimal
#'combination of parental haplotype combinations
#'@usage solveOneF1(dosmat, ihl, F1, P, Phad, maxparcombs, ploidy,
#'minfrac, F1frac)
#'@param dosmat dosmat with only the SNPs in the current haploblock, and with
#'only one column for each F1 parent, containing its consensus dosages;
#'otherwise all individuals of interest within and outside the F1 are retained
#'@param ihl a list returned by inferHaplotypes
#'@param F1 character vector with sample names of the current F1
#'@param P vector with 2 sample names, one for parent1 and one for parent2.
#'The corresponding columns in dosmat should have pre-calculated consensus
#'SNP genotypes for both parents, and orig_ihl should be based on that
#'dosmat
#'@param Phad matrix with 2 columns, with the haplotype dosages assigned
#'to the parents. Usually both contain only NA_integer_s, but if they are
#'assigned we consider only that had
#'@param maxparcombs Parent 1 and 2 both may have multiple possible haplotype
#'combinations. For each pair of haplotype combinations (one from P1 and one
#'from P2) the expected F1 segregation must be checked against the observed.
#'This may take a long time if many such combinations need to be checked.
#'This parameter sets a limit to the number of allowed combinations; default
#'150000 takes about 45 min.
#'@param ploidy all SNP dosages should be in 0:ploidy or NA
#'@param minfrac vector of two fractions, default 0.1 and 0.01. A haplotype is
#'considered to be certainly present if it occurs in at least a fraction
#'minfrac[1] of all individuals; in the final stage for the "other"
#'individuals (those that do not belong to the F1 or its parents) this fraction
#'is lowered to minfrac[2]; see also inferHaplotypes
#'@param F1frac the fraction of F1 individuals for which it is accepted
#'that they are not explained by a parental haplotype combination; default 0.05
#'(allowing for a few percent of Double Reduction)
#'@return a list with a message, two matrices P1combs and P2combs with the
#'haplotype combinations that were considered for each parent
#'and a matrix parcombok with 0 or 1 rows and two columns: 0 rows if no
#'solution was found, 1 row indexing the columns of P1combs and P2combs
#'if a solution was found.
solveOneF1 <- function(dosmat, ihl, F1, P, Phad, maxparcombs, ploidy, minfrac,
                       F1frac) {
  #TODO: If no solution found and Phad not NA's for both parents,
  #it may be that the had(s) assigned based on a previous F1 is not correct.
  #In that case ignore the Phad and if that succeeds, go back to previous F1
  #with new had for that parent

  calcParcombok <- function(P1combs, P2combs, oldP1combs, oldP2combs, ihl,
                            F1frac) {
    # Function within solveOneF1
    # Calculates the combinations of parental haplotype combinations
    # that can explain (almost) all F1 individuals
    # P1combs, P2combs: matrices with haplotype combinations for P1 and P2
    # oldP1combs, oldP2combs: matrices with parental haplotypes combs that
    # were already calculated and  whose P1-P2 combinations were no solutions
    # (may have 0 columns if none)
    #TODO: parallellize; see also allHaploComb
    P1matchold <- match(as.data.frame(P1combs), as.data.frame(oldP1combs))
    P2matchold <- match(as.data.frame(P2combs), as.data.frame(oldP2combs))
    # NOTE that match matches vector elements, also when the vector is a
    #      list (or data.frame)
    if (!anyNA(P1matchold) && !anyNA(P2matchold)) {
      #no new parental combinations, so no solutions
      parcombok <- matrix(integer(0), ncol=2)
    } else {
      missedF1s <- matrix(NA_integer_, nrow=ncol(P1combs), ncol=ncol(P2combs))
      #missedF1s is the number of F1's with a set of SNP dosages
      #          that cannot be produced with each parental combination
      for (pc1 in seq_len(ncol(P1combs))) for (pc2 in seq_len(ncol(P2combs))) {
        if (is.na(P1matchold[pc1]) || is.na(P2matchold[pc2])) {
          #this is a new combination, check:
          F1combs <- getF1combs(cbind(P1combs[, pc1], P2combs[, pc2]))
          F1SNPdos <- haplocomb2SNPdosages(F1combs, allhap=ihl$allhap)
          F1combSNPdids <- colnames(F1SNPdos)
          missedSNPdids <- setdiff(names(F1SNPdidsTable), F1combSNPdids)
          missedF1s[pc1, pc2] <-
            sum(F1SNPdidsTable[names(F1SNPdidsTable) %in% missedSNPdids])
        }
        else {
          #this is an old combination, don't check but have it discarded:
          missedF1s[pc1, pc2] <- length(F1)
        }
      }
      parcombok <- missedF1s <= F1frac * sum(F1SNPdidsTable)
      #remove for each parent the haplotype combinations that always fail:
      pcok <- rowSums(parcombok) > 0
      P1combs <- P1combs[, pcok, drop=FALSE]
      parcombok <- parcombok[pcok,, drop=FALSE]
      pcok <- colSums(parcombok) > 0
      P2combs <- P2combs[, pcok, drop=FALSE]
      parcombok <- parcombok[, pcok, drop=FALSE]
      parcombok <- which(parcombok, arr.ind=TRUE) #now a matrix with one row for
      # each valid combination of P1combs and P2combs, and 2 columns for the
      # indices to P1combs and P2combs
      colnames(parcombok) <- c("P1comb", "P2comb")
    }
    list(P1combs=P1combs, P2combs=P2combs, parcombok=parcombok)
  } #calcParcombok within solveOneF1

  nhap <- nrow(ihl$allhap)
  PhadNA <- is.na(Phad[1,]) #all or none of the values in a Phad column are NA

  F1SNPdidsTable <- table(ihl$SNPdids[names(ihl$SNPdids) %in% F1])
  if (PhadNA[1]) {
    P1combs <- getParentalCombs(parsamp=P[1], ihl=ihl, useIH=TRUE)
  } else P1combs <- Phad[, 1]
  if (PhadNA[2]) {
    P2combs <- getParentalCombs(parsamp=P[2], ihl=ihl, useIH=TRUE)
  } else P2combs <- Phad[, 2]
  message <- ""
  lst <- NULL
  firstcombs <- ncol(P1combs) * ncol(P2combs)
  if (firstcombs == 0 || firstcombs > maxparcombs) {
    # too many potential combinations (it takes about 45 min to check
    # 150000 combinations)
    message <- paste("more than", maxparcombs, "parental combinations; skipped")
    parcombok <- matrix(integer(0), ncol=2)
  } else {
    origP1combs <- P1combs; origP2combs <- P2combs
    lst <- calcParcombok(P1combs=origP1combs, P2combs=origP2combs,
                         oldP1combs=matrix(integer(0), nrow=nrow(origP1combs)),
                         oldP2combs=matrix(integer(0), nrow=nrow(origP2combs)),
                         ihl=ihl, F1frac=F1frac)
    # note that in this first try the oldPcombs have 0 columns as there has not
    # yet been an earlier calculation
    if (nrow(lst$parcombok) == 0) {
      #No parental combinations fit.
      #1. See how many parental combinations are possible without first
      #   inferHaplotypes (i.e. with allHaploComb).
      #   If p1*p2 < maxparcombs we try them all.
      if (PhadNA[1]) {
        nwP1combs <- getParentalCombs(parsamp=P[1], ihl=ihl, useIH=FALSE)
      } else nwP1combs <- Phad[, 1]
      if (PhadNA[2]) {
        nwP2combs <- getParentalCombs(parsamp=P[2], ihl=ihl, useIH=FALSE)
      } else nwP2combs <- Phad[, 2]
      if ((ncol(nwP1combs) * ncol(nwP2combs)) - firstcombs > maxparcombs) {
        message <- paste("more than", maxparcombs,
                         "extra parental combinations; skipped")
      } else {
        lst <- calcParcombok(P1combs=nwP1combs, P2combs=nwP2combs,
                             oldP1combs=origP1combs, oldP2combs=origP2combs,
                             ihl=ihl, F1frac=F1frac)
        if (nrow(lst$parcombok) == 0) {
          #2. If 1. does not find a solution due to wrong parental SNP data or
          #   because it was skipped because of too many combinations,
          #   just return the inferHaplotypes + haplotypeDosages result,
          #   with a message.
          message <- "no valid F1 solutions found"
        }
      }
    }
  }
  if (message != "") {
    if (is.null(lst))
      lst <- list(P1combs=P1combs, P2combs=P2combs,
                  parcombok=matrix(integer(0), ncol=2))
    return(c(list(message=message), lst))
  }

  # now we have one or more acceptable parental combinations
  P1combs <- lst$P1combs
  P2combs <- lst$P2combs
  parcombok <- lst$parcombok
  #parcombok indicates which combinations of these match (almost) all F1's.
  #Now, any of these combinations could be the correct one. We check all of them
  #and use the best. So for each combination:
  # (1) get the total set of parental haplotypes: these are all known to be
  #     present, given the selected parental combination
  # (2) inferHaplotypes for all samples, and get quality parameters for the
  #     parental combination:
  #     - hAbsent: the number of haplotypes that must be absent in (almost)
  #       all individuals (as large as possible)
  #     - indSingle: the number of individuals that have one single possible
  #       combination of haplotypes (as large as possible)
  #     - indExtra: the number of individuals that need extra haplotypes
  #       (that are not certainly present); leave for now
  #     - minhap: the minimum number of haplotypes needed to explain (almost)
  #       all individuals (as small as possible); harder, leave for now
  indiv <- colnames(dosmat)
  parcombstats <- data.frame(hAbsent = rep(NA_integer_, nrow(parcombok)),
                             indSingle = rep(NA_integer_, nrow(parcombok)),
                             indExtra = rep(NA_integer_, nrow(parcombok)),
                             minhap = rep(NA_integer_, nrow(parcombok)))
  ihlist <- list()
  for (pco in seq_len(nrow(parcombok))) {
    parcomb <- cbind(P1combs[,parcombok[pco, 1]],
                     P2combs[,parcombok[pco, 2]])
    parhaplo <- which(rowSums(parcomb) > 0)
    ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy,
                          minfrac=minfrac[1], sel_hap=parhaplo)
    ihlist[[pco ]] <- ih
    parcombstats$hAbsent[pco] <- sum(ih$nAbsent > 0.9 * length(indiv))
    #parcombstats$indSingle (nr of ind with a unique haplotype combi):
    SNPdidsTable <- table(ih$SNPdids)
    #get the number of haplotype combis for each SNPdid (SNPdids become the names):
    SNPdidcolnum <- vapply(ih$hclist[names(SNPdidsTable)], FUN=ncol, FUN.VALUE=0)
    #select the SNPdids for which only 1 haplotype combination is possible,
    #and count the nr of indiv in which they occur:
    onecomb <- SNPdidcolnum == 1
    parcombstats$indSingle[pco] <- sum(SNPdidsTable[onecomb])
    #Could this be improved, e.g. by minimizing the number of haplotypes
    #beyond the ones known to be present in the F1, over all other
    #individuals we might get more unique solutions
  }
  #for now: simple selection of best solution = the parcomb for which
  #most indiv have a unique haplotype combi:
  best <- which(parcombstats$indSingle == max(parcombstats$indSingle))
  if (length(best) > 1) {
    #if more than one parcomb selected, take the one that for which the most
    #haplotypes are absent in > 90% of all indiv (loosely: that needs the
    #fewest haplotypes):
    best2 <- which.max(parcombstats$hAbsent[best])
    best <- best[best2]
  }
  return(list(message="", P1combs=lst[[1]], P2combs=lst[[2]],
              parcombok=parcombok[best,, drop=FALSE]))
  #note: the calling routine can get the set of parental haplotypes by:
  #if(nrow(result$parcombok) != 1) integer(0) else
  #  which(rowSums(cbind(result$P1combs[,result$parcombok[1, 1]],
  #                      result$P2combs[,result$parcombok[1, 2]])) > 0)
} #solveOneF1

#TODO:
# - impute missing SNP dosages (1) in F1 (2) in other?
# - use multiple F1 populations + other
# - partial (as for haplotypeDosages)
# - Double Reduction: If an F1 does not fit the parents it is now assigned
#   NA for all haplotype dosages; if it would fit DR (and that parameter is
#   TRUE), check if that can be the case before assigning NAs)
# - Does the F1 have an advantage for scoring the other samples?
#   Compare this for all contigs

#'@title merge replicate samples in dosage matrix
#'@description merge replicate samples in dosage matrix
#'@usage mergeReplicates(dosmat, replist, solveConflicts=TRUE)
#'@param dosmat a dosage matrix with SNPs in rows and individuals in columns.
#'row names are SNP names, column names are individual names.
#'@param replist a list of character vectors, each of which has the sample names
#'of a set of replicates
#'@param solveConflicts if TRUE (default) and there are conflicting dosage
#'assignments between replicates for the same SNP, the one with highest
#'frequency is used, provided the total freq of other dosages  <= 1 OR
#'<= 10% of the frequency of the maximum dosage. If
#'solveConflicts is FALSE and there are conflicting dosages, the consensus for
#'that SNP will be NA
#'@return a version of dosmat in which only one column of each set of replicates
#'is retained; this column (the first in its set as specified in replist)
#'now has the consensus scores over all replicates
mergeReplicates <- function(dosmat, replist, solveConflicts=TRUE) {
  if (!is.list(replist)) replist <- list(replist)
  for (p in seq_along(replist)) {
    consdos <- getConsensusSNPdosages(dosmat=dosmat,
                                      indiv=replist[[p]],
                                      solveConflicts=solveConflicts)
    dosmat[, replist[[p]][1]] <- consdos #1st sample gets the consensus
    dosmat <- dosmat[, -replist[[p]][-1]] #delete cols of all other samples
  }
  dosmat
} #mergeReplicates

#'@title get consensus SNP dosages from one or more samples
#'@description get consensus SNP dosages from one or more samples
#'@usage getConsensusSNPdosages(dosmat, indiv, solveConflicts=TRUE)
#'@param dosmat a dosage matrix with SNPs in rows and individuals in columns.
#'row names are SNP names, column names are individual names.
#'@param indiv character vector with the names of the samples from which to
#'obtain consensus scores
#'@param solveConflicts if TRUE (default) and there are conflicting dosage
#'assignments between the samples for the same SNP, the one with highest
#'frequency is used, provided the total freq of other dosages  <= 1 OR
#'<= 10% of the frequency of the maximum dosage. If
#'solveConflicts is FALSE and there are conflicting dosages, the consensus for
#'that SNP will be NA
#'@return a vector with one consensus dosage for each row of dosmat, and the
#'row names of dosmat as names
getConsensusSNPdosages <- function(dosmat, indiv, solveConflicts=TRUE) {
  if (is.vector(dosmat)) dosmat <- matrix(dosmat, ncol=1)
  if (!all(indiv %in% colnames(dosmat)))
    stop("not all indiv in dosmat")
  dosmat <- dosmat[,colnames(dosmat) %in% indiv, drop=FALSE]
  suppressWarnings({ #suppress warnings about all dosages NA
    mindos <- apply(dosmat, MARGIN=1, FUN=min, na.rm=TRUE)
    maxdos <- apply(dosmat, MARGIN=1, FUN=max, na.rm=TRUE)
  })
  if (solveConflicts) {
    for (i in which(mindos != maxdos)) {
      tb <- table(dosmat[i,]) # length 0 if all NA
      suppressWarnings({ maxtb <- which(tb == max(tb)) })
      if (length(maxtb) == 1) {
        othercount <- sum (!is.na(dosmat[i,])) - tb[maxtb]
        if (othercount > 1 && othercount > 0.1 * tb[maxtb]) {
          mindos[i] <- NA
        } else mindos[i] <- as.integer(names(tb)[maxtb])
      } else mindos[i] <- NA
    }
  } else {
    #not solve conflicts, set any SNP with conflicting dosages to NA:
    mindos[mindos != maxdos] <- NA #includes rows with all NA because -Inf != Inf
  }
  names(mindos) <- row.names(dosmat)
  mindos
} #getConsensusSNPdosages

#'@title check all parental sample sets
#'@description check that for P1 and P2 the correct number of sample sets are
#'present, that these contain no duplicates and that that they overlap
#'completely or not at all, and that if they overlap, they have the same sample
#'as first one
#'@usage checkAllParentSets(P, npop, indiv)
#'@param P a list of 2 elements, each of which is a list of character vectors
#'containing the sample names for parents 1 and parents 2
#'@param npop the number of F1 populations
#'@param indiv a character vector with all sample names, the parental samples
#'should all be in indiv
#'@return an error message, "" if all is well
checkAllParentSets <- function(P, npop, indiv) {
  if (length(P) !=2 || length(P[[1]] != npop) || length(P[[2]]) != npop)
    return("for each F1 population there should be one set of samples for each parent")
  # check that no parental set has duplicate samples and that all samples are in indiv:
  P <- c(P[[1]], P[[2]])
  for (i in seq_len(2*npop)) {
    if (length(P[[i]]) != length(unique(P[[i]])))
      return("some parental sample sets contain duplicate samples")
    if (!all(P[[i]] %in% indiv))
      stop("Some parental samples are missing from indiv")
  }
  #check that if two parents are the same, all their samples are the same:
  for (i in 1:(length(P)-1)) for (j in (i+1):length(P)) {
    if (length(intersect(P[[i]], P[[j]])) > 0) {
      if (!setequal(P[[i]], P[[j]]))
        return("some parental sample sets overlap partially")
      if (P[[i]][1] != P[[j]][1])
        return("parental sample sets for the same parents must start with the same sample")
    }
  }
  return("")
} #checkAllParentSets

#'@title get the most likely haplotype combinations for a (parental) individual
#'@description get the most likely haplotype combinations for a (parental)
#'individual based on all its samples
#'@usage getParentalCombs(parsamp, ihl, useIH)
#'@param parsamp character vector with all sample names for the individual
#'@param ihl a list as returned by inferHaplotypes, of which SNPdid, hclist and
#'allhap are used
#'@param useIH TRUE if only the inferred haplotype combinations must be
#'returned, FALSE for all possible haplotype combinations
#'@return a matrix with one row for each haplotype and one column for each
#'of the most likely combinations of haplotypes, with each element being
#'the haplotype dosage in that combination
getParentalCombs <- function(parsamp, ihl, useIH) {
  #TODO: see if we can fill in missing SNP dosages by combining the samples

  #which SNPdids occur over all samples in parsamp?
  parSNPdidsTable <- table(ihl$SNPdids[names(ihl$SNPdids) %in% parsamp])
  if (length(parSNPdidsTable) == 0) {
    #all parental SNPdids were NA, return a 0-column matrix:
    return(matrix(0, nrow=nrow(ihl$hclist[[1]]), ncol=0))
  }
  #select only the most frequent SNPdid(s) among the samples of this individual:
  parSNPdidsTable <- parSNPdidsTable[parSNPdidsTable == max(parSNPdidsTable)]
  if (useIH) {
    do.call(cbind, ihl$hclist[names(parSNPdidsTable)])
  } else {
    result <- matrix(0, nrow=nrow(ihl$hclist[[1]]), ncol=0)
    for (d in names(parSNPdidsTable))
      result <- cbind(result,
                      allHaploComb(did=d, allhap=ihl$allhap, ploidy=ihl$ploidy))
    result
  }
} #getParentalCombs

#'@title get all F1 haplotype combinations expected from two parental haplotype
#'combinations
#'@description get all F1 haplotype combinations expected from two parental
#'haplotype combinations
#'@usage getF1combs(parcomb)
#'@param parcomb matrix with one column for each parent, one row per
#'haplotype, giving the dosages of each haplotype for each parent
#'@return a matrix with the same rows as parcomb, and one column for each
#'haplotype combination that can be generated from these two parents,
#'assuming polysomic inheritance with no double reduction.\cr
#'The colnames are NULL: haplotype combinations are not named.
getF1combs <- function(parcomb) {
  ploidy <- sum(parcomb[,1])
  gamP1 <- getAllGametes(parcomb[, 1])
  gamP2 <- getAllGametes(parcomb[, 2])
  F1comb <- matrix(NA_integer_, ncol=ncol(gamP1) * ncol(gamP2),
                   nrow=nrow(parcomb))
  colnames(F1comb) <- 1:ncol(F1comb)
  for (g1 in seq_len(ncol(gamP1))) for (g2 in seq_len(ncol(gamP2))) {
    F1col <- (g1-1) * ncol(gamP2) + g2
    F1comb[, F1col] <-
      tabulate(c(gamP1[,g1], gamP2[,g2]), nbins=nrow(parcomb))
  }
  unique(F1comb, MARGIN=2)
} #getF1combs

getAllGametes <- function (haploDos) {
  #haploDos: vector of the haplotype dosages of a parent, one element
  #            per haplotype
  #result: matrix with (ploidy/2) rows and one column per unique gamete,
  #        each element is the number of a haplotype)
  ploidy <- sum(haploDos)
  # get from haploDos a vector of length ploidy with all haplotypes present:
  allset <- unlist( mapply(rep, 1:length(haploDos), haploDos))
  #get the matrix with unique gametes:
  unique(combn(allset, ploidy/2), MARGIN=2)
} #getAllGametes

#'@title infer haplotypes for a range of haploblocks
#'@description infer haplotypes for a range of haploblocks, for all individuals,
#'using F1 population(s) (with parents) to infer haplotypes for non-F1 material
#'as well
#'@usage solveAllHaploblocks(SNPdosages, indiv, ploidy, haploblocks,
#'P1, P2, F1, replicates=NULL, minfrac=c(0.1, 0.01), F1frac=0.05,
#'dropUnused=TRUE, maxparcombs=150000)
#'@param SNPdosages matrix or data.frame. SNPs are in rows, individuals in
#'columns, each cell has a SNP dosage. Names of individuals are the column
#'names, SNP names are the row names or (if a data.frame) in a column named
#'MarkerNames. All SNP dosages must be in 0:ploidy or NA.
#'@param indiv NULL (default) or a character vector with names of all individuals
#'to be considered. If NULL, all columns of SNPdosages are selected.\cr
#'All indivs that are not in any of the P1, P2 or F1 vectors are considered
#'unrelated, i.e. we have no implementation for pedigrees (yet).
#'@param ploidy all SNP dosages should be in 0:ploidy or NA
#'@param haploblocks a list of character vectors. The names are the names of the
#'haploblocks, the character vectors have the names if the SNP markers in each
#'haploblock. The haploblock names are used as prefixes to which the
#'(zero-padded) haplotype numbers are added. Any separator (like '_') should be
#'part of haploblocknames
#'@param P1 a list of character vectors with the names of the samples for
#'parent1 of each F1, for which consensus SNP dosages will be calculated before
#'haplotype assignment. In the case of a single F1, P1 may also be a character
#'vector; if there are no F1 populations, set to NULL\cr
#'Note that if the same parent is used in multiple F1s or twice in the same F1
#'(a selfing), exactly the same samples should be listed for both, and the first
#'sample should be the same. The haplotype combination is only returned for
#'the first sample of each set.
#'@param P2 as P1, for parent 2
#'@param F1 as P1, for the F1 populations. In the F1s each individual is assumed
#'to be represented by one sample only
#'@param replicates a list of character vectors with sets of replicated
#'samples for other individuals (non-parents) that should be merged. If F1
#'individuals are represented by multiple samples they can be merged by
#'listeng them here, but then only the first of these samples should be
#'listed as an F1 individual. The haplotype combination is only returned for
#'the first sample of each set.
#'@param minfrac vector of two fractions, default 0.1 and 0.01. A haplotype is
#'considered to be certainly present if it occurs in at least a fraction
#'minfrac[1] of all individuals; in the final stage for the "other"
#'individuals (those that do not belong to the F1 or its parents) this fraction
#'is lowered to minfrac[2]; see also inferHaplotypes
#'@param F1frac the fraction of F1 individuals for which it is accepted
#'that they are not explained by a parental haplotype combination; default 0.05
#'(allowing for a few percent of Double Reduction)
#'@param dropUnused TRUE (default) if the returned matrix should only contain
#'rows for haplotypes that are present; if FALSE matrix contains rows for all
#'possible haplotypes
#'@param maxparcombs Parent 1 and 2 both may have multiple possible haplotype
#'combinations. For each pair of haplotype combinations (one from P1 and one
#'from P2) the expected F1 segregation must be checked against the observed.
#'This may take a long time if many such combinations need to be checked.
#'This parameter sets a limit to the number of allowed combinations; default
#'150000 takes about 45 min.
#'@details Initially the possible haplotype configurations of the parents are
#'determined. From that, all their possible gametes (assuming polysomic
#'inheritance and no Double Reduction) are calculated and all possible F1
#'haplotype configurations. Comparing this with the observed F1 marker dosages
#'the most likely parental and F1 configurations are found. The inferred
#'parental haplotypes are used for inferring the haplotypes in all "other"
#'samples. If no F1 solutions are found or if the F1 analysis is skipped
#'because of too many parental combinations, the result is based on
#'inferHaplotypes (see there).\cr
#'The merging of samples is done using mergeReplicates and the haplotypes
#'for each haploblock are assigned by inferHaplotypesF1
#'@return a list with for each haploblock a list with two elements:\cr
#'message ("" if all is well)\cr
#'hapdos: a matrix with one row for each
#'haplotype (dropping the unused ones if dropUnused is TRUE) and one column
#'for each individual, with the inferred dosages of all haplotypes per
#'individual.
#'If dropUnused is TRUE the matrix has 0 rows if no unique haplotype
#'combinations can be inferred for any individual
#'@export
solveAllHaploblocks <- function(SNPdosages, indiv, ploidy, haploblocks,
                                P1, P2, F1, replicates=NULL,
                                minfrac=c(0.1, 0.01), F1frac=0.05,
                                dropUnused=TRUE, maxparcombs=150000) {
  dosmat <- checkSNPdosages(SNPdosages, indiv=indiv, ploidy=ploidy)
  haploblocks <- haploblocks[sapply(haploblocks, length) > 0]
  if (!all(do.call(c, haploblocks) %in% rownames(dosmat)))
    stop("Not all markers in haploblocks occur in SNPdosages")
  solveConflicts <- TRUE #make this a parameter?
  if (!is.null(replicates))
    dosmat <- mergeReplicates(dosmat, replist=replicates, solveConflicts)
  nopops <-  all(c(is.null(P1), is.null(P2), is.null(F1)))
  if (!nopops) {
    if (!is.list(F1)) P1 <- list(F1)
    if (length(P1) != length(P2) || length(P1) != length(F1))
      stop("P1, P2 and F1 do not have the same length")
    if (!all(do.call(c, F1) %in% colnames(dosmat)))
      stop("Some F1 samples not present")

    P <- list(P1, P2) # a list of lists
    for (p in 1:2) if (!is.list(P[[p]])) P[[p]] <- list(P[[p]])
    msg <- checkAllParentSets(P, npop=length(F1), indiv=colnames(dosmat))
    if (msg != "") stop(msg)
    for (p in 1:2) { # p is parent (1 or 2)
      for (pop in seq_along(F1)) {
        #it can be that this same parent has already been checked for an earlier
        #population and then only one sample remains, with the consensus dosage:
        P[[p]][[pop]] <- intersect(P[[p]][[pop]], colnames(dosmat))
        if (length(P[[p]][[pop]]) > 1) {
          dosmat <- mergeReplicates(dosmat, replist=P[[p]][[pop]], solveConflicts)
        }
        PF1[pop, p] <- P[[p]][[pop]][1]
      }
    } # for p in 1:2
    #order the populations by decreasing F1 size:
    F1sizes <- sapply(F1, length)
    o <- order(F1sizes, decreasing=TRUE)
    F1 <- F1[o]
    PF1 <- PF1[o,]
  } #!nopops
  result <- list(); hbrows <- rep(NA_integer_, length(haploblocks))
  for (hb in seq_along(haploblocks)) {
    if (nopops) {
      ihl <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy,
                             minfrac=minfrac)
      result[[hb]]$message <- ihl$message
      result[[hb]]$had <- haplotypeDosages(haploblockname=names(haploblocks)[hb],
                                           ihl=ihl)
    } else {
      result[[hb]] <-
        haplotypeDosagesF1(dosmat[match(haploblocks[[hb]], rownames(dosmat)),],
                           ploidy=ploidy,
                           haploblockname=names(haploblocks)[hb],
                           PF1=PF1, F1=F1, minfrac=minfrac, F1frac=F1frac,
                           dropUnused=dropUnused, maxparcombs=maxparcombs)
        hbrows <- ifelse(is.matrix(result[[hb]]), nrow(result[[hb]]), 1)
    }
  }
} #solveAllHaploblocks
