From 73160ccf13160493b37f02192e312d23ae1e03ad Mon Sep 17 00:00:00 2001
From: Roeland Voorrips <roeland.voorrips@wur.nl>
Date: Fri, 21 Apr 2017 17:14:47 +0200
Subject: [PATCH] new untested haplotypeDosagesF1 for multiple F1s. Issues with
 ahc(complete)list in check package

---
 R/PolyHaplotyper.R | 469 ++++++++++++++++++++++++++++++---------------
 1 file changed, 311 insertions(+), 158 deletions(-)

diff --git a/R/PolyHaplotyper.R b/R/PolyHaplotyper.R
index cf9eede..8479f50 100644
--- a/R/PolyHaplotyper.R
+++ b/R/PolyHaplotyper.R
@@ -70,7 +70,7 @@ utils::globalVariables(names=c("ahcploidy", "ahclist", "ahcunsaved"),
 #'@usage allHaplotypes(SNPnames)
 #'@param SNPnames the names of the SNPs (or other bi-allelic markers) in the
 #'haploblock (contig)
-#'@return a matrix with SNPs in columns and all possible (2^n) haplotypes
+#'@return a matrix with SNPs in columns and all possible (2^nSNP) haplotypes
 #'in rows (1: haplotype contains the dosage-counted SNP allele, 0: haplotype
 #'contains the other SNP allele). The colnames are the SNPnames.
 allHaplotypes <- function(SNPnames) {
@@ -79,13 +79,14 @@ allHaplotypes <- function(SNPnames) {
     lst[[s]] <- 0:1
   }
   eg <- as.matrix(expand.grid(lst, stringsAsFactors=FALSE))
-  eg <- eg[,ncol(eg):1]
+  eg <- eg[, ncol(eg):1, drop=FALSE]
   colnames(eg) <- SNPnames
   eg
 } #allHaplotypes
 
 loadAllHaploCombList <- function(ploidy) {
-  # Load ahclist into GlobalEnv if needed.
+  # Function is called only by allHaploComb.
+  # Load ahclist and ahccompletelist into GlobalEnv if needed.
   # Checks variable ahcploidy in GlobalEnv. If that doesn't exist or
   # is not equal to ploidy, attempts to load ahclist into the GlobalEnv
   # from a file 'ahclist_nx.RData' in the current working directory (where n is
@@ -94,23 +95,37 @@ loadAllHaploCombList <- function(ploidy) {
   # Return value is TRUE if the ahclist was already available or is loaded
   # succesfully, else FALSE.
   listloaded <- FALSE
-  if (exists("ahcploidy", envir=.GlobalEnv, inherits==FALSE) &&
-    exists("ahclist", envir=.GlobalEnv, inherits==FALSE)) {
-    listloaded <- ahcploidy == ploidy
-    if (!listloaded && exists("ahcunsaved", envir=.GlobalEnv, inherits==FALSE)
-        && ahcunsaved) {
+  if (!exists("ahcinfo", envir=.GlobalEnv, inherits==FALSE)) {
+    newinfo <- list()
+    newinfo$ploidy <- 0 #to ensure loading a new ahclist
+    newinfo$unsaved <- FALSE
+    newinfo$complete_nSNP <- 0
+    assign("ahcinfo", value=newinfo, envir=.GlobalEnv, inherits=FALSE)
+  }
+  if (ahcinfo$ploidy != ploidy) {
+    #save any unsaved ahclist:
+    if (ahcinfo$unsaved) {
       #the currently loaded ahclist is for a different ploidy and unsaved;
       #we save it before loading the ahclist for the current ploidy:
-      save(ahclist, file=paste0("ahclist_", ahcploidy,"x.RData"))
+      save(ahclist, file=paste0("ahclist_", ahcinfo$ploidy,"x.RData"))
     }
+    #load the ahccompletelist:
+    x <- tryCatch(load(paste0("ahccompletelist_", ploidy,"x.RData"),
+                       envir=.GlobalEnv),
+                  error=function(e) "error")
+    if (identical(x, "error")) {
+      ahcinfo$complete_nSNP <- 0
+    } else ahcinfo$complete_nSNP <- length(ahccompletelist)
+    #load the ahclist:
+    x <- tryCatch(load(paste0("ahclist_", ploidy,"x.RData"),
+                       envir=.GlobalEnv),
+                  error=function(e) "error")
+    if (identical(x, "error"))
+      assign("ahclist", value=list(), envir=.GlobalEnv, inherits=FALSE)
+    #update ahcinfo:
+    ahcinfo$ploidy <- ploidy
+    ahcinfo$unsaved <- FALSE
   }
-  if (!listloaded && file.exists(paste0("ahclist_", ploidy, "x.RData"))) {
-    load(paste0("ahclist_", ploidy,"x.RData"), envir=.GlobalEnv)
-    assign("ahcploidy", value=ploidy, envir=.GlobalEnv, inherits=FALSE)
-    assign("ahcunsaved", value=FALSE, envir=.GlobalEnv, inherits=FALSE)
-    listloaded <- TRUE
-  }
-  listloaded
 } #loadAllHaploCombList
 
 #'@title get all haplotype combinations that result in the given SNP dosages
@@ -136,8 +151,13 @@ loadAllHaploCombList <- function(ploidy) {
 #'@param progress whether to print a message when starting calculations
 #'for a did; default FALSE
 #'@details Each column of the return value represents one combination of
-#'haplotypes that yieds the observed SNP dosages. If any of the SNP dosages are
-#'NA or not in 0:ploidy, a matrix with 0 columns is returned
+#'haplotypes that yields the observed SNP dosages. If any of the SNP dosages are
+#'NA or not in 0:ploidy, a matrix with 0 columns is returned.\cr
+#'This function makes use of precalculated lists that are read from files
+#'in the current working directory: ahccompletelist_nx.RData and
+#'ahclist_nx.RData (where n is the ploidy). These lists ahccompletelist and
+#'ahclist are stored in the GlobalEnv, as well as another list ahcinfo. Liust
+#'ahcinfo may be extended with newly calculated results.
 #'@return an integer matrix with one row per haplotype (corresponding to allhap)
 #'and one column per combination of haplotypes, with the number of copies
 #'of each haplotype in each combination (the dosages of the haplotypes)
@@ -161,25 +181,27 @@ allHaploComb <- function(SNPdosages=NULL, did=NULL, allhap, ploidy,
     } else did <- SNPdidfun(SNPdosages, ploidy=ploidy)
   }
 
-  #we use ahclist in .GlobalEnv:
-  listloaded <- loadAllHaploCombList(ploidy)
-  didix <- integer(0)
+  loadAllHaploCombList(ploidy) #does nothing if list already in GlobalEnv
+  # first check if this nSNP is in ahccompletelist:
+  if (ahcinfo$complete_nSNP >= nSNP) {
+    # in the ahccompletelist, for each nSNP the entire list is precalculated
+    # by completeAllHaploComb, and the elements are directly indexed by the did:
+    return(ahccompletelist[[nSNP]][[did]])
+  }
+  # else look it up in ahclist, and if needed calculate it:
   if (listloaded && length(ahclist) >= nSNP)
     didix <- which(names(ahclist[[nSNP]]) == did)
   if (length(didix) == 1) {
     #the haplotype combinations for this SNPdid were already calculated earlier:
     return(ahclist[[nSNP]][[didix]])
   }
-
-  #now ahclist is not loaded because the file did not exist yet,
-  #or it is loaded but the required element is not yet computed.
-  #we create or copy the list to a local tmplist, modify that,
+  # did was not present in ahclist so we add it:
+  #we copy the list to a local tmplist, modify that,
   #store it in .GlobalEnv and save it, and return the matrix with haplotype
   #combinations.
   if (progress) cat(paste0("allHaploComb: calculation for SNPdid=", did, "\n"))
-  if (listloaded) tmplist <- ahclist else tmplist <- list()
+  tmplist <- ahclist
   if (length(tmplist) < nSNP) tmplist[[nSNP]] <- list()
-
   #calculate new matrix with haplotype combinations
   if (missing(SNPdosages)) SNPdosages <- dosFromSNPdid(did, nSNP, ploidy)
   allmat1 <- matrix(NA_integer_, nrow=ploidy, ncol=0) #matrix with haplotype sets in columns
@@ -223,7 +245,7 @@ allHaploComb <- function(SNPdosages=NULL, did=NULL, allhap, ploidy,
   assign("ahclist", value=tmplist, envir=.GlobalEnv, inherits=FALSE)
   if (writeFile) {
     save(ahclist, file=paste0("ahclist_", ploidy,"x.RData"))
-  } else assign("ahcunsaved", value=TRUE, envir=.GlobalEnv, inherits=FALSE)
+  } else ahcinfo$ahcunsaved <- TRUE
   allmat
 } #allHaploComb
 
@@ -293,7 +315,7 @@ completeAllHaploComb <- function(ploidy, nSNP, savesec=1800, printsec=60,
   nhap <- nrow(allhap)
   allset <- rep(nhap, ploidy) #current haplotype set: a full set has length ploidy
   while(TRUE) {
-    SNPdos <- colSums(allhap[allset,])
+    SNPdos <- colSums(allhap[allset,, drop=FALSE])
     did <- SNPdidfun(SNPdos, ploidy=ploidy)
     lasthap[did] <- lasthap[did] + 1
     if ((nc<-ncol(ahc[[did]])) < lasthap[did]) {
@@ -362,7 +384,7 @@ haplocomb2SNPdosages <- function(haplocomb, allhap) {
   if (is.vector(haplocomb)) haplocomb <- as.matrix(haplocomb)
   if (nrow(haplocomb) != nrow(allhap)) stop("haplocomb2SNPdosages: input error")
   res <- t(t(haplocomb) %*% allhap)
-  colnames(res) <- SNPdid(res, ploidy=sum(haplocomb[,1]), check=FALSE)
+  colnames(res) <- SNPdid(res, ploidy=sum(haplocomb[, 1]), check=FALSE)
   res
 } #haplocomb2SNPdosages
 
@@ -541,7 +563,7 @@ dosFromSNPdid <- function(dosageIDs, nSNP, ploidy) {
 #'by printed messages
 #'@details The returned list contains some general calculations and statistics
 #'based on SNPdosages and ploidy, and some parts that are the result of the
-#'inference. The primary of these if hclist: this contains for each SNPdid in
+#'inference. The primary of these is hclist: this contains for each SNPdid in
 #'the population the most likely combination of haplotypes (sometimes
 #'more than one, if several are equally likely = all have the maximum dosage
 #'of haplotypes inferred to be present).\cr
@@ -559,7 +581,7 @@ dosFromSNPdid <- function(dosageIDs, nSNP, ploidy) {
 #'those extra haplotypes as must-be-present, provided they occur in at least
 #'a fraction minfrac[2] of all individuals (minimum 2 individuals). If that
 #'leads to more SNPdids having a unique haplotype combination AND not having
-#'other SNPdids getting more combination) the new assignments are used.\cr
+#'other SNPdids getting more combination the new assignments are used.\cr
 #'#'If sel_hap in the function call already contains haplotypes, these are
 #'considered to be certainly present, even if they don't meet the minfrac
 #'criterion. This may help to reduce the number of haplotype configurations
@@ -583,9 +605,6 @@ dosFromSNPdid <- function(dosageIDs, nSNP, ploidy) {
 #'in SNPdidsTable, and contains a matrix with one column for each of the
 #'remaining haplotype combinations: (a subset of the columns of) the matrix
 #'returned by allHaploComb for that SNPdid}
-# not any more: \item{haploLost: (should be integer(0); a vector of haplotypes that
-#were inferred to be present in the penultimate cycle but were not inferred
-#to be present in the last cycle}
 #'}
 #'@export
 inferHaplotypes <- function(SNPdosages, indiv, ploidy, minfrac=c(0.1, 0.01),
@@ -900,6 +919,8 @@ haplotypeDosages <- function(haploblockname, ihl,
 haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname,
                                P1, P2, F1, minfrac=c(0.1, 0.01), F1frac=0.05,
                                dropUnused=TRUE, maxparcombs=150000) {
+  # All next lines to be moved to initial check function,
+  # called once for all hploblocks:
   dosmat <- checkSNPdosages(SNPdosages, indiv, ploidy)
   if (is.character(dosmat)) stop(dosmat)
   if (!is.list(F1)) F1 <- list(F1)
@@ -916,7 +937,7 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname,
   #PF1 will contain one sample name per population for each parent;
   #dosmat will be changed so that all other parental samples are deleted and
   #the sample in PF1 has the consensus SNP dosages per parent
-  for (p in 1:2) {
+  for (p in 1:2) { # p is parent (1 or 2)
     for (pop in seq_along(F1)) {
       #it can be that this same parent has already been checked for an earlier
       #population and then only one sample remains, with the consensus dosage:
@@ -930,26 +951,221 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname,
       }
       PF1[pop, p] <- P[[p]][[pop]][1]
     }
-  }
-  indiv <- colnames(dosmat)
-  # now PF1 has the names of the first samples of P1 and P2 for each pop,
+  } # for p in 1:2
+  #order the F1 populations by decreasing size:
+  F1sizes <- sapply(F1, length)
+  o <- order(F1sizes, decreasing=TRUE)
+  F1 <- F1[o]
+  P[[1]] <- P[[1]][o]
+  P[[2]] <- P[[2]][o]
+  PF1 <- PF1[o,]
+  F1sizes <- F1sizes[o]
+  #TODO: move the calculation of parental consensus scores and ordering of
+  #F1 populations to a separate function where it will be done for all
+  #markers at once (assuming the nr of missing data does not vary too much)
+  # now PF1 has the names of the one remaining sample of P1 and P2 for each F1 pop,
   # and dosmat has for each parent only one column with the consensus SNP dosages,
   # with as colnames the sample names in PF1
-  # Note that if the same parent is used multiple times all samples should
-  # be given each time (order may be different)
+  # Note that if the same parent is used in multiple F1 population the same set
+  # of samples should be given each time (order may be different)
+
+  #HERE STARTS THE CODE FOR THE CURRENT HAPLOBLOCK
+  #we have got the dosmat and the (re-ordered) P and F1 lists,
+  #NOT the PF1 and F1sizes
+
+  indiv <- colnames(dosmat); nind <- length(indiv)
+  #calculate ihl without using F1 info:
   ihl <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy,
                          minfrac=minfrac[1]) #without final inference
+  nhap <- nrow(ihl$allhap)
+  F1sizes <- integer(length(F1))
+  for (f in seq_along(F1))
+    F1sizes[f] <- sum(!is.na(ihl$did[names(ihl$did %in% F1[[f]])]))
+  largeF1s <- F1sizes > 6 * ploidy
+
+  had <- matrix(NA_integer_, nrow=nrow(ihl$allhap), ncol=nind)
+  #had is matrix of haplotype dosages
+  colnames(had) <- indiv
+  rownames(had) <- paste0(haploblockname, padded(1:nrow(ihl$allhap)))
+  F1sDone <- !largeF1s; changes <- TRUE
+  oldhap <- which(ihl$nPresent >= minfrac[1] * nind)
+  while (!all(F1sDone) && changes) {
+    #continue until all large F1s done or none of the remaining ones
+    #can be done
+    changes <- FALSE
+    for (f in seq_along(F1)) if (!F1sDone[f]) {
+      # check the current F1 using the current set of known haplotypes
+      # if (success) {
+      #   F1sDone[f] <- TRUE
+      #   if (new haplotypes added) changes <- TRUE
+      ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy,
+                      minfrac=minfrac[1], sel_hap=oldhap)
+      parents <- c(P[[1]][f], P[[2]][f])
+      sof <- solveOneF1(dosmat=dosmat, ihl=ih, F1=F1[f],
+                        P=parents, Phad=had[, parents], maxparcombs=maxparcombs,
+                        ploidy=ploidy, minfrac=minfrac, F1frac=F1frac)
+      if (nrow(sof$parcombok) == 1) {
+        pSOF <- processSOF(sof, had, oldhap, changes, F1=F1[f], P=parents,
+                           ihl=ih)
+        had <- pSOF$had; oldhap <- pSOF$oldhap; changes <- pSOF$changes
+        F1sDone[f] <- TRUE
+      }
+    }
+  }
+
+  F1sDone[!largeF1s & F1sizes > 0] <- FALSE; changes <- TRUE
+  while (!all(F1sDone) && changes) {
+    #continue until all F1s (including the large F1s) done or none of the
+    #remaining ones can be done; exactly as for largeF1s
+    changes <- FALSE
+    for (f in seq_along(F1)) if (!F1sDone[f]) {
+      # check the current F1 using the current set of known haplotypes
+      # if (success) {
+      #   F1sDone[f] <- TRUE
+      #   if (new haplotypes added) changes <- TRUE
+      ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy,
+                            minfrac=minfrac[1], sel_hap=oldhap)
+      parents <- c(P[[1]][f], P[[2]][f])
+      sof <- solveOneF1(dosmat=dosmat, ihl=ih, F1-F1[f],
+                        P=parents, Phad=had[, parents], maxparcombs=maxparcombs,
+                        ploidy=ploidy, minfrac=minfrac, F1frac=F1frac)
+      if (nrow(sof$parcombok) == 1) {
+        pSOF <- processSOF(sof, had, oldhap, changes, F1=F1[f], P=parents,
+                           ihl=ih)
+        had <- pSOF$had; oldhap <- pSOF$oldhap; changes <- pSOF$changes
+        F1sDone[f] <- TRUE
+      }
+    }
+  }
+
+  #oldhap now contains all haplotype nrs occurring in any of the
+  #parents of F1's that have been solved
+
+  #inferHaplotypes for all non-solved F1s and their parents, and panel
+  rest <- setdiff(indiv, c(do.call(c, F1[F1sDone]),
+                           do.call(c, P[[1]][F1sDone]),
+                           do.call(c, P[[2]][F1sDone])))
+  rest_ihl <- inferHaplotypes(SNPdosages=dosmat, indiv=rest, ploidy=ploidy,
+                              minfrac=minfrac, sel_hap=oldhap)
+  for (rSNPdid in names(rest_ihl$SNPdidsTable)) {
+    rcombs <- rest_ihl$hclist[[rSNPdid]]
+    if (ncol(rcombs) == 1) {
+      rSNPdidind <- names(rest_ihl$SNPdids)[rest_ihl$SNPdids == rSNPdid]
+      had[, colnames(had) %in% rSNPdidind] <- rcombs
+    }
+  }
+  #for all indiv with a unique combination of haplotypes we now have
+  #stored the solution (haplotype combination) in had
+  if (dropUnused) {
+    had <- had[rowSums(had, na.rm=TRUE) > 0, , drop=FALSE]
+    #has 0 rows if no indiv has a unique solution
+  }
+  list(message="", hapdos=had)
+
+  # (if rest_ihl with two minfrac values results in additional haplotypes,
+  # see if these can help solve the remaining F1s? And then re-iterate?
+  # For later!)
+
+  #Done!
+} #haplotypeDosagesF1
+
+processSOF <- function(sof, had, oldhap, changes, F1, P, ihl) {
+  #used only by haplotypeDosagesF1
+  #sof = list returned by solveOneF1
+  #had = matrix of haplotype dosages
+  #oldhap = vector of haplotype nrs known to be present
+  #changes indicates if the set of oldhap was modified in the current iteration
+
+  #assign hads for P and F1:
+  parhapcomb <- cbind(sof[[2]][, sof$parcombok[1,]],
+                      sof[[3]][, sof$parcombok[2,]])
+  had[, colnames(had) == P[1]] <- sof$P1combs[,parhapcomb[1,]]
+  had[, colnames(had) == P[2]] <- sof$P2combs[,parhapcomb[2,]]
+  F1combs <- getF1combs(parhapcomb)
+  F1SNPdos <- haplocomb2SNPdosages(F1combs, allhap=ihl$allhap)
+  F1SNPdids <- colnames(F1SNPdos) #SNPdids may not be unique,
+  #                  several haplotype combs may give the same SNP dosages
+  for (f1SNPdid in unique(F1SNPdids)) {
+    if (sum(F1SNPdids == f1SNPdid) == 1) {
+      #f1SNPdid unique, only one haplotype comb in F1 matches f1SNPdid
+      SNPdidcomb <- F1combs[, F1SNPdids == f1SNPdid, drop=FALSE]
+      SNPdidind <- names(ihl$SNPdids[ihl$SNPdids == f1SNPdid]) #all indiv with this f1SNPdid
+      SNPdidind <- intersect(F1, SNPdidind)
+      had[, colnames(had) %in% SNPdidind] <- SNPdidcomb
+    }
+  }
+  #check if parhapcomb contains new haplotypes:
+  parhap <- which(rowSums(parhapcomb) > 0)
+  m <- match(parhap, oldhap)
+  if (anyNA(m)) {
+    #new haplotypes found in parents that earlier were not marked as present
+    changes <- changes || anyNA(m)
+    oldhap <- sort(union(oldhap, parhap))
+  }
+  list(had=had, oldhap=oldhap, changes=changes)
+} #processSOF
+
 
-  #First:analyse F1 with parents:
+#'@title find the optimal combination of parental haplotype combinations
+#'@description For one F1 population and one haploblock, find the optimal
+#'combination of parental haplotype combinations
+#'@usage solveOneF1(dosmat, ihl, F1, P, Phad, maxparcombs, ploidy,
+#'minfrac, F1frac)
+#'@param dosmat dosmat with only the SNPs in the current haploblock, and with
+#'only one column for each F1 parent, containing its consensus dosages;
+#'otherwise all individuals of interest within and outside the F1 are retained
+#'@param ihl a list returned by inferHaplotypes
+#'@param F1 character vector with sample names of the current F1
+#'@param P vector with 2 sample names, one for parent1 and one for parent2.
+#'The corresponding columns in dosmat should have pre-calculated consensus
+#'SNP genotypes for both parents, and orig_ihl should be based on that
+#'dosmat
+#'@param Phad matrix with 2 columns, with the haplotype dosages assigned
+#'to the parents. Usually both contain only NA_integer_s, but if they are
+#'assigned we consider only that had
+#'@param maxparcombs Parent 1 and 2 both may have multiple possible haplotype
+#'combinations. For each pair of haplotype combinations (one from P1 and one
+#'from P2) the expected F1 segregation must be checked against the observed.
+#'This may take a long time if many such combinations need to be checked.
+#'This parameter sets a limit to the number of allowed combinations; default
+#'150000 takes about 45 min.
+#'@param ploidy all SNP dosages should be in 0:ploidy or NA
+#'@param minfrac vector of two fractions, default 0.1 and 0.01. A haplotype is
+#'considered to be certainly present if it occurs in at least a fraction
+#'minfrac[1] of all individuals; in the final stage for the "other"
+#'individuals (those that do not belong to the F1 or its parents) this fraction
+#'is lowered to minfrac[2]; see also inferHaplotypes
+#'@param F1frac the fraction of F1 individuals for which it is accepted
+#'that they are not explained by a parental haplotype combination; default 0.05
+#'(allowing for a few percent of Double Reduction)
+#'@return a list with a message, two matrices P1combs and P2combs with the
+#'haplotype combinations that were considered for each parent
+#'and a matrix parcombok with 0 or 1 rows and two columns: 0 rows if no
+#'solution was found, 1 row indexing the columns of P1combs and P2combs
+#'if a solution was found.
+solveOneF1 <- function(dosmat, ihl, F1, P, Phad, maxparcombs, ploidy, minfrac,
+                       F1frac) {
+  #TODO: If no solution found and Phad not NA's for both parents,
+  #it may be that the had(s) assigned based on a previous F1 is not correct.
+  #In that case ignore the Phad and if that succeeds, go back to previous F1
+  #with new had for that parent
 
-  calcParcombok <- function(P1combs, P2combs, oldP1combs, oldP2combs) {
+  calcParcombok <- function(P1combs, P2combs, oldP1combs, oldP2combs, ihl,
+                            F1frac) {
+    # Function within solveOneF1
+    # Calculates the combinations of parental haplotype combinations
+    # that can explain (almost) all F1 individuals
+    # P1combs, P2combs: matrices with haplotype combinations for P1 and P2
+    # oldP1combs, oldP2combs: matrices with parental haplotypes combs that
+    # were already calculated and  whose P1-P2 combinations were no solutions
+    # (may have 0 columns if none)
     #TODO: parallellize; see also allHaploComb
     P1matchold <- match(as.data.frame(P1combs), as.data.frame(oldP1combs))
     P2matchold <- match(as.data.frame(P2combs), as.data.frame(oldP2combs))
     # NOTE that match matches vector elements, also when the vector is a
     #      list (or data.frame)
     if (!anyNA(P1matchold) && !anyNA(P2matchold)) {
-      #no new parental combinations
+      #no new parental combinations, so no solutions
       parcombok <- matrix(integer(0), ncol=2)
     } else {
       missedF1s <- matrix(NA_integer_, nrow=ncol(P1combs), ncol=ncol(P2combs))
@@ -983,93 +1199,74 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname,
       # indices to P1combs and P2combs
     }
     list(P1combs=P1combs, P2combs=P2combs, parcombok=parcombok)
-  } #calcParcombok within haplotypeDosagesF1
-
-  for (pop in seq_along(F1)) {
-    #todo: analysis per F1; in which order:
-    #large to small F1, or small to large nr of parental combs, or ...?
-    #among F1's larger than (eg) 6 * ploidy: order by increasing nr
-    #of parental combinations based on initial inferHaplotypes;
-    #- when no solution found: calculate nr of parental combs based on all
-    #  possibilities and reorder pops
-    #- when solution found: re-do inferHaplotypes with the known parental
-    #  haplotypes present and re-order the remaining pops that have not been
-    #  tried yet on the new nr of parental combs
-    #- after all larger pops have been done, do the small F1's in decreasing
-    #  order of size (first based on inferHaplotypes, then if needed based
-    #  on all possible combs)
-    #- after the last F1, a last inferHaplotypes with all parental combs given
-    #  to score the "other" samples
-  }
-
-  F1dat <- data.frame(
-    size = integer(length(F1)), #fixed: nr of F1 indiv with non-NA SNPdid
-    parcombcount = integer(length(F1)), #will change several times
-    nextcombs = rep(1, length(F1)), #next to try:
-    #           1= parcombs based on last inferHaplotypes,
-    #           2= all possible parcombs except tried earlier,
-    #           3= nothing (ready),
-    solved = rep(FALSE, length(F1))
-  )
-
-  for (f in seq_along(F1)) {
-    dids <- ihl$SNPdids[names(ihl$SNPdids) %in% F1[[f]]]
-    F1dat$size[f] <- sum(is.na(dids))
-  }
-  largeF1s <- which(F1$size > 6 * ploidy)
+  } #calcParcombok within solveOneF1
 
+  nhap <- nrow(ihl$allhap)
+  PhadNA <- is.na(Phad[1,]) #all or none of the values in a Phad column are NA
 
   F1SNPdidsTable <- table(ihl$SNPdids[names(ihl$SNPdids) %in% F1])
-  P1combs <- getParentalCombs(parsamp=P[1], ihl=ihl, useIH=TRUE)
-  P2combs <- getParentalCombs(parsamp=P[2], ihl=ihl, useIH=TRUE)
+  if (PhadNA[1]) {
+    P1combs <- getParentalCombs(parsamp=P[1], ihl=ihl, useIH=TRUE)
+  } else P1combs <- Phad[, 1]
+  if (PhadNA[2]) {
+    P2combs <- getParentalCombs(parsamp=P[2], ihl=ihl, useIH=TRUE)
+  } else P2combs <- Phad[, 2]
   message <- ""
-  if (ncol(P1combs) * ncol(P2combs) > maxparcombs) {
-    message <- paste("more than", maxparcombs, "parental combinations; skipped,",
-                     "result based on inferHaplotypes")
+  lst <- NULL
+  firstcombs <- ncol(P1combs) * ncol(P2combs)
+  if (firstcombs == 0 || firstcombs > maxparcombs) {
+    # too many potential combinations (it takes about 45 min to check
+    # 150000 combinations)
+    message <- paste("more than", maxparcombs, "parental combinations; skipped")
     parcombok <- matrix(integer(0), ncol=2)
   } else {
     origP1combs <- P1combs; origP2combs <- P2combs
     lst <- calcParcombok(P1combs=origP1combs, P2combs=origP2combs,
                          oldP1combs=matrix(integer(0), nrow=nrow(origP1combs)),
-                         oldP2combs=matrix(integer(0), nrow=nrow(origP2combs)))
+                         oldP2combs=matrix(integer(0), nrow=nrow(origP2combs)),
+                         ihl=ihl, F1frac=F1frac)
+    # note that in this first try the oldPcombs have 0 columns as there has not
+    # yet been an earlier calculation
     if (nrow(lst$parcombok) == 0) {
       #No parental combinations fit.
       #1. See how many parental combinations are possible without first
       #   inferHaplotypes (i.e. with allHaploComb).
-      #   If p1*p2 < 150000 (ca 45 min) try them all.
-      nwP1combs <- getParentalCombs(parsamp=P1, ihl=ihl, useIH=FALSE)
-      nwP2combs <- getParentalCombs(parsamp=P2, ihl=ihl, useIH=FALSE)
-      if (ncol(nwP1combs) * ncol(nwP2combs) > maxparcombs) {
+      #   If p1*p2 < maxparcombs we try them all.
+      if (PhadNA[1]) {
+        nwP1combs <- getParentalCombs(parsamp=P[1], ihl=ihl, useIH=FALSE)
+      } else nwP1combs <- Phad[, 1]
+      if (PhadNA[2]) {
+        nwP2combs <- getParentalCombs(parsamp=P[2], ihl=ihl, useIH=FALSE)
+      } else nwP2combs <- Phad[, 2]
+      if ((ncol(nwP1combs) * ncol(nwP2combs)) - firstcombs > maxparcombs) {
         message <- paste("more than", maxparcombs,
-                         "extra parental combinations; skipped,",
-                         "result based on inferHaplotypes")
+                         "extra parental combinations; skipped")
       } else {
         lst <- calcParcombok(P1combs=nwP1combs, P2combs=nwP2combs,
-                             oldP1combs=origP1combs, oldP2combs=origP2combs)
+                             oldP1combs=origP1combs, oldP2combs=origP2combs,
+                             ihl=ihl, F1frac=F1frac)
         if (nrow(lst$parcombok) == 0) {
           #2. If 1. does not find a solution due to wrong parental SNP data or
           #   because it was skipped because of too many combinations,
           #   just return the inferHaplotypes + haplotypeDosages result,
           #   with a message.
-          message <- "no valid F1 solutions found; result based on inferHaplotypes"
+          message <- "no valid F1 solutions found"
         }
       }
     }
   }
   if (message != "") {
-    ihl <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy,
-                           minfrac=minfrac) #including final inference
-    had <- haplotypeDosages(haploblockname=haploblockname, ihl=ihl,
-                            partial=TRUE, dropUnused=dropUnused)
-    return(list(message=message, hapdos=had))
+    if (is.null(lst))
+      lst <- list(P1combs=P1combs, P2combs=P2combs,
+                  parcombok=matrix(integer(0), ncol=2))
+    return(c(list(message=message), lst))
   }
 
   # now we have one or more acceptable parental combinations
   P1combs <- lst$P1combs
   P2combs <- lst$P2combs
   parcombok <- lst$parcombok
-  #P1combs and P2combs only have haplotype combinations that may be ok,
-  #and parcombok indicates which combinations of these match (almost) all F1's.
+  #parcombok indicates which combinations of these match (almost) all F1's.
   #Now, any of these combinations could be the correct one. We check all of them
   #and use the best. So for each combination:
   # (1) get the total set of parental haplotypes: these are all known to be
@@ -1084,6 +1281,7 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname,
   #       (that are not certainly present); leave for now
   #     - minhap: the minimum number of haplotypes needed to explain (almost)
   #       all individuals (as small as possible); harder, leave for now
+  indiv <- colnames(dosmat)
   parcombstats <- data.frame(hAbsent = rep(NA_integer_, nrow(parcombok)),
                              indSingle = rep(NA_integer_, nrow(parcombok)),
                              indExtra = rep(NA_integer_, nrow(parcombok)),
@@ -1093,8 +1291,8 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname,
     parcomb <- cbind(P1combs[,parcombok[pco, 1]],
                      P2combs[,parcombok[pco, 2]])
     parhaplo <- which(rowSums(parcomb) > 0)
-    ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv , ploidy=ploidy,
-                             minfrac=minfrac[1], sel_hap=parhaplo)
+    ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy,
+                          minfrac=minfrac[1], sel_hap=parhaplo)
     ihlist[[pco ]] <- ih
     parcombstats$hAbsent[pco] <- sum(ih$nAbsent > 0.9 * length(indiv))
     #parcombstats$indSingle (nr of ind with a unique haplotype combi):
@@ -1119,61 +1317,12 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname,
     best2 <- which.max(parcombstats$hAbsent[best])
     best <- best[best2]
   }
-
-  #finally:
-  #we know which pco (parental combination) is the best.
-  #haplotype dosages for parents are available from parcombok[best,]
-  #haplotype dosages for F1 can then be recalculated with getF1combs
-  #haplotype dosages for the other samples are recalculated using
-  #   parcombok[best,] and both minfrac values
-
-  had <- matrix(NA_integer_, nrow=nrow(ihl$allhap), ncol=length(indiv))
-  #had means haplotype dosages
-  colnames(had) <- indiv
-  rownames(had) <- paste0(haploblockname, padded(1:nrow(ihl$allhap)))
-  parcomb <- cbind(P1combs[, parcombok[best, 1]], #selected haplotype comb of P1
-                   P2combs[, parcombok[best, 2]]) #and P2
-  had[, colnames(had) %in% P1] <- parcomb[,1]
-  had[, colnames(had) %in% P2] <- parcomb[,2]
-  #F1: it may be that among the F1 some SNPdid have only one possible combination
-  #of haplotypes, while the same SNPdid in the other samples has more than
-  #one possible combination (and hence would be discarded). Or that a solution
-  #that works in unrelated material is incompatible with the F1 parents.
-  #So we do the F1 separately from the others:
-  F1combs <- getF1combs(parcomb)
-  F1SNPdos <- haplocomb2SNPdosages(F1combs, allhap=ihl$allhap)
-  F1SNPdids <- colnames(F1SNPdos) #SNPdids may not be unique,
-  #                  several haplotype combs may give the same SNP dosages
-  for (f1SNPdid in unique(F1SNPdids)) {
-    if (sum(F1SNPdids == f1SNPdid) == 1) {
-      #f1SNPdid unique, only one haplotype comb in F1 matches f1SNPdid
-      SNPdidcomb <- F1combs[, F1SNPdids == f1SNPdid, drop=FALSE]
-      SNPdidind <- names(ihl$SNPdids[ihl$SNPdids == f1SNPdid]) #all indiv with this f1SNPdid
-      SNPdidind <- intersect(F1, SNPdidind)
-      had[, colnames(had) %in% SNPdidind] <- SNPdidcomb
-    }
-  }
-  other <- setdiff(indiv, c(do.call(c, F1), PF1)) #all except F1s and F1 parents
-  if (length(other) > 0) {
-    parhaplo <- which(rowSums(parcomb) > 0)
-    iho <- inferHaplotypes(SNPdosages=dosmat, indiv=other, ploidy=ploidy,
-                           minfrac=minfrac, sel_hap=parhaplo)
-    for (oSNPdid in names(iho$SNPdidsTable)) {
-      ocombs <- iho$hclist[[oSNPdid]]
-      if (ncol(ocombs) == 1) {
-        oSNPdidind <- names(iho$SNPdids)[iho$SNPdids == oSNPdid]
-        had[, colnames(had) %in% oSNPdidind] <- ocombs
-      }
-    }
-  }
-  #for all indiv with a unique combination of haplotypes we now have
-  #stored the solution (haplotype combination) in had
-  if (dropUnused) {
-    had <- had[rowSums(had, na.rm=TRUE) > 0, , drop=FALSE]
-    #has 0 rows if no indiv has a unique solution
-  }
-  list(message="", hapdos=had)
-} #haplotypeDosagesF1
+  return(list(c(message="", lst[1:2], parcombok=parcombok[best,, drop=FALSE])))
+  #note: the calling routine can get the set of parental haplootypes by:
+  #if(nrow(result$parcombok) != 1) integer(0) else
+  #  which(rowSums(cbind(result[[2]][,result$parcombok[1,]],
+  #                      result[[3]][,result$parcombok[2,]])) > 0)
+} #solveOneF1
 
 #TODO:
 # - impute missing SNP dosages (1) in F1 (2) in other?
@@ -1194,7 +1343,8 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname,
 #'obtain consensus scores
 #'@param solveConflicts if TRUE (default) and there are conflicting dosage
 #'assignments between the samples for the same SNP, the one with highest
-#'frequency is used. If there are more dosages with the maximum frequency or if
+#'frequency is used, provided the number of other dosages is not more than 1
+#'or not more than 10% of the frequency of the maximum dosage. If
 #'solveConflicts is FALSE and there are conflicting dosages, the consensus for
 #'that SNP will be NA
 #'@return a vector with one consensus dosage for each row of dosmat, and the
@@ -1216,7 +1366,10 @@ getConsensusSNPdosages <- function(dosmat, indiv, solveConflicts=TRUE) {
       tb <- table(dosmat[i,]) # length 0 if all NA
       suppressWarnings({ maxtb <- which(tb == max(tb)) })
       if (length(maxtb) == 1) {
-        mindos[i] <- as.integer(names(tb)[maxtb])
+        othercount <- sum (!is.na(dosmat[i,])) - tb[maxtb]
+        if (othercount > 1 && othercount > 0.1 * tb[maxtb]) {
+          mindos[i] <- NA
+        } else mindos[i] <- as.integer(names(tb)[maxtb])
       } else mindos[i] <- NA
     }
   } else {
-- 
GitLab