new completeAllHaploComb. Start with use of multiple F1 populations.

5b4cabbc · Voorrips, Roeland · 32c7b45b · 5b4cabbc
Commit 5b4cabbc authored 8 years ago by Voorrips, Roeland
--- a/R/PolyHaplotyper.R
+++ b/R/PolyHaplotyper.R
@@ -11,17 +11,19 @@
 # ploidy level.
 # PolyHaplotyper should not be confused with PediHaplotyper, which is aimed
-# at diploid individuals that are all part of a pedigree. In contrast,
+# at diploid individuals that are all part of a pedigree, and where the
-# PolyHaplotyper can handle any ploidy level and unreletaed individuals.
+# (bi- or multi-allelic) marker alleles have been phased before.
-# It can make use of F1 populations, but (so far) does not use segregation
+# In contrast, PolyHaplotyper can handle any ploidy level and uses unphased
-# ratios.
+# bi-allelic marker genotypes (i.e. marker allele dosages). It can work with
+# unrelated individuals and/or F1 populations, but (so far) does not use
+# segregation ratios.
 #'@importFrom utils combn
 #the following line serves to stop devtools::check complaining about
 #some identifiers that do exist but it doesn't find because they are
 #loaded from RData files and/or created in the global evironment:
-utils::globalVariables(names=c("ahcploidy", "ahclist"),
+utils::globalVariables(names=c("ahcploidy", "ahclist", "ahcunsaved"),
                       add=TRUE)
 # General approach (not using segregation ratios)
@@ -87,16 +89,25 @@ loadAllHaploCombList <- function(ploidy) {
  # Checks variable ahcploidy in GlobalEnv. If that doesn't exist or
  # is not equal to ploidy, attempts to load ahclist into the GlobalEnv
  # from a file 'ahclist_nx.RData' in the current working directory (where n is
-  # the ploidy). If that is succesful, ploidy is stored in variable ahcploidy
+  # the ploidy). If that is successful, ploidy is stored in variable ahcploidy
  # in the GlobalEnv, else neither ahcploidy not ahclist is created or changed.
  # Return value is TRUE if the ahclist was already available or is loaded
  # succesfully, else FALSE.
-  listloaded <- exists("ahcploidy", envir=.GlobalEnv, inherits==FALSE) &&
+  listloaded <- FALSE
-    (ahcploidy == ploidy) &&
+  if (exists("ahcploidy", envir=.GlobalEnv, inherits==FALSE) &&
-    exists("ahclist", envir=.GlobalEnv, inherits==FALSE)
+    exists("ahclist", envir=.GlobalEnv, inherits==FALSE)) {
+    listloaded <- ahcploidy == ploidy
+    if (!listloaded && exists("ahcunsaved", envir=.GlobalEnv, inherits==FALSE)
+        && ahcunsaved) {
+      #the currently loaded ahclist is for a different ploidy and unsaved;
+      #we save it before loading the ahclist for the current ploidy:
+      save(ahclist, file=paste0("ahclist_", ahcploidy,"x.RData"))
+    }
+  }
  if (!listloaded && file.exists(paste0("ahclist_", ploidy, "x.RData"))) {
    load(paste0("ahclist_", ploidy,"x.RData"), envir=.GlobalEnv)
    assign("ahcploidy", value=ploidy, envir=.GlobalEnv, inherits=FALSE)
+    assign("ahcunsaved", value=FALSE, envir=.GlobalEnv, inherits=FALSE)
    listloaded <- TRUE
  }
  listloaded
@@ -104,7 +115,8 @@ loadAllHaploCombList <- function(ploidy) {
 #'@title get all haplotype combinations that result in the given SNP dosages
 #'@description get all haplotype combinations that result in the given SNP dosages
-#'@usage allHaploComb(SNPdosages, did, allhap, ploidy, progress=FALSE)
+#'@usage allHaploComb(SNPdosages=NULL, did=NULL, allhap, ploidy,
+#'writeFile=TRUE, progress=FALSE)
 #'@param SNPdosages an integer vector with the dosages of each SNP in one
 #'individual, with the SNPs in the same order as in the columns of allhap.
 #'Either SNPdosages or did must be specified, if both are specified they
@@ -116,6 +128,11 @@ loadAllHaploCombList <- function(ploidy) {
 #'same order as SNPdosages
 #'@param ploidy the ploidy of the individual; SNPdosages should all be in
 #'0:ploidy
+#'@param writeFile The calculated haplotype combis are stored in list
+#'ahclist in GlobalEnv. Default writeFile TRUE ensures that this list is written
+#'to file each time a new SNPdid is calculated. With writeFile FALSE the list is
+#'not written to file but a variable ahcunsaved with value FALSE is
+#'created in GlobalEnv.
 #'@param progress whether to print a message when starting calculations
 #'for a did; default FALSE
 #'@details Each column of the return value represents one combination of
@@ -124,7 +141,8 @@ loadAllHaploCombList <- function(ploidy) {
 #'@return an integer matrix with one row per haplotype (corresponding to allhap)
 #'and one column per combination of haplotypes, with the number of copies
 #'of each haplotype in each combination (the dosages of the haplotypes)
-allHaploComb <- function(SNPdosages, did, allhap, ploidy, progress=FALSE) {
+allHaploComb <- function(SNPdosages=NULL, did=NULL, allhap, ploidy,
+                         writeFile=TRUE, progress=FALSE) {
  # TODO: implement parallel computing of combinations for new did combinations,
  # https://artax.karlin.mff.cuni.cz/r-help/library/Rdsm/html/Rdsm-package.html
  # seems useful (using shared memory) but may need to delay saving ahclist
@@ -134,11 +152,11 @@ allHaploComb <- function(SNPdosages, did, allhap, ploidy, progress=FALSE) {
  #initial checks:
  nSNP <- ncol(allhap)
-  if (!missing(SNPdosages) &&
+  if (!missing(SNPdosages) && !is.null(SNPdosages) &&
      (length(SNPdosages) != nSNP || !all(SNPdosages %in% 0:ploidy)))
    stop("allHaploComb: invalid SNPdosages")
-  if (missing(did)) {
+  if (missing(did) || is.null(did)) {
-    if (missing(SNPdosages)) {
+    if (missing(SNPdosages) || is.null(SNPdosages)) {
      stop("allHaploComb: SNPdosages or did must be specified")
    } else did <- SNPdidfun(SNPdosages, ploidy=ploidy)
  }
@@ -158,7 +176,7 @@ allHaploComb <- function(SNPdosages, did, allhap, ploidy, progress=FALSE) {
  #we create or copy the list to a local tmplist, modify that,
  #store it in .GlobalEnv and save it, and return the matrix with haplotype
  #combinations.
-  if (progress) cat(paste0("allHaploComb: calculation for did=", did, "\n"))
+  if (progress) cat(paste0("allHaploComb: calculation for SNPdid=", did, "\n"))
  if (listloaded) tmplist <- ahclist else tmplist <- list()
  if (length(tmplist) < nSNP) tmplist[[nSNP]] <- list()
@@ -203,10 +221,128 @@ allHaploComb <- function(SNPdosages, did, allhap, ploidy, progress=FALSE) {
  tmplist[[nSNP]][[didix]] <- allmat
  names(tmplist[[nSNP]])[didix] <- did
  assign("ahclist", value=tmplist, envir=.GlobalEnv, inherits=FALSE)
-  save(ahclist, file=paste0("ahclist_", ploidy,"x.RData"))
+  if (writeFile) {
+    save(ahclist, file=paste0("ahclist_", ploidy,"x.RData"))
+  } else assign("ahcunsaved", value=TRUE, envir=.GlobalEnv, inherits=FALSE)
  allmat
 } #allHaploComb
+getHapcombCount_1SNP <- function(SNPdosage, ploidy, nhap) {
+  #nhap = total nr of different haplotypes for nSNP SNPs = 2^nSNP
+  #returns the number of haplotype combinations for a given dosage d at one SNP
+  #currently not used
+  d <- SNPdosage
+  h <- nhap %/% 2 #as.integer(2 ^ (nSNP-1) + 0.001)
+  if (d == ploidy) C0 <- 1 else
+    C0 <- cumprod(seq(h, h+ploidy-d-1))[ploidy-d] / cumprod(seq_len(ploidy-d))[ploidy-d]
+  if (d == 0) C1 <- 1 else
+    C1 <- cumprod(seq(h, h+d-1))[d] / cumprod(seq_len(d))[d]
+  C0 * C1
+} #getHapcombCount_1SNP
+totHapcombCount <- function(ploidy, nhap) {
+  #nhap = total nr of different haplotypes for nSNP SNPs = 2^nSNP
+  #returns the total number of haplotype combinations
+  #currently not used
+  cumprod(seq(nhap, nhap+ploidy-1))[ploidy] /
+      cumprod(seq_len(ploidy))[ploidy]
+} #totHapcombCount
+#'@title generate all haplotype combinations
+#'@description generate a list which contains for each SNP dosage combination
+#'at a given ploidy all matching haplotytpe combinations
+#'@usage completeAllHaploComb(ploidy, nSNP, savesec=1800, printsec=60,
+#'fname=NULL)
+#'@param ploidy ploidy (may be even or odd)
+#'@param nSNP number of SNPs in the haploblock
+#'@param savesec number of seconds between successive saves of the intermediate
+#'results
+#'@param printsec number of seconds between printout of the currect set of
+#'haplotypes (the last two are always equal at the time of printing)
+#'@param fname filename (to which the extension .RData will be added) to store
+#'the results. Intermediate saves will go to fname + the current set of
+#'haplotypes; these intermediate files are temporary and will be deleted.\cr
+#'If NULL (default), fname will be set to e.g. ahc_4x_nSNP6, where 4 is the
+#'ploidy and nSNP = 6
+#'@return a list with for each SNPdid (each combination of SNP dosages) a
+#'matrix with one row per haplotype and one column per haplotype combination,
+#'containing the dosages of the haplotypes in each haplotype combination
+#'@export
+completeAllHaploComb <- function(ploidy, nSNP, savesec=1800, printsec=60,
+                                 fname=NULL) {
+  # this will generate a complete sub-list of an ahclist (see allHaploComb)
+  # for the given ploidy and number of SNPs
+  # and save it to a file called eg ahc_4x_nSNP7.RData
+  # every savesec seconds a temporary file will be saved with the total nr of
+  # SNPdids done so far appended to the filename
+  if (is.null(fname)) fname <- paste0("ahc_", ploidy, "x_nSNP", nSNP)
+  ndids <- as.integer((ploidy + 1) ^ nSNP + 0.001)
+  ahc <- list()
+  for (did in ndids:1) ahc[[did]] <- matrix(NA_integer_, nrow=ploidy, ncol=1)
+  lasthap <- rep(0, ndids) # last stored haplotype column for each did
+  allhap <- allHaplotypes(paste0("m", 1:nSNP))
+  print(allhap)
+  lastfile <- ""
+  lastsave <- proc.time()
+  lastprint <- lastsave - printsec - 1
+  #new approach:
+  #instead of taking all SNPdosages in turn and checking which hapcomb
+  #matches them, generate all hapcombs in turn and calculate their SNPdids
+  nhap <- nrow(allhap)
+  allset <- rep(nhap, ploidy) #current haplotype set: a full set has length ploidy
+  while(TRUE) {
+    SNPdos <- colSums(allhap[allset,])
+    did <- SNPdidfun(SNPdos, ploidy=ploidy)
+    lasthap[did] <- lasthap[did] + 1
+    if ((nc<-ncol(ahc[[did]])) < lasthap[did]) {
+      #we don't want to reallocate for each new column but increase size
+      #when needed by 50%
+      extmat <- matrix(NA_integer_, nrow=ploidy, ncol= nc %/% 2 + 2)
+      ahc[[did]] <- cbind(ahc[[did]], extmat)
+    }
+    ahc[[did]][,lasthap[did]] <- allset
+    # get the next allset:
+    len <- ploidy
+    allset[len] <- allset[len] - 1
+    while (len > 0 && allset[len] <= 0) {
+      len <- len - 1
+      if (len > 0) allset[len] <- allset[len] - 1
+    }
+    if (len <= 0) break
+    if (len < ploidy) {
+      allset[(len+1):ploidy] <- allset[len] #nhap
+      #progress and save: only check when len < ploidy
+      if ((proc.time()-lastprint)[3] > printsec) {
+        cat(paste0("ploidy: ", ploidy, " nSNP: ", nSNP,
+                   " allset: ", paste(allset, collapse=" "), "\n"))
+        lastprint <- proc.time()
+        if ((lastsave-lastprint)[3] > savesec) {
+          save(ahc, file=paste0(fname, "_", paste(allset, collapse="-"),
+                                ".RData"))
+          if (file.exists(lastfile)) file.remove(lastfile)
+          lastfile <- fname
+          lastsave <- proc.time()
+        }
+      }
+    } # len < ploidy
+  }
+  # all elements of ahc are now matrices with one solution per column,
+  # in the form of sets like 1-1-2-4 (i.e. all occurring haplotypes listed).
+  # now we convert that to columns with the dosages of each haplotype:
+  #
+  for (did in seq_along(ahc)) {
+    #ncomb <- ncol(ahc[[did]])
+    allmat <- matrix(NA_integer_, nrow=nhap, ncol=lasthap[did])
+    for (col in seq_len(lasthap[did]))
+      allmat[,col] <- tabulate(ahc[[did]][,col], nbins=nhap)
+    ahc[[did]] <- allmat
+  }
+  save(ahc, file=paste0(fname, ".RData"))
+  if (file.exists(lastfile)) file.remove(lastfile)
+  invisible(ahc)
+} #completeAllHaploComb
 #'@title calculate the SNP dosages resulting from haplotype combinations
 #'@description calculate the SNP dosages resulting from haplotype combinations
@@ -711,15 +847,21 @@ haplotypeDosages <- function(haploblockname, ihl,
 #'names, SNP names are the row names or (if a data.frame) in a column named
 #'MarkerNames. All SNP dosages must be in 0:ploidy or NA. If a data.frame,
 #'additional columns may be present.
-#'@param indiv NULL (default) or a character vector with names of individuals
+#'@param indiv NULL (default) or a character vector with names of all individuals
-#'to be selected. If NULL, all columns are selected;
+#'to be considered. If NULL, all columns are selected;
-#'if SNPdosages is a data.frame, that is probably not what is intended.
+#'if SNPdosages is a data.frame, that is probably not what is intended.\cr
+#'All indivs that are not in any of the P1, P2 or F1 vectors are considered
+#'unrelated, i.e. we have no implementation for pedigrees (yet).
 #'@param ploidy all SNP dosages should be in 0:ploidy or NA
 #'@param haploblockname prefix to which the (zero-padded) haplotype numbers
 #'are added. Any separator (like '_') should be part of haploblockname
-#'@param P1 character vector with sample name(s) for P1 (parent 1)
+#'@param P1 character vector with sample name(s) for P1 (parent 1), or a list
-#'@param P2 character vector with sample name(s) for P2 (parent 2)
+#'of such vectors in case there are multiple F1s. Note that there may be
-#'@param F1 character vector of sample names for F1
+#'multiple samples of a parent from which a consensus marker dosage combination
+#'is calculated.
+#'@param P2 as P1, for parent 2
+#'@param F1 character vector of sample names for F1, or a list
+#'of such vectors in case there are multiple F1s
 #'@param minfrac vector of two fractions, default 0.1 and 0.01. A haplotype is
 #'considered to be certainly present if it occurs in at least a fraction
 #'minfrac[1] of all individuals; in the final stage for the "other"
@@ -755,15 +897,46 @@ haplotypeDosages <- function(haploblockname, ihl,
 #'If dropUnused is TRUE the matrix has 0 rows if no unique haplotype
 #'combinations can be inferred for any individual
 #'@export
-haplotypeDosagesF1 <- function(SNPdosages, indiv, ploidy, haploblockname,
+haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname,
                               P1, P2, F1, minfrac=c(0.1, 0.01), F1frac=0.05,
                               dropUnused=TRUE, maxparcombs=150000) {
  dosmat <- checkSNPdosages(SNPdosages, indiv, ploidy)
  if (is.character(dosmat)) stop(dosmat)
+  if (!is.list(F1)) F1 <- list(F1)
+  #F1 is now a list with one vector of sample names for each F1 population
+  if (!all(do.call(c, F1) %in% colnames(dosmat)))
+    stop("Some F1 samples not present")
+  P <- list(P1, P2) # a list of lists
+  for (p in 1:2) if (!is.list(P[[p]])) P[[p]] <- list(P[[p]])
+  msg <- checkAllParentSets(P, npop=length(F1), indiv=colnames(dosmat))
+  if (msg != "") stop(msg)
+  PF1 <- matrix(NA, nrow=length(F1), ncol=2,
+              dimnames=list(paste0("pop", seq_along(F1)), c("P1", "P2")))
+  #PF1 will contain one sample name per population for each parent;
+  #dosmat will be changed so that all other parental samples are deleted and
+  #the sample in PF1 has the consensus SNP dosages per parent
+  for (p in 1:2) {
+    for (pop in seq_along(F1)) {
+      #it can be that this same parent has already been checked for an earlier
+      #population and then only one sample remains, with the consensus dosage:
+      P[[p]][[pop]] <- intersect(P[[p]][[pop]], colnames(dosmat))
+      if (length(P[[p]][[pop]]) > 1) {
+        P[[p]][[pop]] <- sort(P[[p]][[pop]])
+        Pcons <- getConsensusSNPdosages(dosmat=dosmat, indiv=P[[p]][[pop]])
+        delsamp <- P[[p]][[pop]][-1] #only not the first sample
+        dosmat <- dosmat[, -(colnames(dosmat) %in% delsamp)]
+        dosmat[,colnames(dosmat) == P[[p]][[pop]][1]] <- Pcons
+      }
+      PF1[pop, p] <- P[[p]][[pop]][1]
+    }
+  }
  indiv <- colnames(dosmat)
-  P1 <- as.character(P1); P2<- as.character(P2); F1 <- as.character(F1)
+  # now PF1 has the names of the first samples of P1 and P2 for each pop,
-  if (!all(c(P1, P2) %in% indiv)) stop("Some parental samples missing")
+  # and dosmat has for each parent only one column with the consensus SNP dosages,
-  if (!all(F1 %in% indiv)) stop("Some F1 samples missing")
+  # with as colnames the sample names in PF1
+  # Note that if the same parent is used multiple times all samples should
+  # be given each time (order may be different)
  ihl <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy,
                         minfrac=minfrac[1]) #without final inference
@@ -812,9 +985,43 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv, ploidy, haploblockname,
    list(P1combs=P1combs, P2combs=P2combs, parcombok=parcombok)
  } #calcParcombok within haplotypeDosagesF1
+  for (pop in seq_along(F1)) {
+    #todo: analysis per F1; in which order:
+    #large to small F1, or small to large nr of parental combs, or ...?
+    #among F1's larger than (eg) 6 * ploidy: order by increasing nr
+    #of parental combinations based on initial inferHaplotypes;
+    #- when no solution found: calculate nr of parental combs based on all
+    #  possibilities and reorder pops
+    #- when solution found: re-do inferHaplotypes with the known parental
+    #  haplotypes present and re-order the remaining pops that have not been
+    #  tried yet on the new nr of parental combs
+    #- after all larger pops have been done, do the small F1's in decreasing
+    #  order of size (first based on inferHaplotypes, then if needed based
+    #  on all possible combs)
+    #- after the last F1, a last inferHaplotypes with all parental combs given
+    #  to score the "other" samples
+  }
+  F1dat <- data.frame(
+    size = integer(length(F1)), #fixed: nr of F1 indiv with non-NA SNPdid
+    parcombcount = integer(length(F1)), #will change several times
+    nextcombs = rep(1, length(F1)), #next to try:
+    #           1= parcombs based on last inferHaplotypes,
+    #           2= all possible parcombs except tried earlier,
+    #           3= nothing (ready),
+    solved = rep(FALSE, length(F1))
+  )
+  for (f in seq_along(F1)) {
+    dids <- ihl$SNPdids[names(ihl$SNPdids) %in% F1[[f]]]
+    F1dat$size[f] <- sum(is.na(dids))
+  }
+  largeF1s <- which(F1$size > 6 * ploidy)
  F1SNPdidsTable <- table(ihl$SNPdids[names(ihl$SNPdids) %in% F1])
-  P1combs <- getParentalCombs(parsamp=P1, ihl=ihl, useIH=TRUE)
+  P1combs <- getParentalCombs(parsamp=P[1], ihl=ihl, useIH=TRUE)
-  P2combs <- getParentalCombs(parsamp=P2, ihl=ihl, useIH=TRUE)
+  P2combs <- getParentalCombs(parsamp=P[2], ihl=ihl, useIH=TRUE)
  message <- ""
  if (ncol(P1combs) * ncol(P2combs) > maxparcombs) {
    message <- paste("more than", maxparcombs, "parental combinations; skipped,",
@@ -946,15 +1153,17 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv, ploidy, haploblockname,
      had[, colnames(had) %in% SNPdidind] <- SNPdidcomb
    }
  }
-  other <- setdiff(indiv, c(P1, P2, F1))
+  other <- setdiff(indiv, c(do.call(c, F1), PF1)) #all except F1s and F1 parents
-  parhaplo <- which(rowSums(parcomb) > 0)
+  if (length(other) > 0) {
-  iho <- inferHaplotypes(SNPdosages=dosmat, indiv=other, ploidy=ploidy,
+    parhaplo <- which(rowSums(parcomb) > 0)
-                         minfrac=minfrac, sel_hap=parhaplo)
+    iho <- inferHaplotypes(SNPdosages=dosmat, indiv=other, ploidy=ploidy,
-  for (oSNPdid in names(iho$SNPdidsTable)) {
+                           minfrac=minfrac, sel_hap=parhaplo)
-    ocombs <- iho$hclist[[oSNPdid]]
+    for (oSNPdid in names(iho$SNPdidsTable)) {
-    if (ncol(ocombs) == 1) {
+      ocombs <- iho$hclist[[oSNPdid]]
-      oSNPdidind <- names(iho$SNPdids)[iho$SNPdids == oSNPdid]
+      if (ncol(ocombs) == 1) {
-      had[, colnames(had) %in% oSNPdidind] <- ocombs
+        oSNPdidind <- names(iho$SNPdids)[iho$SNPdids == oSNPdid]
+        had[, colnames(had) %in% oSNPdidind] <- ocombs
+      }
    }
  }
  #for all indiv with a unique combination of haplotypes we now have
@@ -976,19 +1185,94 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv, ploidy, haploblockname,
 # - Does the F1 have an advantage for scoring the other samples?
 #   Compare this for all contigs
+#'@title get consensus SNP dosages from one or more samples
+#'@description get consensus SNP dosages from one or more samples
+#'@usage getConsensusSNPdosages(dosmat, indiv, solveConflicts=TRUE)
+#'@param dosmat a dosage matrix with SNPs in rows and individuals in columns.
+#'row names are SNP names, column names are individual names.
+#'@param indiv character vector with the names of the samples from which to
+#'obtain consensus scores
+#'@param solveConflicts if TRUE (default) and there are conflicting dosage
+#'assignments between the samples for the same SNP, the one with highest
+#'frequency is used. If there are more dosages with the maximum frequency or if
+#'solveConflicts is FALSE and there are conflicting dosages, the consensus for
+#'that SNP will be NA
+#'@return a vector with one consensus dosage for each row of dosmat, and the
+#'row names of dosmat as names
+#'@details a consensus dosage is calculated if at least one of the dosages is
+#'not NA, and if all non-NA dosages are equal. Any SNPs for which this is not
+#'the case will have NA as consensus dosage.
+getConsensusSNPdosages <- function(dosmat, indiv, solveConflicts=TRUE) {
+  if (is.vector(dosmat)) dosmat <- matrix(dosmat, ncol=1)
+  if (!all(indiv %in% colnames(dosmat)))
+    stop("not all indiv in dosmat")
+  dosmat <- dosmat[,colnames(dosmat) %in% indiv, drop=FALSE]
+  suppressWarnings({ #suppress warnings about all dosages NA
+    mindos <- apply(dosmat, MARGIN=1, FUN=min, na.rm=TRUE)
+    maxdos <- apply(dosmat, MARGIN=1, FUN=max, na.rm=TRUE)
+  })
+  if (solveConflicts) {
+    for (i in which(mindos != maxdos)) {
+      tb <- table(dosmat[i,]) # length 0 if all NA
+      suppressWarnings({ maxtb <- which(tb == max(tb)) })
+      if (length(maxtb) == 1) {
+        mindos[i] <- as.integer(names(tb)[maxtb])
+      } else mindos[i] <- NA
+    }
+  } else {
+    #not solve conflicts, set any SNP with conflicting dosages to NA:
+    mindos[mindos != maxdos] <- NA #includes rows with all NA because -Inf != Inf
+  }
+  names(mindos) <- row.names(dosmat)
+  mindos
+} #getConsensusSNPdosages
+#'@title check all parental sample sets
+#'@description check that for P1 and P2 the correct number of sample sets are
+#'present, that these contain no duplicates and that that they overlap
+#'completely or not at all
+#'@usage checkAllParentSets(P, npop, indiv)
+#'@param P a list of 2 elements, each of which is a list of character vectors
+#'containing the sample names for parents 1 and parents 2
+#'@param npop the number of F1 populations
+#'@param indiv a character vector with all sample names, the parental samples
+#'should all be in indiv
+#'@return an error message, "" if all is well
+checkAllParentSets <- function(P, npop, indiv) {
+  if (length(P) !=2 || length(P[[1]] != npop) || length(P[[2]]) != npop)
+    return("for each F1 population there should be one set of samples for each parent")
+  # check that no parental set has duplicate samples and that all samples are in indiv:
+  P <- c(P[[1]], P[[2]])
+  for (i in seq_len(2*npop)) {
+    if (length(P[[i]]) != length(unique(P[[i]])))
+      return("some parental sample sets contain duplicate samples")
+    if (!all(P[[i]] %in% indiv))
+      stop("Some parental samples are missing from indiv")
+  }
+  #check that if two parents are the same, all their samples are the same:
+  for (i in 1:(length(P)-1)) for (j in (i+1):length(P)) {
+    if (length(intersect(P[[i]], P[[j]])) > 0) {
+      if (!setequal(P[[i]], P[[j]]))
+        return("some parental sample sets overlap partially")
+    }
+  }
+  return("")
+} #checkAllParentSets
 #'@title get the most likely haplotype combinations for a (parental) individual
 #'@description get the most likely haplotype combinations for a (parental)
 #'individual based on all its samples
 #'@usage getParentalCombs(parsamp, ihl, useIH)
 #'@param parsamp character vector with all sample names for the individual
-#'@param ihl a list as returned by inferHaplotypes
+#'@param ihl a list as returned by inferHaplotypes, of which SNPdid, hclist and
+#'allhap are used
 #'@param useIH TRUE if only the inferred haplotype combinations must be
 #'returned, FALSE for all possible haplotype combinations
 #'@return a matrix with one row for each haplotype and one column for each
 #'of the most likely combinations of haplotypes, with each element being
 #'the haplotype dosage in that combination
 getParentalCombs <- function(parsamp, ihl, useIH) {
+  #TODO: see if we can fill in missing SNP dosages by combining the samples
  #which SNPdids occur over all samples in parsamp?
  parSNPdidsTable <- table(ihl$SNPdids[names(ihl$SNPdids) %in% parsamp])