From 73160ccf13160493b37f02192e312d23ae1e03ad Mon Sep 17 00:00:00 2001 From: Roeland Voorrips <roeland.voorrips@wur.nl> Date: Fri, 21 Apr 2017 17:14:47 +0200 Subject: [PATCH] new untested haplotypeDosagesF1 for multiple F1s. Issues with ahc(complete)list in check package --- R/PolyHaplotyper.R | 469 ++++++++++++++++++++++++++++++--------------- 1 file changed, 311 insertions(+), 158 deletions(-) diff --git a/R/PolyHaplotyper.R b/R/PolyHaplotyper.R index cf9eede..8479f50 100644 --- a/R/PolyHaplotyper.R +++ b/R/PolyHaplotyper.R @@ -70,7 +70,7 @@ utils::globalVariables(names=c("ahcploidy", "ahclist", "ahcunsaved"), #'@usage allHaplotypes(SNPnames) #'@param SNPnames the names of the SNPs (or other bi-allelic markers) in the #'haploblock (contig) -#'@return a matrix with SNPs in columns and all possible (2^n) haplotypes +#'@return a matrix with SNPs in columns and all possible (2^nSNP) haplotypes #'in rows (1: haplotype contains the dosage-counted SNP allele, 0: haplotype #'contains the other SNP allele). The colnames are the SNPnames. allHaplotypes <- function(SNPnames) { @@ -79,13 +79,14 @@ allHaplotypes <- function(SNPnames) { lst[[s]] <- 0:1 } eg <- as.matrix(expand.grid(lst, stringsAsFactors=FALSE)) - eg <- eg[,ncol(eg):1] + eg <- eg[, ncol(eg):1, drop=FALSE] colnames(eg) <- SNPnames eg } #allHaplotypes loadAllHaploCombList <- function(ploidy) { - # Load ahclist into GlobalEnv if needed. + # Function is called only by allHaploComb. + # Load ahclist and ahccompletelist into GlobalEnv if needed. # Checks variable ahcploidy in GlobalEnv. If that doesn't exist or # is not equal to ploidy, attempts to load ahclist into the GlobalEnv # from a file 'ahclist_nx.RData' in the current working directory (where n is @@ -94,23 +95,37 @@ loadAllHaploCombList <- function(ploidy) { # Return value is TRUE if the ahclist was already available or is loaded # succesfully, else FALSE. listloaded <- FALSE - if (exists("ahcploidy", envir=.GlobalEnv, inherits==FALSE) && - exists("ahclist", envir=.GlobalEnv, inherits==FALSE)) { - listloaded <- ahcploidy == ploidy - if (!listloaded && exists("ahcunsaved", envir=.GlobalEnv, inherits==FALSE) - && ahcunsaved) { + if (!exists("ahcinfo", envir=.GlobalEnv, inherits==FALSE)) { + newinfo <- list() + newinfo$ploidy <- 0 #to ensure loading a new ahclist + newinfo$unsaved <- FALSE + newinfo$complete_nSNP <- 0 + assign("ahcinfo", value=newinfo, envir=.GlobalEnv, inherits=FALSE) + } + if (ahcinfo$ploidy != ploidy) { + #save any unsaved ahclist: + if (ahcinfo$unsaved) { #the currently loaded ahclist is for a different ploidy and unsaved; #we save it before loading the ahclist for the current ploidy: - save(ahclist, file=paste0("ahclist_", ahcploidy,"x.RData")) + save(ahclist, file=paste0("ahclist_", ahcinfo$ploidy,"x.RData")) } + #load the ahccompletelist: + x <- tryCatch(load(paste0("ahccompletelist_", ploidy,"x.RData"), + envir=.GlobalEnv), + error=function(e) "error") + if (identical(x, "error")) { + ahcinfo$complete_nSNP <- 0 + } else ahcinfo$complete_nSNP <- length(ahccompletelist) + #load the ahclist: + x <- tryCatch(load(paste0("ahclist_", ploidy,"x.RData"), + envir=.GlobalEnv), + error=function(e) "error") + if (identical(x, "error")) + assign("ahclist", value=list(), envir=.GlobalEnv, inherits=FALSE) + #update ahcinfo: + ahcinfo$ploidy <- ploidy + ahcinfo$unsaved <- FALSE } - if (!listloaded && file.exists(paste0("ahclist_", ploidy, "x.RData"))) { - load(paste0("ahclist_", ploidy,"x.RData"), envir=.GlobalEnv) - assign("ahcploidy", value=ploidy, envir=.GlobalEnv, inherits=FALSE) - assign("ahcunsaved", value=FALSE, envir=.GlobalEnv, inherits=FALSE) - listloaded <- TRUE - } - listloaded } #loadAllHaploCombList #'@title get all haplotype combinations that result in the given SNP dosages @@ -136,8 +151,13 @@ loadAllHaploCombList <- function(ploidy) { #'@param progress whether to print a message when starting calculations #'for a did; default FALSE #'@details Each column of the return value represents one combination of -#'haplotypes that yieds the observed SNP dosages. If any of the SNP dosages are -#'NA or not in 0:ploidy, a matrix with 0 columns is returned +#'haplotypes that yields the observed SNP dosages. If any of the SNP dosages are +#'NA or not in 0:ploidy, a matrix with 0 columns is returned.\cr +#'This function makes use of precalculated lists that are read from files +#'in the current working directory: ahccompletelist_nx.RData and +#'ahclist_nx.RData (where n is the ploidy). These lists ahccompletelist and +#'ahclist are stored in the GlobalEnv, as well as another list ahcinfo. Liust +#'ahcinfo may be extended with newly calculated results. #'@return an integer matrix with one row per haplotype (corresponding to allhap) #'and one column per combination of haplotypes, with the number of copies #'of each haplotype in each combination (the dosages of the haplotypes) @@ -161,25 +181,27 @@ allHaploComb <- function(SNPdosages=NULL, did=NULL, allhap, ploidy, } else did <- SNPdidfun(SNPdosages, ploidy=ploidy) } - #we use ahclist in .GlobalEnv: - listloaded <- loadAllHaploCombList(ploidy) - didix <- integer(0) + loadAllHaploCombList(ploidy) #does nothing if list already in GlobalEnv + # first check if this nSNP is in ahccompletelist: + if (ahcinfo$complete_nSNP >= nSNP) { + # in the ahccompletelist, for each nSNP the entire list is precalculated + # by completeAllHaploComb, and the elements are directly indexed by the did: + return(ahccompletelist[[nSNP]][[did]]) + } + # else look it up in ahclist, and if needed calculate it: if (listloaded && length(ahclist) >= nSNP) didix <- which(names(ahclist[[nSNP]]) == did) if (length(didix) == 1) { #the haplotype combinations for this SNPdid were already calculated earlier: return(ahclist[[nSNP]][[didix]]) } - - #now ahclist is not loaded because the file did not exist yet, - #or it is loaded but the required element is not yet computed. - #we create or copy the list to a local tmplist, modify that, + # did was not present in ahclist so we add it: + #we copy the list to a local tmplist, modify that, #store it in .GlobalEnv and save it, and return the matrix with haplotype #combinations. if (progress) cat(paste0("allHaploComb: calculation for SNPdid=", did, "\n")) - if (listloaded) tmplist <- ahclist else tmplist <- list() + tmplist <- ahclist if (length(tmplist) < nSNP) tmplist[[nSNP]] <- list() - #calculate new matrix with haplotype combinations if (missing(SNPdosages)) SNPdosages <- dosFromSNPdid(did, nSNP, ploidy) allmat1 <- matrix(NA_integer_, nrow=ploidy, ncol=0) #matrix with haplotype sets in columns @@ -223,7 +245,7 @@ allHaploComb <- function(SNPdosages=NULL, did=NULL, allhap, ploidy, assign("ahclist", value=tmplist, envir=.GlobalEnv, inherits=FALSE) if (writeFile) { save(ahclist, file=paste0("ahclist_", ploidy,"x.RData")) - } else assign("ahcunsaved", value=TRUE, envir=.GlobalEnv, inherits=FALSE) + } else ahcinfo$ahcunsaved <- TRUE allmat } #allHaploComb @@ -293,7 +315,7 @@ completeAllHaploComb <- function(ploidy, nSNP, savesec=1800, printsec=60, nhap <- nrow(allhap) allset <- rep(nhap, ploidy) #current haplotype set: a full set has length ploidy while(TRUE) { - SNPdos <- colSums(allhap[allset,]) + SNPdos <- colSums(allhap[allset,, drop=FALSE]) did <- SNPdidfun(SNPdos, ploidy=ploidy) lasthap[did] <- lasthap[did] + 1 if ((nc<-ncol(ahc[[did]])) < lasthap[did]) { @@ -362,7 +384,7 @@ haplocomb2SNPdosages <- function(haplocomb, allhap) { if (is.vector(haplocomb)) haplocomb <- as.matrix(haplocomb) if (nrow(haplocomb) != nrow(allhap)) stop("haplocomb2SNPdosages: input error") res <- t(t(haplocomb) %*% allhap) - colnames(res) <- SNPdid(res, ploidy=sum(haplocomb[,1]), check=FALSE) + colnames(res) <- SNPdid(res, ploidy=sum(haplocomb[, 1]), check=FALSE) res } #haplocomb2SNPdosages @@ -541,7 +563,7 @@ dosFromSNPdid <- function(dosageIDs, nSNP, ploidy) { #'by printed messages #'@details The returned list contains some general calculations and statistics #'based on SNPdosages and ploidy, and some parts that are the result of the -#'inference. The primary of these if hclist: this contains for each SNPdid in +#'inference. The primary of these is hclist: this contains for each SNPdid in #'the population the most likely combination of haplotypes (sometimes #'more than one, if several are equally likely = all have the maximum dosage #'of haplotypes inferred to be present).\cr @@ -559,7 +581,7 @@ dosFromSNPdid <- function(dosageIDs, nSNP, ploidy) { #'those extra haplotypes as must-be-present, provided they occur in at least #'a fraction minfrac[2] of all individuals (minimum 2 individuals). If that #'leads to more SNPdids having a unique haplotype combination AND not having -#'other SNPdids getting more combination) the new assignments are used.\cr +#'other SNPdids getting more combination the new assignments are used.\cr #'#'If sel_hap in the function call already contains haplotypes, these are #'considered to be certainly present, even if they don't meet the minfrac #'criterion. This may help to reduce the number of haplotype configurations @@ -583,9 +605,6 @@ dosFromSNPdid <- function(dosageIDs, nSNP, ploidy) { #'in SNPdidsTable, and contains a matrix with one column for each of the #'remaining haplotype combinations: (a subset of the columns of) the matrix #'returned by allHaploComb for that SNPdid} -# not any more: \item{haploLost: (should be integer(0); a vector of haplotypes that -#were inferred to be present in the penultimate cycle but were not inferred -#to be present in the last cycle} #'} #'@export inferHaplotypes <- function(SNPdosages, indiv, ploidy, minfrac=c(0.1, 0.01), @@ -900,6 +919,8 @@ haplotypeDosages <- function(haploblockname, ihl, haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname, P1, P2, F1, minfrac=c(0.1, 0.01), F1frac=0.05, dropUnused=TRUE, maxparcombs=150000) { + # All next lines to be moved to initial check function, + # called once for all hploblocks: dosmat <- checkSNPdosages(SNPdosages, indiv, ploidy) if (is.character(dosmat)) stop(dosmat) if (!is.list(F1)) F1 <- list(F1) @@ -916,7 +937,7 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname, #PF1 will contain one sample name per population for each parent; #dosmat will be changed so that all other parental samples are deleted and #the sample in PF1 has the consensus SNP dosages per parent - for (p in 1:2) { + for (p in 1:2) { # p is parent (1 or 2) for (pop in seq_along(F1)) { #it can be that this same parent has already been checked for an earlier #population and then only one sample remains, with the consensus dosage: @@ -930,26 +951,221 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname, } PF1[pop, p] <- P[[p]][[pop]][1] } - } - indiv <- colnames(dosmat) - # now PF1 has the names of the first samples of P1 and P2 for each pop, + } # for p in 1:2 + #order the F1 populations by decreasing size: + F1sizes <- sapply(F1, length) + o <- order(F1sizes, decreasing=TRUE) + F1 <- F1[o] + P[[1]] <- P[[1]][o] + P[[2]] <- P[[2]][o] + PF1 <- PF1[o,] + F1sizes <- F1sizes[o] + #TODO: move the calculation of parental consensus scores and ordering of + #F1 populations to a separate function where it will be done for all + #markers at once (assuming the nr of missing data does not vary too much) + # now PF1 has the names of the one remaining sample of P1 and P2 for each F1 pop, # and dosmat has for each parent only one column with the consensus SNP dosages, # with as colnames the sample names in PF1 - # Note that if the same parent is used multiple times all samples should - # be given each time (order may be different) + # Note that if the same parent is used in multiple F1 population the same set + # of samples should be given each time (order may be different) + + #HERE STARTS THE CODE FOR THE CURRENT HAPLOBLOCK + #we have got the dosmat and the (re-ordered) P and F1 lists, + #NOT the PF1 and F1sizes + + indiv <- colnames(dosmat); nind <- length(indiv) + #calculate ihl without using F1 info: ihl <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy, minfrac=minfrac[1]) #without final inference + nhap <- nrow(ihl$allhap) + F1sizes <- integer(length(F1)) + for (f in seq_along(F1)) + F1sizes[f] <- sum(!is.na(ihl$did[names(ihl$did %in% F1[[f]])])) + largeF1s <- F1sizes > 6 * ploidy + + had <- matrix(NA_integer_, nrow=nrow(ihl$allhap), ncol=nind) + #had is matrix of haplotype dosages + colnames(had) <- indiv + rownames(had) <- paste0(haploblockname, padded(1:nrow(ihl$allhap))) + F1sDone <- !largeF1s; changes <- TRUE + oldhap <- which(ihl$nPresent >= minfrac[1] * nind) + while (!all(F1sDone) && changes) { + #continue until all large F1s done or none of the remaining ones + #can be done + changes <- FALSE + for (f in seq_along(F1)) if (!F1sDone[f]) { + # check the current F1 using the current set of known haplotypes + # if (success) { + # F1sDone[f] <- TRUE + # if (new haplotypes added) changes <- TRUE + ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy, + minfrac=minfrac[1], sel_hap=oldhap) + parents <- c(P[[1]][f], P[[2]][f]) + sof <- solveOneF1(dosmat=dosmat, ihl=ih, F1=F1[f], + P=parents, Phad=had[, parents], maxparcombs=maxparcombs, + ploidy=ploidy, minfrac=minfrac, F1frac=F1frac) + if (nrow(sof$parcombok) == 1) { + pSOF <- processSOF(sof, had, oldhap, changes, F1=F1[f], P=parents, + ihl=ih) + had <- pSOF$had; oldhap <- pSOF$oldhap; changes <- pSOF$changes + F1sDone[f] <- TRUE + } + } + } + + F1sDone[!largeF1s & F1sizes > 0] <- FALSE; changes <- TRUE + while (!all(F1sDone) && changes) { + #continue until all F1s (including the large F1s) done or none of the + #remaining ones can be done; exactly as for largeF1s + changes <- FALSE + for (f in seq_along(F1)) if (!F1sDone[f]) { + # check the current F1 using the current set of known haplotypes + # if (success) { + # F1sDone[f] <- TRUE + # if (new haplotypes added) changes <- TRUE + ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy, + minfrac=minfrac[1], sel_hap=oldhap) + parents <- c(P[[1]][f], P[[2]][f]) + sof <- solveOneF1(dosmat=dosmat, ihl=ih, F1-F1[f], + P=parents, Phad=had[, parents], maxparcombs=maxparcombs, + ploidy=ploidy, minfrac=minfrac, F1frac=F1frac) + if (nrow(sof$parcombok) == 1) { + pSOF <- processSOF(sof, had, oldhap, changes, F1=F1[f], P=parents, + ihl=ih) + had <- pSOF$had; oldhap <- pSOF$oldhap; changes <- pSOF$changes + F1sDone[f] <- TRUE + } + } + } + + #oldhap now contains all haplotype nrs occurring in any of the + #parents of F1's that have been solved + + #inferHaplotypes for all non-solved F1s and their parents, and panel + rest <- setdiff(indiv, c(do.call(c, F1[F1sDone]), + do.call(c, P[[1]][F1sDone]), + do.call(c, P[[2]][F1sDone]))) + rest_ihl <- inferHaplotypes(SNPdosages=dosmat, indiv=rest, ploidy=ploidy, + minfrac=minfrac, sel_hap=oldhap) + for (rSNPdid in names(rest_ihl$SNPdidsTable)) { + rcombs <- rest_ihl$hclist[[rSNPdid]] + if (ncol(rcombs) == 1) { + rSNPdidind <- names(rest_ihl$SNPdids)[rest_ihl$SNPdids == rSNPdid] + had[, colnames(had) %in% rSNPdidind] <- rcombs + } + } + #for all indiv with a unique combination of haplotypes we now have + #stored the solution (haplotype combination) in had + if (dropUnused) { + had <- had[rowSums(had, na.rm=TRUE) > 0, , drop=FALSE] + #has 0 rows if no indiv has a unique solution + } + list(message="", hapdos=had) + + # (if rest_ihl with two minfrac values results in additional haplotypes, + # see if these can help solve the remaining F1s? And then re-iterate? + # For later!) + + #Done! +} #haplotypeDosagesF1 + +processSOF <- function(sof, had, oldhap, changes, F1, P, ihl) { + #used only by haplotypeDosagesF1 + #sof = list returned by solveOneF1 + #had = matrix of haplotype dosages + #oldhap = vector of haplotype nrs known to be present + #changes indicates if the set of oldhap was modified in the current iteration + + #assign hads for P and F1: + parhapcomb <- cbind(sof[[2]][, sof$parcombok[1,]], + sof[[3]][, sof$parcombok[2,]]) + had[, colnames(had) == P[1]] <- sof$P1combs[,parhapcomb[1,]] + had[, colnames(had) == P[2]] <- sof$P2combs[,parhapcomb[2,]] + F1combs <- getF1combs(parhapcomb) + F1SNPdos <- haplocomb2SNPdosages(F1combs, allhap=ihl$allhap) + F1SNPdids <- colnames(F1SNPdos) #SNPdids may not be unique, + # several haplotype combs may give the same SNP dosages + for (f1SNPdid in unique(F1SNPdids)) { + if (sum(F1SNPdids == f1SNPdid) == 1) { + #f1SNPdid unique, only one haplotype comb in F1 matches f1SNPdid + SNPdidcomb <- F1combs[, F1SNPdids == f1SNPdid, drop=FALSE] + SNPdidind <- names(ihl$SNPdids[ihl$SNPdids == f1SNPdid]) #all indiv with this f1SNPdid + SNPdidind <- intersect(F1, SNPdidind) + had[, colnames(had) %in% SNPdidind] <- SNPdidcomb + } + } + #check if parhapcomb contains new haplotypes: + parhap <- which(rowSums(parhapcomb) > 0) + m <- match(parhap, oldhap) + if (anyNA(m)) { + #new haplotypes found in parents that earlier were not marked as present + changes <- changes || anyNA(m) + oldhap <- sort(union(oldhap, parhap)) + } + list(had=had, oldhap=oldhap, changes=changes) +} #processSOF + - #First:analyse F1 with parents: +#'@title find the optimal combination of parental haplotype combinations +#'@description For one F1 population and one haploblock, find the optimal +#'combination of parental haplotype combinations +#'@usage solveOneF1(dosmat, ihl, F1, P, Phad, maxparcombs, ploidy, +#'minfrac, F1frac) +#'@param dosmat dosmat with only the SNPs in the current haploblock, and with +#'only one column for each F1 parent, containing its consensus dosages; +#'otherwise all individuals of interest within and outside the F1 are retained +#'@param ihl a list returned by inferHaplotypes +#'@param F1 character vector with sample names of the current F1 +#'@param P vector with 2 sample names, one for parent1 and one for parent2. +#'The corresponding columns in dosmat should have pre-calculated consensus +#'SNP genotypes for both parents, and orig_ihl should be based on that +#'dosmat +#'@param Phad matrix with 2 columns, with the haplotype dosages assigned +#'to the parents. Usually both contain only NA_integer_s, but if they are +#'assigned we consider only that had +#'@param maxparcombs Parent 1 and 2 both may have multiple possible haplotype +#'combinations. For each pair of haplotype combinations (one from P1 and one +#'from P2) the expected F1 segregation must be checked against the observed. +#'This may take a long time if many such combinations need to be checked. +#'This parameter sets a limit to the number of allowed combinations; default +#'150000 takes about 45 min. +#'@param ploidy all SNP dosages should be in 0:ploidy or NA +#'@param minfrac vector of two fractions, default 0.1 and 0.01. A haplotype is +#'considered to be certainly present if it occurs in at least a fraction +#'minfrac[1] of all individuals; in the final stage for the "other" +#'individuals (those that do not belong to the F1 or its parents) this fraction +#'is lowered to minfrac[2]; see also inferHaplotypes +#'@param F1frac the fraction of F1 individuals for which it is accepted +#'that they are not explained by a parental haplotype combination; default 0.05 +#'(allowing for a few percent of Double Reduction) +#'@return a list with a message, two matrices P1combs and P2combs with the +#'haplotype combinations that were considered for each parent +#'and a matrix parcombok with 0 or 1 rows and two columns: 0 rows if no +#'solution was found, 1 row indexing the columns of P1combs and P2combs +#'if a solution was found. +solveOneF1 <- function(dosmat, ihl, F1, P, Phad, maxparcombs, ploidy, minfrac, + F1frac) { + #TODO: If no solution found and Phad not NA's for both parents, + #it may be that the had(s) assigned based on a previous F1 is not correct. + #In that case ignore the Phad and if that succeeds, go back to previous F1 + #with new had for that parent - calcParcombok <- function(P1combs, P2combs, oldP1combs, oldP2combs) { + calcParcombok <- function(P1combs, P2combs, oldP1combs, oldP2combs, ihl, + F1frac) { + # Function within solveOneF1 + # Calculates the combinations of parental haplotype combinations + # that can explain (almost) all F1 individuals + # P1combs, P2combs: matrices with haplotype combinations for P1 and P2 + # oldP1combs, oldP2combs: matrices with parental haplotypes combs that + # were already calculated and whose P1-P2 combinations were no solutions + # (may have 0 columns if none) #TODO: parallellize; see also allHaploComb P1matchold <- match(as.data.frame(P1combs), as.data.frame(oldP1combs)) P2matchold <- match(as.data.frame(P2combs), as.data.frame(oldP2combs)) # NOTE that match matches vector elements, also when the vector is a # list (or data.frame) if (!anyNA(P1matchold) && !anyNA(P2matchold)) { - #no new parental combinations + #no new parental combinations, so no solutions parcombok <- matrix(integer(0), ncol=2) } else { missedF1s <- matrix(NA_integer_, nrow=ncol(P1combs), ncol=ncol(P2combs)) @@ -983,93 +1199,74 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname, # indices to P1combs and P2combs } list(P1combs=P1combs, P2combs=P2combs, parcombok=parcombok) - } #calcParcombok within haplotypeDosagesF1 - - for (pop in seq_along(F1)) { - #todo: analysis per F1; in which order: - #large to small F1, or small to large nr of parental combs, or ...? - #among F1's larger than (eg) 6 * ploidy: order by increasing nr - #of parental combinations based on initial inferHaplotypes; - #- when no solution found: calculate nr of parental combs based on all - # possibilities and reorder pops - #- when solution found: re-do inferHaplotypes with the known parental - # haplotypes present and re-order the remaining pops that have not been - # tried yet on the new nr of parental combs - #- after all larger pops have been done, do the small F1's in decreasing - # order of size (first based on inferHaplotypes, then if needed based - # on all possible combs) - #- after the last F1, a last inferHaplotypes with all parental combs given - # to score the "other" samples - } - - F1dat <- data.frame( - size = integer(length(F1)), #fixed: nr of F1 indiv with non-NA SNPdid - parcombcount = integer(length(F1)), #will change several times - nextcombs = rep(1, length(F1)), #next to try: - # 1= parcombs based on last inferHaplotypes, - # 2= all possible parcombs except tried earlier, - # 3= nothing (ready), - solved = rep(FALSE, length(F1)) - ) - - for (f in seq_along(F1)) { - dids <- ihl$SNPdids[names(ihl$SNPdids) %in% F1[[f]]] - F1dat$size[f] <- sum(is.na(dids)) - } - largeF1s <- which(F1$size > 6 * ploidy) + } #calcParcombok within solveOneF1 + nhap <- nrow(ihl$allhap) + PhadNA <- is.na(Phad[1,]) #all or none of the values in a Phad column are NA F1SNPdidsTable <- table(ihl$SNPdids[names(ihl$SNPdids) %in% F1]) - P1combs <- getParentalCombs(parsamp=P[1], ihl=ihl, useIH=TRUE) - P2combs <- getParentalCombs(parsamp=P[2], ihl=ihl, useIH=TRUE) + if (PhadNA[1]) { + P1combs <- getParentalCombs(parsamp=P[1], ihl=ihl, useIH=TRUE) + } else P1combs <- Phad[, 1] + if (PhadNA[2]) { + P2combs <- getParentalCombs(parsamp=P[2], ihl=ihl, useIH=TRUE) + } else P2combs <- Phad[, 2] message <- "" - if (ncol(P1combs) * ncol(P2combs) > maxparcombs) { - message <- paste("more than", maxparcombs, "parental combinations; skipped,", - "result based on inferHaplotypes") + lst <- NULL + firstcombs <- ncol(P1combs) * ncol(P2combs) + if (firstcombs == 0 || firstcombs > maxparcombs) { + # too many potential combinations (it takes about 45 min to check + # 150000 combinations) + message <- paste("more than", maxparcombs, "parental combinations; skipped") parcombok <- matrix(integer(0), ncol=2) } else { origP1combs <- P1combs; origP2combs <- P2combs lst <- calcParcombok(P1combs=origP1combs, P2combs=origP2combs, oldP1combs=matrix(integer(0), nrow=nrow(origP1combs)), - oldP2combs=matrix(integer(0), nrow=nrow(origP2combs))) + oldP2combs=matrix(integer(0), nrow=nrow(origP2combs)), + ihl=ihl, F1frac=F1frac) + # note that in this first try the oldPcombs have 0 columns as there has not + # yet been an earlier calculation if (nrow(lst$parcombok) == 0) { #No parental combinations fit. #1. See how many parental combinations are possible without first # inferHaplotypes (i.e. with allHaploComb). - # If p1*p2 < 150000 (ca 45 min) try them all. - nwP1combs <- getParentalCombs(parsamp=P1, ihl=ihl, useIH=FALSE) - nwP2combs <- getParentalCombs(parsamp=P2, ihl=ihl, useIH=FALSE) - if (ncol(nwP1combs) * ncol(nwP2combs) > maxparcombs) { + # If p1*p2 < maxparcombs we try them all. + if (PhadNA[1]) { + nwP1combs <- getParentalCombs(parsamp=P[1], ihl=ihl, useIH=FALSE) + } else nwP1combs <- Phad[, 1] + if (PhadNA[2]) { + nwP2combs <- getParentalCombs(parsamp=P[2], ihl=ihl, useIH=FALSE) + } else nwP2combs <- Phad[, 2] + if ((ncol(nwP1combs) * ncol(nwP2combs)) - firstcombs > maxparcombs) { message <- paste("more than", maxparcombs, - "extra parental combinations; skipped,", - "result based on inferHaplotypes") + "extra parental combinations; skipped") } else { lst <- calcParcombok(P1combs=nwP1combs, P2combs=nwP2combs, - oldP1combs=origP1combs, oldP2combs=origP2combs) + oldP1combs=origP1combs, oldP2combs=origP2combs, + ihl=ihl, F1frac=F1frac) if (nrow(lst$parcombok) == 0) { #2. If 1. does not find a solution due to wrong parental SNP data or # because it was skipped because of too many combinations, # just return the inferHaplotypes + haplotypeDosages result, # with a message. - message <- "no valid F1 solutions found; result based on inferHaplotypes" + message <- "no valid F1 solutions found" } } } } if (message != "") { - ihl <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy, - minfrac=minfrac) #including final inference - had <- haplotypeDosages(haploblockname=haploblockname, ihl=ihl, - partial=TRUE, dropUnused=dropUnused) - return(list(message=message, hapdos=had)) + if (is.null(lst)) + lst <- list(P1combs=P1combs, P2combs=P2combs, + parcombok=matrix(integer(0), ncol=2)) + return(c(list(message=message), lst)) } # now we have one or more acceptable parental combinations P1combs <- lst$P1combs P2combs <- lst$P2combs parcombok <- lst$parcombok - #P1combs and P2combs only have haplotype combinations that may be ok, - #and parcombok indicates which combinations of these match (almost) all F1's. + #parcombok indicates which combinations of these match (almost) all F1's. #Now, any of these combinations could be the correct one. We check all of them #and use the best. So for each combination: # (1) get the total set of parental haplotypes: these are all known to be @@ -1084,6 +1281,7 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname, # (that are not certainly present); leave for now # - minhap: the minimum number of haplotypes needed to explain (almost) # all individuals (as small as possible); harder, leave for now + indiv <- colnames(dosmat) parcombstats <- data.frame(hAbsent = rep(NA_integer_, nrow(parcombok)), indSingle = rep(NA_integer_, nrow(parcombok)), indExtra = rep(NA_integer_, nrow(parcombok)), @@ -1093,8 +1291,8 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname, parcomb <- cbind(P1combs[,parcombok[pco, 1]], P2combs[,parcombok[pco, 2]]) parhaplo <- which(rowSums(parcomb) > 0) - ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv , ploidy=ploidy, - minfrac=minfrac[1], sel_hap=parhaplo) + ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy, + minfrac=minfrac[1], sel_hap=parhaplo) ihlist[[pco ]] <- ih parcombstats$hAbsent[pco] <- sum(ih$nAbsent > 0.9 * length(indiv)) #parcombstats$indSingle (nr of ind with a unique haplotype combi): @@ -1119,61 +1317,12 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname, best2 <- which.max(parcombstats$hAbsent[best]) best <- best[best2] } - - #finally: - #we know which pco (parental combination) is the best. - #haplotype dosages for parents are available from parcombok[best,] - #haplotype dosages for F1 can then be recalculated with getF1combs - #haplotype dosages for the other samples are recalculated using - # parcombok[best,] and both minfrac values - - had <- matrix(NA_integer_, nrow=nrow(ihl$allhap), ncol=length(indiv)) - #had means haplotype dosages - colnames(had) <- indiv - rownames(had) <- paste0(haploblockname, padded(1:nrow(ihl$allhap))) - parcomb <- cbind(P1combs[, parcombok[best, 1]], #selected haplotype comb of P1 - P2combs[, parcombok[best, 2]]) #and P2 - had[, colnames(had) %in% P1] <- parcomb[,1] - had[, colnames(had) %in% P2] <- parcomb[,2] - #F1: it may be that among the F1 some SNPdid have only one possible combination - #of haplotypes, while the same SNPdid in the other samples has more than - #one possible combination (and hence would be discarded). Or that a solution - #that works in unrelated material is incompatible with the F1 parents. - #So we do the F1 separately from the others: - F1combs <- getF1combs(parcomb) - F1SNPdos <- haplocomb2SNPdosages(F1combs, allhap=ihl$allhap) - F1SNPdids <- colnames(F1SNPdos) #SNPdids may not be unique, - # several haplotype combs may give the same SNP dosages - for (f1SNPdid in unique(F1SNPdids)) { - if (sum(F1SNPdids == f1SNPdid) == 1) { - #f1SNPdid unique, only one haplotype comb in F1 matches f1SNPdid - SNPdidcomb <- F1combs[, F1SNPdids == f1SNPdid, drop=FALSE] - SNPdidind <- names(ihl$SNPdids[ihl$SNPdids == f1SNPdid]) #all indiv with this f1SNPdid - SNPdidind <- intersect(F1, SNPdidind) - had[, colnames(had) %in% SNPdidind] <- SNPdidcomb - } - } - other <- setdiff(indiv, c(do.call(c, F1), PF1)) #all except F1s and F1 parents - if (length(other) > 0) { - parhaplo <- which(rowSums(parcomb) > 0) - iho <- inferHaplotypes(SNPdosages=dosmat, indiv=other, ploidy=ploidy, - minfrac=minfrac, sel_hap=parhaplo) - for (oSNPdid in names(iho$SNPdidsTable)) { - ocombs <- iho$hclist[[oSNPdid]] - if (ncol(ocombs) == 1) { - oSNPdidind <- names(iho$SNPdids)[iho$SNPdids == oSNPdid] - had[, colnames(had) %in% oSNPdidind] <- ocombs - } - } - } - #for all indiv with a unique combination of haplotypes we now have - #stored the solution (haplotype combination) in had - if (dropUnused) { - had <- had[rowSums(had, na.rm=TRUE) > 0, , drop=FALSE] - #has 0 rows if no indiv has a unique solution - } - list(message="", hapdos=had) -} #haplotypeDosagesF1 + return(list(c(message="", lst[1:2], parcombok=parcombok[best,, drop=FALSE]))) + #note: the calling routine can get the set of parental haplootypes by: + #if(nrow(result$parcombok) != 1) integer(0) else + # which(rowSums(cbind(result[[2]][,result$parcombok[1,]], + # result[[3]][,result$parcombok[2,]])) > 0) +} #solveOneF1 #TODO: # - impute missing SNP dosages (1) in F1 (2) in other? @@ -1194,7 +1343,8 @@ haplotypeDosagesF1 <- function(SNPdosages, indiv=NULL, ploidy, haploblockname, #'obtain consensus scores #'@param solveConflicts if TRUE (default) and there are conflicting dosage #'assignments between the samples for the same SNP, the one with highest -#'frequency is used. If there are more dosages with the maximum frequency or if +#'frequency is used, provided the number of other dosages is not more than 1 +#'or not more than 10% of the frequency of the maximum dosage. If #'solveConflicts is FALSE and there are conflicting dosages, the consensus for #'that SNP will be NA #'@return a vector with one consensus dosage for each row of dosmat, and the @@ -1216,7 +1366,10 @@ getConsensusSNPdosages <- function(dosmat, indiv, solveConflicts=TRUE) { tb <- table(dosmat[i,]) # length 0 if all NA suppressWarnings({ maxtb <- which(tb == max(tb)) }) if (length(maxtb) == 1) { - mindos[i] <- as.integer(names(tb)[maxtb]) + othercount <- sum (!is.na(dosmat[i,])) - tb[maxtb] + if (othercount > 1 && othercount > 0.1 * tb[maxtb]) { + mindos[i] <- NA + } else mindos[i] <- as.integer(names(tb)[maxtb]) } else mindos[i] <- NA } } else { -- GitLab