diff --git a/R/PolyHaplotyper.R b/R/PolyHaplotyper.R index 27c8d1be8fc9f2617deb243882f9f3a16171ab66..ef87e013d50facd2084025a44e904dfb709c14e9 100644 --- a/R/PolyHaplotyper.R +++ b/R/PolyHaplotyper.R @@ -961,56 +961,73 @@ haplotypeDosagesF1 <- function(dosmat, ploidy, haploblockname, #had is matrix of haplotype dosages colnames(had) <- indiv rownames(had) <- paste0(haploblockname, padded(1:nrow(ihl$allhap))) + F1sNoSolution <- rep(FALSE, length(F1)) F1sDone <- !largeF1s; changes <- TRUE oldhap <- which(ihl$nPresent >= minfrac[1] * nind) - while (!all(F1sDone) && changes) { + while (!all(F1sDone | F1sNoSolution) && changes) { #continue until all large F1s done or none of the remaining ones #can be done - changes <- FALSE - for (f in F1order) if (!F1sDone[f]) { - print(paste("large F1s: f =", f)) - # check the current F1 using the current set of known haplotypes - ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy, - minfrac=minfrac[1], sel_hap=oldhap) - sof <- solveOneF1(dosmat=dosmat, ihl=ih, F1=F1[[f]], P=PF1[f,], - Phad=had[, PF1[f,]], maxparcombs=maxparcombs, - ploidy=ploidy, minfrac=minfrac, F1frac=F1frac, - DRrate=DRrate) - browser() - if (nrow(sof$parcombok) == 1) { - pSOF <- processSOF(sof, had, oldhap, changes, F1=F1[[f]], P=PF1[f,], - ihl=ih) - had <- pSOF$had; oldhap <- pSOF$oldhap; changes <- pSOF$changes - F1sDone[f] <- TRUE + changes <- FALSE; fo <- 1 + while (fo < length(F1order) && + (F1sDone[F1order[fo]] || F1sNoSolution[F1order[fo]])) + fo <- fo + 1 + while (fo <= length(F1order) && !all(F1sDone | F1sNoSolution) && !changes) { + f <- F1order[fo] + if (!(F1sDone[f] || F1sNoSolution[f])) { + print(paste("large F1s: f =", f)) + # check the current F1 using the current set of known haplotypes + ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy, + minfrac=minfrac[1], sel_hap=oldhap) + sof <- solveOneF1(dosmat=dosmat, ihl=ih, F1=F1[[f]], P=PF1[f,], + Phad=had[, PF1[f,]], maxparcombs=maxparcombs, + ploidy=ploidy, minfrac=minfrac, F1frac=F1frac, + DRrate=DRrate) + F1sNoSolution[f] <- sof$NoSolution + if (nrow(sof$parcombok) == 1) { + pSOF <- processSOF(sof, had, oldhap, changes, F1=F1[[f]], P=PF1[f,], + ihl=ih) + had <- pSOF$had; oldhap <- pSOF$oldhap; changes <- pSOF$changes + F1sDone[f] <- TRUE + } } + fo <- fo + 1 } } firstsmallround <- TRUE F1sDone[!largeF1s & F1sizes > 0] <- FALSE; changes <- TRUE - while (!all(F1sDone) && changes) { - #continue until all F1s (including the large F1s) done or none of the - #remaining ones can be done; exactly as for largeF1s - changes <- FALSE - for (f in F1order) if (!F1sDone[f] && - !(firstsmallround && largeF1s[f])) { - print(paste("all F1s: f =", f)) - # check the current F1 using the current set of known haplotypes - ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy, - minfrac=minfrac[1], sel_hap=oldhap) - sof <- solveOneF1(dosmat=dosmat, ihl=ih, F1=F1[[f]], P=PF1[f,], - Phad=had[, PF1[f,]], maxparcombs=maxparcombs, - ploidy=ploidy, minfrac=minfrac, F1frac=F1frac, - DRrate=DRrate) - browser() - if (nrow(sof$parcombok) == 1) { - pSOF <- processSOF(sof, had, oldhap, changes, F1=F1[[f]], P=PF1[f,], - ihl=ih) - had <- pSOF$had; oldhap <- pSOF$oldhap; changes <- pSOF$changes - F1sDone[f] <- TRUE + while (!all(F1sDone | F1sNoSolution) && changes) { + #continue until all large F1s done or none of the remaining ones + #can be done + changes <- FALSE; fo <- 1 + while (fo < length(F1order) && firstsmallround && largeF1s[F1order[fo]]) + fo <- fo + 1 + while (fo < length(F1order) && + (F1sDone[F1order[fo]] || F1sNoSolution[F1order[fo]]) && + !(firstsmallround && largeF1s[F1order[fo]])) + fo <- fo + 1 + firstsmallround <- FALSE + while (fo <= length(F1order) && !all(F1sDone | F1sNoSolution) && !changes) { + f <- F1order[fo] + if (!(F1sDone[f] || F1sNoSolution[f])) { + print(paste("all F1s: f =", f)) + # check the current F1 using the current set of known haplotypes + ih <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy, + minfrac=minfrac[1], sel_hap=oldhap) + sof <- solveOneF1(dosmat=dosmat, ihl=ih, F1=F1[[f]], P=PF1[f,], + Phad=had[, PF1[f,]], maxparcombs=maxparcombs, + ploidy=ploidy, minfrac=minfrac, F1frac=F1frac, + DRrate=DRrate) + F1sNoSolution[f] <- sof$NoSolution + if (nrow(sof$parcombok) == 1) { + pSOF <- processSOF(sof, had, oldhap, changes, F1=F1[[f]], P=PF1[f,], + ihl=ih) + had <- pSOF$had; oldhap <- pSOF$oldhap; changes <- pSOF$changes + F1sDone[f] <- TRUE + } } + fo <- fo + 1 } - firstsmallround <- FALSE } #oldhap now contains all haplotype nrs occurring in any of the @@ -1061,25 +1078,37 @@ processSOF <- function(sof, had, oldhap, changes, F1, P, ihl) { #assign hads for F1: #first make a conversion table from SNPdid to hapcomb for this F1: uniqDid <- which(!sof$didNhapcomb$dupdid) - convmat <- matrix(NA_integer_, ncol=length(uniqDid), nrow=nrow(parhapcomb)) - colnames(convmat) <- sof$didNhapcomb$SNPdid[uniqDid] + #convmat <- matrix(NA_integer_, ncol=length(uniqDid), nrow=nrow(parhapcomb)) + #colnames(convmat) <- sof$didNhapcomb$SNPdid[uniqDid] uniqx <- c(uniqDid, length(sof$didNhapcomb$SNPdid)+1) for (d in seq_along(uniqDid)) { + didHaplocomb <- rep(NA_integer_, nrow(parhapcomb)) if (uniqx[d+1] - uniqx[d] == 1) { #only one hapcomb corresponds to SNPdid, assign: - convmat[, d] <- sof$didNhapcomb$hapcomb[uniqx[, d]] + #convmat[, d] <- sof$didNhapcomb$hapcomb[, uniqx[d]] + didHaplocomb <- sof$didNhapcomb$hapcomb[, uniqx[d]] } else { #multiple hapcomb correspond to same SNPdid. Assign only if one #hapcomb has a very high probability compared with the rest (to avoid #e.g. that DR solutions get in the way of assigning a good solution) didprobs <- sof$didNhapcomb$expHCfreq[uniqx[d]:(uniqx[d+1]-1)] didprobs <- didprobs / sum(didprobs) - print(paste("max didprobs =", max(didprobs))) - sel <- which(didprobs >= 0.95) + srtprobs <- sort(didprobs, decreasing=TRUE) + if (srtprobs[1] >= 0.8 && srtprobs[2] <= 0.05) + sel <- which(didprobs >= 0.80) else sel <- integer(0) + print(paste("srtprobs: len=", length(srtprobs), srtprobs[1], srtprobs[2], + "sel=", sel)) if (length(sel) == 1) { - convmat[, d] <- sof$didNhapcomb$hapcomb[uniqx[, d+sel-1]] + #convmat[, d] <- sof$didNhapcomb$hapcomb[uniqx[, d+sel-1]] + didHaplocomb <- sof$didNhapcomb$hapcomb[uniqx[d+sel-1]] } # else convmat[, d] keeps its NA_integer_ values } + if (!is.na(didHaplocomb[1])) { + did <- sof$didNhapcomb$SNPdid[uniqDid[d]] + SNPdidind <- names(ihl$SNPdids)[ihl$SNPdids == did] #names of all indiv with this did + SNPdidind <- intersect(F1, SNPdidind) + had[, colnames(had) %in% SNPdidind] <- didHaplocomb + } } # old version (before 20170523) # F1combs <- getF1combs(parhapcomb) @@ -1209,11 +1238,13 @@ solveOneF1 <- function(dosmat, ihl, F1, P, Phad, maxparcombs, ploidy, minfrac, list(P1combs=P1combs, P2combs=P2combs, parcombok=parcombok) } #calcParcombok within solveOneF1 + NoSolution <- FALSE F1SNPdidsTable <- table(ihl$SNPdids[names(ihl$SNPdids) %in% F1], useNA="no") F1count <- sum(F1SNPdidsTable) Pdids <- ihl$SNPdids[match(P, names(ihl$SNPdids))] - if (anyNA(Pdids) || F1count==0) { + if (anyNA(Pdids) || F1count == 0) { message <- ("missing SNP data in parents or no F1 data; skipped") + NoSolution <- TRUE } else { # FIRST STEP: find acceptable parental haplotype combinations # (meaning that they match the parental SNP genotypes, and can explain @@ -1232,7 +1263,6 @@ solveOneF1 <- function(dosmat, ihl, F1, P, Phad, maxparcombs, ploidy, minfrac, } nhap <- nrow(ihl$allhap) message <- "" - browser() firstcombs <- ncol(P1combs) * ncol(P2combs) if (firstcombs == 0) stop("firstcombs==0, should not happen!") if (firstcombs > maxparcombs) { @@ -1276,13 +1306,16 @@ solveOneF1 <- function(dosmat, ihl, F1, P, Phad, maxparcombs, ploidy, minfrac, # just return the inferHaplotypes + haplotypeDosages result, # with a message. message <- "no valid F1 solutions found" + NoSolutions<- TRUE #after analyzing all possible parental combinations } } } } } if (message != "") { - return(list(message=message, parcombok=matrix(integer(0), ncol=2))) + print(message) + return(list(message=message, parcombok=matrix(integer(0), ncol=2), + NoSolution=NoSolution)) } # SECOND STEP: find the best fitting of the acceptable parental haplotype @@ -1408,7 +1441,7 @@ solveOneF1 <- function(dosmat, ihl, F1, P, Phad, maxparcombs, ploidy, minfrac, sum(F1SNPdidsTable[!(names(F1SNPdidsTable) %in% expF1SNPdids)]) Perr <- 0.04 #assumed probability of dosage errors over all SNPs parcombstats$Pinvalid[pco] <- - pbinom(F1count-invalidF1count, F1count, prob=Perr) + pbinom(F1count-invalidF1count, F1count, prob=1-Perr) #P value from chiquare test among valid F1 SNPdids: if (length(uniqfreq) == 1) { @@ -1435,11 +1468,12 @@ solveOneF1 <- function(dosmat, ihl, F1, P, Phad, maxparcombs, ploidy, minfrac, PxP <- parcombstats$Pinvalid * parcombstats$Psegr if (max(PxP) < 1e-8) return(list(message="F1 segregation doesn't match parents", - parcombok=matrix(integer(0), ncol=2))) + parcombok=matrix(integer(0), ncol=2), + NoSolution=TRUE)) # else we continue with the least bad one: sel <- which(PxP == max(PxP)) } - parcombstats <- parcombstats[sel,, drop=FALSE] + parcombstats <- parcombstats[sel,] parcombok <- parcombok[sel,, drop=FALSE] sel <- which(parcombstats$nhap == min(parcombstats$nhap)) if (length(sel) > 1) { @@ -1450,7 +1484,8 @@ solveOneF1 <- function(dosmat, ihl, F1, P, Phad, maxparcombs, ploidy, minfrac, } return(list(message="", P1combs=lst[[1]], P2combs=lst[[2]], parcombok=parcombok[sel,, drop=FALSE], - didNhapcomb=didNhapcombLst[[sel]])) + didNhapcomb=didNhapcombLst[[sel]], + NoSolution=FALSE)) #end of new version #note: the calling routine can get the set of parental haplotypes by: @@ -1701,6 +1736,8 @@ getHapcombFreq <- function(HapcombMatrix) { #hcfreq: the number of times each hapcomb occurs in HapcombMatrix #(utility function, works with all types of matrices as long as identical #columns are consecutive) + o <- do.call(order, lapply(1:nrow(HapcombMatrix), function(i) HapcombMatrix[i,])) + HapcombMatrix <- HapcombMatrix[, o, drop=FALSE] du <- duplicated(HapcombMatrix, MARGIN=2) uniqcomb <- which(!du) hapcounts <- c(uniqcomb[-1], length(du)+1) - uniqcomb @@ -1756,7 +1793,7 @@ getGameteFreqs <- function(hapdos, DRrate) { DR$freq <- DRrate * DR$freq / sum(DR$freq) freq <- c(noDR$freq, DR$freq) hapcomb <- cbind(noDR$hapcomb, DR$hapcomb) - hapcomb <- hapcomb[, freq>0] + hapcomb <- hapcomb[, freq>0, drop=FALSE] freq <- freq[freq>0] #find the order to sort the hapcomb columns (on all 2 or 3 rows): o <- do.call(order, lapply(1:nrow(hapcomb), function(i) hapcomb[i,])) @@ -1769,7 +1806,7 @@ getGameteFreqs <- function(hapdos, DRrate) { uhx <- c(uniqcomb, length(du)+1) for (u in seq_along(uniqfreq)) uniqfreq[u] <- sum(freq[uhx[u]:(uhx[u+1]-1)]) - list(hapcomb=hapcomb[, uniqcomb], freq=uniqfreq) + list(hapcomb=hapcomb[, uniqcomb, drop=FALSE], freq=uniqfreq) } } #getGameteFreqs @@ -1939,7 +1976,7 @@ solveAllHaploblocks <- function(SNPdosages, indiv, ploidy, haploblocks, F1 <- F1[o] PF1 <- PF1[o,] } #!nopops - result <- list(); hbrows <- rep(NA_integer_, length(haploblocks)) + result <- list(); hbrw <- rep(NA_integer_, length(haploblocks)) for (hb in seq_along(haploblocks)) { if (nopops) { ihl <- inferHaplotypes(SNPdosages=dosmat, indiv=indiv, ploidy=ploidy, @@ -1954,7 +1991,7 @@ solveAllHaploblocks <- function(SNPdosages, indiv, ploidy, haploblocks, haploblockname=names(haploblocks)[hb], PF1=PF1, F1=F1, minfrac=minfrac, F1frac=F1frac, dropUnused=dropUnused, maxparcombs=maxparcombs) - hbrows <- ifelse(is.matrix(result[[hb]]), nrow(result[[hb]]), 1) + hbrw <- ifelse(is.matrix(result[[hb]]), nrow(result[[hb]]), 1) } } } #solveAllHaploblocks