diff --git a/R/PolyHaplotyper.R b/R/PolyHaplotyper.R index 8c4f4e9227d8f561f6e7b5d8270371cdd6988890..460905201fe49f4562659437dceb97ee4674d5ba 100644 --- a/R/PolyHaplotyper.R +++ b/R/PolyHaplotyper.R @@ -659,25 +659,50 @@ single_cycle_infer <- function(allhap, mrkdidsTable, ploidy, knownHap, ahc <- allHaploComb(did=names(mrkdidsTable)[i], #mrkDosage may be missing! allhap=allhap, ploidy=ploidy, progress=progress) if (length(knownHap) > 0) { - # ***************************************************************** - # versions before 20171006: - # select the haplotype combination(s) with the highest total dosage - # of known haplotypes: - #sel_ahc <- ahc[knownHap,, drop=FALSE] - #sel_dos <- colSums(sel_ahc) #for each haplotype comb the total dosage of selected haplotypes - #sel_col <- sel_dos == max(sel_dos) #the combination(s) with the maximum dosage - #ahc <- ahc[, sel_col, drop=FALSE] #keep only that/those combination(s) - # ***************************************************************** - # version 20171006: - # select the haplotype combination(s) with the smallest number of - # new haplotypes, and among those select the combis - # with the highest total dosage of known haplotypes: - ahc_pres <- ahc > 0 - nwhapcount <- colSums(ahc_pres[-knownHap,, drop=FALSE]) - ahc <- ahc[, nwhapcount==min(nwhapcount), drop=FALSE] #least nr of new haplotypes - if (useKnownDosage) { - knownhapdos <- colSums(ahc[knownHap,, drop=FALSE]) - ahc <- ahc[, knownhapdos==max(knownhapdos), drop=FALSE] #max dosage of known haplotypes + # compare various hapcomb selection schemes based on a global variable + # Based on comparisons of 20171023, scheme 1 is clearly worse and of + # the other 5 scheme 2 has a marginally better argumentation. + # We leave all possibilities in so we can re-compare them with additional + # datasets later. + testscheme <- 2 # for now; overrules any global variable testscheme + if (testscheme==1) { + # ***************************************************************** + # versions before 20171006: + # select the haplotype combination(s) with the highest total dosage + # of known haplotypes: + knowndos <- colSums(ahc[knownHap,, drop=FALSE]) + ahc <- ahc[, knowndos==max(knowndos), drop=FALSE] + # ***************************************************************** + } else { + #testscheme==2: version 20171006 (minimize the nr of haplotypes, + #instead of maximize the dosage of known haplotypes, and if + #useKnownDosage is TRUE, maximize the dosage of known haplotypes + #among the selected hapcombs) + #testscheme==3: as 2, but always apply second selection for max + #dosage of known haplotypes + #testscheme==4: as 2, but the second selection maximizes the dosage over + #the knownhap and newhap together + #testscheme==5: combines 3 and 4 + #However: scheme 4 and 5 are incorrect: the total dosage of + #knownhap and the other haps is always equal to ploidy + # version 20171006: + # select the haplotype combination(s) with the smallest number of + # new haplotypes, and among those select the hapcombs + # with the highest total dosage of known haplotypes: + ahc_pres <- ahc > 0 + nwhapcount <- colSums(ahc_pres[-knownHap,, drop=FALSE]) + selhapcomb <- nwhapcount==min(nwhapcount) + ahc <- ahc[, selhapcomb, drop=FALSE] #least nr of new haplotypes + if (useKnownDosage || testscheme %in% c(3, 5)) { + if (testscheme %in% c(2, 3)) { + # version 20171006: + knownhapdos <- colSums(ahc[knownHap,, drop=FALSE]) + } else { #testscheme 4 or 5 + selhap <- which(rowSums(ahc) > 0) + knownhapdos <- colSums(ahc[selhap,, drop=FALSE]) + } + ahc <- ahc[, knownhapdos==max(knownhapdos), drop=FALSE] #max dosage of known haplotypes + } } # (a different approach could be: for each possible set of haplotypes in # addition to the known ones, from small to large, see how many @@ -2071,42 +2096,6 @@ getConsensusmrkDosage <- function(mrkDosage, indiv, solveConflicts=TRUE) { mindos } #getConsensusmrkDosage -#'@title check all parental sample sets -#'@description check that for P1 and P2 the correct number of sample sets are -#'present, that these contain no duplicates and that they overlap -#'completely or not at all, and that if they overlap, they have the same sample -#'as first one -#'@usage checkAllParentSets(P, npop, indiv) -#'@param P a list of 2 elements, each of which is a list of character vectors -#'containing the sample names for parents 1 and parents 2 -#'@param npop the number of F1 populations -#'@param indiv a character vector with all sample names, the parental samples -#'should all be in indiv -#'@return an error message, "" if all is well -checkAllParentSets <- function(P, npop, indiv) { - # not used any more, we require that there are no duplicates - if (length(P) !=2 || length(P[[1]]) != npop || length(P[[2]]) != npop) - return("for each F1 population there should be one set of samples for each parent") - # check that no parental set has duplicate samples and that all samples are in indiv: - P <- c(P[[1]], P[[2]]) - for (i in seq_len(2*npop)) { - if (length(P[[i]]) != length(unique(P[[i]]))) - return("some parental sample sets contain duplicate samples") - if (!all(P[[i]] %in% indiv)) - stop("Some parental samples are missing from indiv") - } - #check that if two parents are the same, all their samples are the same: - for (i in 1:(length(P)-1)) for (j in (i+1):length(P)) { - if (length(intersect(P[[i]], P[[j]])) > 0) { - if (!setequal(P[[i]], P[[j]])) - return("some parental sample sets overlap partially") - if (P[[i]][1] != P[[j]][1]) - return("parental sample sets for the same parents must start with the same sample") - } - } - return("") -} #checkAllParentSets - #'@title get all F1 haplotype combinations expected from two parental haplotype #'combinations #'@description get all F1 haplotype combinations expected from two parental @@ -2356,7 +2345,8 @@ pedigreeHapdosCheck_OneHaploblock <- function(ped, mrkDosage, hapdos, nhap) { peddat[, 2] <- FALSE #hap: assigned haplotype combi for this indiv? } else { # sort hapdos such that identical columns are adjacent and NA omitted: - hapdos <- hapdos[, !is.na(hapdos[1,])] #drop all indiv without assigned hapcomb + hapdos <- hapdos[, !is.na(hapdos[1,]), drop=FALSE] + # drop all indiv without assigned hapcomb parentsNodata <- parentnames[is.na(match(parentnames, colnames(hapdos)))] o <- do.call(order, lapply(1:nrow(hapdos), function(i) hapdos[i,])) hapdos <- hapdos[, o] @@ -2397,7 +2387,7 @@ pedigreeHapdosCheck_OneHaploblock <- function(ped, mrkDosage, hapdos, nhap) { parents[c(p1, p2), 4] <- #prog_mrkdata parents[c(p1, p2), 4] + sum(!is.na(colSums(mrkDosage[, colnames(mrkDosage) %in% FS[, 1], - drop=FALSE]))) + drop=FALSE]))) FShad <- hapdos[, colnames(hapdos) %in% FS[, 1], drop=FALSE] parents[c(p1, p2), 5] <- #prog_hapdata parents[c(p1, p2), 5] + ncol(FShad)