diff --git a/R/PolyHaplotyper.R b/R/PolyHaplotyper.R index 0b26e3561374eda41edf7d17f66b36cff121957b..83f7759f449ca00afb9ee0c969563ea2b4db8ab4 100644 --- a/R/PolyHaplotyper.R +++ b/R/PolyHaplotyper.R @@ -113,15 +113,17 @@ loadAllHaploCombList <- function(ploidy) { save(ahclist, file=paste0("ahclist_", ahcinfo$ploidy,"x.RData")) } #load the ahccompletelist: - x <- tryCatch(load(paste0("ahccompletelist_", ploidy,"x.RData"), - envir=.GlobalEnv), + x <- tryCatch( + suppressWarnings(load(paste0("ahccompletelist_", ploidy,"x.RData"), + envir=.GlobalEnv)), error=function(e) "error") if (identical(x, "error")) { newinfo$complete_nSNP <- 0 } else newinfo$complete_nSNP <- length(ahccompletelist) #load the ahclist: - x <- tryCatch(load(paste0("ahclist_", ploidy,"x.RData"), - envir=.GlobalEnv), + x <- tryCatch( + suppressWarnings(load(paste0("ahclist_", ploidy,"x.RData"), + envir=.GlobalEnv)), error=function(e) "error") if (identical(x, "error")) assign("ahclist", value=list(), envir=.GlobalEnv, inherits=FALSE) @@ -645,6 +647,7 @@ inferHaplotypes <- function(SNPdosages, indiv, ploidy, minfrac=c(0.1, 0.01), } } nind <- sum(SNPdidsTable) #includes only individuals with no missing SNP data + prev_sel_hap <- list() cycle <- 0 while (TRUE) { cycle <- cycle + 1 @@ -660,23 +663,25 @@ inferHaplotypes <- function(SNPdosages, indiv, ploidy, minfrac=c(0.1, 0.01), #warning(paste0("inferHaplotypes: lost in cycle ", cycle, ": ", # paste(haploLost, collapse=" "))) # it seems that it is possible to lose previous must-have haplotypes, - # but is does not happen often. + # but this does not happen often. # We must prevent infinite looping, so we store the sel_hap and # if the situation occurs again with the same sel_hap we break with - # an empty sel_hap - if (exists("prev_sel_hap")) { - if (any(apply(prev_sel_hap, 1, identical, sel_hap))) { - sel_hap <- orig_hap - break - } - rbind(prev_sel_hap, sel_hap) + # the original sel_hap and the result of a single-cycle infer with that + # sel_hap + if (length(prev_sel_hap) == 0 || + !any(sapply(prev_sel_hap, identical, sel_hap))) { + prev_sel_hap[[length(prev_sel_hap) + 1]] <- sel_hap } else { - prev_sel_hap <- matrix(sel_hap, nrow=1) + res <- single_cycle_infer(allhap=allhap, SNPdidsTable=SNPdidsTable, + ploidy=ploidy, sel_hap=orig_hap, + progress=FALSE) + sel_hap <- orig_hap + break } } #haploGained <- setdiff(sel_new, sel_hap) #if (length(haploGained) == 0) break; - if (identical(sel_new, sel_hap)) break #new, check + if (identical(sel_new, sel_hap)) break sel_hap <- sel_new } #Now we have haplotype combinations assignments for all SNPdids that @@ -770,7 +775,9 @@ single_cycle_infer <- function(allhap, SNPdidsTable, ploidy, sel_hap, progress) # that must have the haplotypes } nPresent <- apply(abpr[,2,,drop=FALSE], MARGIN=3, FUN=sum) + # vector with the nr of indiv in which each haplotype must be present nAbsent <- apply(abpr[,1,,drop=FALSE], MARGIN=3, FUN=sum) + # vector with the nr of indiv in which each haplotype must be absent names(hclist) <- names(SNPdidsTable) list(nPresent=nPresent, nAbsent=nAbsent, @@ -790,6 +797,10 @@ padded <- function(x, maxx=0) { formatC(x, width=nchar(max(x, maxx, na.rm=TRUE)), flag="0") } #padded +getHaplotypeNames <- function(haploblock, hapcount, sep="_") { + paste0(haploblock, sep, padded(1:hapcount)) +} + #'@title Assign haplotype dosages based on output of inferHaplotypes #'@description Assign haplotype dosages based on output of inferHaplotypes #'@usage haplotypeDosages(haploblockname, ihl, partial=FALSE, dropUnused=TRUE) @@ -815,7 +826,7 @@ padded <- function(x, maxx=0) { #'@export haplotypeDosages <- function(haploblockname, ihl, partial=FALSE, dropUnused=TRUE) { - haplotypeNames <- paste0(haploblockname, padded(1:nrow(ihl$allhap))) + haplotypeNames <- getHaplotypeNames(haploblockname, nrow(ihl$allhap)) nind <- sum(ihl$SNPdidsTable) #sel_hap <- which(ihl$nPresent >= ihl$threshold * nind) result <- matrix(NA_integer_, nrow=length(haplotypeNames), @@ -964,7 +975,7 @@ haplotypeDosagesF1 <- function(dosmat, ploidy, haploblockname, had <- matrix(NA_integer_, nrow=nrow(ihl$allhap), ncol=nind) #had is matrix of haplotype dosages colnames(had) <- indiv - rownames(had) <- paste0(haploblockname, padded(1:nrow(ihl$allhap))) + rownames(had) <- getHaplotypeNames(haploblockname, nrow(ihl$allhap)) F1sNoSolution <- rep(FALSE, length(F1)) F1sDone <- !largeF1s; changes <- TRUE F1sMessages <- rep("", length(F1)) @@ -2048,8 +2059,11 @@ solveAllHaploblocks <- function(SNPdosages, indiv=NULL, ploidy, haploblocks, } # for p in 1:2 } #!nopops nmrk <- sapply(haploblocks, length) - result <- list() + result <- list(rep(NA, length(haploblocks))) #made up to final length for (hb in seq_along(haploblocks)) { + pt <- proc.time() + cat(paste0("haploblock ", hb, " of ", length(haploblocks), ": ", + names(haploblocks)[hb])) result[[hb]] <- list() if (nmrk[hb] > maxSNP) { result[[hb]]$message <- paste("skipped:", nmrk[hb], @@ -2059,8 +2073,9 @@ solveAllHaploblocks <- function(SNPdosages, indiv=NULL, ploidy, haploblocks, if (nopops) { result[[hb]] <- inferHaplotypes(dosmat[match(haploblocks[[hb]], rownames(dosmat)),], indiv=colnames(dosmat), ploidy=ploidy, minfrac=minfrac) - result[[hb]]$had <- haplotypeDosages(haploblockname=names(haploblocks)[hb], - ihl=result[[hb]]) + result[[hb]]$hapdos <- + haplotypeDosages(haploblockname=names(haploblocks)[hb], + ihl=result[[hb]]) } else { result[[hb]] <- haplotypeDosagesF1(dosmat[match(haploblocks[[hb]], rownames(dosmat)),], @@ -2071,6 +2086,10 @@ solveAllHaploblocks <- function(SNPdosages, indiv=NULL, ploidy, haploblocks, dropUnused=dropUnused, maxparcombs=maxparcombs) } if (is.null(result[[hb]]$message)) result[[hb]]$message <- "" + if (result[[hb]]$message == "" && + !is.null(result[[hb]]$hapdos) && nrow(result[[hb]]$hapdos) == 0) + result[[hb]]$message <- "no solution found" + cat(paste0(" ", round((proc.time()-pt)[3], 3), " sec\n")) } # for hb names(result) <- names(haploblocks) result @@ -2124,176 +2143,183 @@ pedigreeHapdosCheck_OneHaploblock <- function(ped, SNPdos, hapdos, nhap) { # - for each parent: with which fraction of its progeny does it conflict? # * without and with DR - # sort hapdos such that identical columns are adjacent and NA omitted: - hapdos <- hapdos[, !is.na(hapdos[1,])] #drop all indiv without assigned hapcomb - o <- do.call(order, lapply(1:nrow(hapdos), function(i) hapdos[i,])) - hapdos <- hapdos[, o] - hapdos <- expandHapdos(hapdos, nhap) #incl rows for non-occurring haplotypes peddat <- matrix(NA, nrow=nrow(ped), ncol=4, dimnames=list(ped[,1], c("SNPs", "hap", "noDR", "withDR"))) - - + mode0_ped <- which(colnames(peddat) == "noDR") - 1 peddat[, 1] <- #SNPs: complete SNP dosage data for this indiv in this haploblock? !is.na(colSums(SNPdos[, match(rownames(peddat), colnames(SNPdos))])) - peddat[, 2] <- #hap: assigned haplotype combi for this indiv? - !is.na(hapdos[1, match(rownames(peddat), colnames(hapdos))]) - mode0_ped <- which(colnames(peddat) == "noDR") - 1 parentnames <- as.character(setdiff(unique(c(ped[,2], ped[,3])), NA)) - parentsNodata <- parentnames[is.na(match(parentnames, colnames(hapdos)))] parcols <- c("par_SNPdata", "par_hapdata", "totprogeny", "prog_SNPdata", "prog_hapdata", "nonDRmatch", "DRmatch") parents <- matrix(as.integer(0), nrow=length(parentnames), ncol=length(parcols), dimnames=list(parentnames, parcols)) - # data.frame(parent=parents, stringsAsFactors=FALSE) - # parents$par_SNPdata <- - # !is.na(colSums(SNPdos[, match(parents$parent, colnames(SNPdos))])) - # parents$par_hapdata <- - # !is.na(hapdos[1, match(parents$parent, colnames(hapdos))]) - # parents$totprogeny <- 0 - # parents$prog_SNPdata <- 0 - # parents$prog_hapdata <- 0 - # parents$nonDRmatch <- 0 - # parents$withDRmatch <- 0 mode0_par <- which(parcols == "nonDRmatch") - 1 # base column number for modes - # first create a list with the possible gametes from each parent: - pargamlist <- list() - for (par in parentnames) { - if (par %in% parentsNodata) { - pargamlist[[par]] <- "nodata" - } else { - pargamlist[[par]] <- list() - pargamlist[[par]]$noDR <- getGameteFreqs(hapdos=hapdos[, par], DRrate=0) - pargamlist[[par]]$DR <- getGameteFreqs(hapdos=hapdos[, par], DRrate=0.04) - # we only need the set of possible gametes; their freq (and so the exact - # DRrate) is not used + + if (nrow(hapdos) == 0) { + peddat[, 2] <- FALSE #hap: assigned haplotype combi for this indiv? + } else { + # sort hapdos such that identical columns are adjacent and NA omitted: + hapdos <- hapdos[, !is.na(hapdos[1,])] #drop all indiv without assigned hapcomb + parentsNodata <- parentnames[is.na(match(parentnames, colnames(hapdos)))] + o <- do.call(order, lapply(1:nrow(hapdos), function(i) hapdos[i,])) + hapdos <- hapdos[, o] + hapdos <- expandHapdos(hapdos, nhap) #incl rows for non-occurring haplotypes + peddat[, 2] <- #hap: assigned haplotype combi for this indiv? + !is.na(hapdos[1, match(rownames(peddat), colnames(hapdos))]) + + # data.frame(parent=parents, stringsAsFactors=FALSE) + # parents$par_SNPdata <- + # !is.na(colSums(SNPdos[, match(parents$parent, colnames(SNPdos))])) + # parents$par_hapdata <- + # !is.na(hapdos[1, match(parents$parent, colnames(hapdos))]) + # parents$totprogeny <- 0 + # parents$prog_SNPdata <- 0 + # parents$prog_hapdata <- 0 + # parents$nonDRmatch <- 0 + # parents$withDRmatch <- 0 + # first create a list with the possible gametes from each parent: + pargamlist <- list() + for (par in parentnames) { + if (par %in% parentsNodata) { + pargamlist[[par]] <- "nodata" + } else { + pargamlist[[par]] <- list() + pargamlist[[par]]$noDR <- getGameteFreqs(hapdos=hapdos[, par], DRrate=0) + pargamlist[[par]]$DR <- getGameteFreqs(hapdos=hapdos[, par], DRrate=0.04) + # we only need the set of possible gametes; their freq (and so the exact + # DRrate) is not used + } } - } - parents[, 1] <- #par_SNPdata - !is.na(colSums(SNPdos[, match(parentnames, colnames(SNPdos))])) - parents[, 2] <- #par_hapdata - !is.na(hapdos[1, match(parentnames, colnames(hapdos))]) - #for each fullsib family, get all possible progeny haplotype combinations - #and check the inferred ones against these: - for (p1 in seq_along(parentnames)) { - par1 <- parentnames[p1] - for (p2 in p1:nrow(parents)) { - par2 <- parentnames[p2] - if (class(parents[, 2]) != "integer") - stop(paste(p1, p2, class(parents[,2]))) #should never happen - FS <- ped[!is.na(ped[, 2]) & ped[, 2] %in% c(par1, par2) & - !is.na(ped[, 3]) & ped[, 3] %in% c(par1, par2) & - (par1 == par2 || ped[, 2] != ped[, 3]),, drop=FALSE] - if (nrow(FS) > 0) { - parents[c(p1, p2), 3] <- #totprogeny - parents[c(p1, p2), 3] + nrow(FS) - parents[c(p1, p2), 4] <- #prog_SNPdata - parents[c(p1, p2), 4] + - sum(!is.na(colSums(SNPdos[, colnames(SNPdos) %in% FS[, 1], + parents[, 1] <- #par_SNPdata + !is.na(colSums(SNPdos[, match(parentnames, colnames(SNPdos))])) + parents[, 2] <- #par_hapdata + !is.na(hapdos[1, match(parentnames, colnames(hapdos))]) + #for each fullsib family, get all possible progeny haplotype combinations + #and check the inferred ones against these: + for (p1 in seq_along(parentnames)) { + par1 <- parentnames[p1] + for (p2 in p1:nrow(parents)) { + par2 <- parentnames[p2] + if (class(parents[, 2]) != "integer") + stop(paste(p1, p2, class(parents[,2]))) #should never happen + # FS <- ped[!is.na(ped[, 2]) & ped[, 2] %in% c(par1, par2) & + # !is.na(ped[, 3]) & ped[, 3] %in% c(par1, par2) & + # (par1 == par2 || ped[, 2] != ped[, 3]),, drop=FALSE] + FS <- ped[!is.na(ped[, 2]) & ped[, 2] %in% c(par1, par2) & + !is.na(ped[, 3]) & ped[, 3] %in% c(par1, par2),, drop=FALSE] + if (par1 != par2) FS <- FS[FS[, 2] != FS[, 3],, drop=FALSE] + if (nrow(FS) > 0) { + parents[c(p1, p2), 3] <- #totprogeny + parents[c(p1, p2), 3] + nrow(FS) + parents[c(p1, p2), 4] <- #prog_SNPdata + parents[c(p1, p2), 4] + + sum(!is.na(colSums(SNPdos[, colnames(SNPdos) %in% FS[, 1], + drop=FALSE]))) + FShad <- hapdos[, colnames(hapdos) %in% FS[, 1], drop=FALSE] + parents[c(p1, p2), 5] <- #prog_hapdata + parents[c(p1, p2), 5] + ncol(FShad) + if (all(parents[c(p1, p2), 2]) && ncol(FShad) > 0) { + #both parents have haplotypes and at least some FS indiv in hapdos + rownames(FShad) <- NULL #else all.equal fails later + du <- duplicated(FShad, MARGIN=2) + uniqcomb <- which(!du) + #sum the freqs per set of duplicates: + uhx <- c(uniqcomb, length(du)+1) + uniqfreq <- uhx[-1] - uhx[-length(uhx)] + #now we have the inferred haplotype combinations and their frequencies; + #next we determine how many of these are expected without and with DR: + for (m in 1:2) { #modes: 1 = no DR, 2 = with DR + matchParents <- logical(ncol(FShad)) + names(matchParents) <- colnames(FShad) + F1exp <- getF1freqs_gamfrq(pargamlist[[par1]][[m]], + pargamlist[[par2]][[m]], + nhap=nrow(FShad)) + for (hc in seq_along(uniqcomb)) { + indiv <- colnames(FShad)[uhx[hc]:(uhx[hc+1]-1)] + #matchcols <- apply(F1exp$hapcomb, MARGIN=2, + # FUN=identical, FShad[, uniqcomb[hc]]) #0 or 1 TRUE + alleq <- function(x, y) { isTRUE(all.equal(x, y)) } + # identical returns FALSE, all.equal TRUE for a matching column, + # but all.equal returns a character for a non-matching column + matchcols <- apply(F1exp$hapcomb, MARGIN=2, + FUN=alleq, FShad[, uniqcomb[hc]]) #0 or 1 TRUE + if (any(matchcols)) { + peddat[rownames(peddat) %in% indiv, mode0_ped+m] <- TRUE + parents[c(p1, p2), mode0_par+m] <- + parents[c(p1, p2), mode0_par+m] + as.integer(uniqfreq[hc]) + } + } #for hc + #set the noDR or withDR column of peddat to FALSE for those + #FS indiv that have a non-NA FShad and are still NA in peddat: + indiv <- colnames(FShad)[!is.na(FShad[1,])] + peddat[rownames(peddat) %in% indiv & is.na(peddat[, mode0_ped+m]), + mode0_ped+m] <- FALSE + } # for m + } + } # nrow(FS) > 0 + } # for p2 + #and for the individuals of which only one parent is known and has hapdata + HS <- ped[(!is.na(ped[, 2]) & ped[, 2] == par1 & + (is.na(ped[, 3]) | ped[, 3] %in% parentsNodata)) | + (!is.na(ped[, 3]) & ped[, 3] == par1 & + (is.na(ped[, 2]) | ped[, 2] %in% parentsNodata)),, drop=FALSE] + # includes all indiv with par1 as one parent and the other parent + # unknown or with missing data + if (nrow(HS) > 0) { + # only the HS indivs without known second parent must be added to + # the totals, the others were already counted + OneParent <- HS[rowSums(is.na(HS[, 2:3, drop=FALSE])) == 1,, drop=FALSE] + parents[p1, 3] <- parents[p1, 3] + nrow(OneParent) #totprogeny + parents[p1, 4] <- #prog_SNPdata + parents[p1, 4] + + sum(!is.na(colSums(SNPdos[, colnames(SNPdos) %in% OneParent[, 1], drop=FALSE]))) - FShad <- hapdos[, colnames(hapdos) %in% FS[, 1], drop=FALSE] - parents[c(p1, p2), 5] <- #prog_hapdata - parents[c(p1, p2), 5] + ncol(FShad) - if (all(parents[c(p1, p2), 2]) && ncol(FShad) > 0) { - #both parents have haplotypes and at least some FS indiv in hapdos - rownames(FShad) <- NULL #else all.equal fails later - du <- duplicated(FShad, MARGIN=2) - uniqcomb <- which(!du) - #sum the freqs per set of duplicates: - uhx <- c(uniqcomb, length(du)+1) - uniqfreq <- uhx[-1] - uhx[-length(uhx)] - #now we have the inferred haplotype combinations and their frequencies; - #next we determine how many of these are expected without and with DR: - for (m in 1:2) { #modes: 1 = no DR, 2 = with DR - matchParents <- logical(ncol(FShad)) - names(matchParents) <- colnames(FShad) - F1exp <- getF1freqs_gamfrq(pargamlist[[par1]][[m]], - pargamlist[[par2]][[m]], - nhap=nrow(FShad)) - for (hc in seq_along(uniqcomb)) { - indiv <- colnames(FShad)[uhx[hc]:(uhx[hc+1]-1)] - #matchcols <- apply(F1exp$hapcomb, MARGIN=2, - # FUN=identical, FShad[, uniqcomb[hc]]) #0 or 1 TRUE - alleq <- function(x, y) { isTRUE(all.equal(x, y)) } - # identical returns FALSE, all.equal TRUE for a matching column, - # but all.equal returns a character for a non-matching column - matchcols <- apply(F1exp$hapcomb, MARGIN=2, - FUN=alleq, FShad[, uniqcomb[hc]]) #0 or 1 TRUE - if (any(matchcols)) { - peddat[rownames(peddat) %in% indiv, mode0_ped+m] <- TRUE - parents[c(p1, p2), mode0_par+m] <- - parents[c(p1, p2), mode0_par+m] + as.integer(uniqfreq[hc]) - } - } #for hc - #set the noDR or withDR column of peddat to FALSE for those - #FS indiv that have a non-NA FShad and are still NA in peddat: - indiv <- colnames(FShad)[!is.na(FShad[1,])] - peddat[rownames(peddat) %in% indiv & is.na(peddat[, mode0_ped+m]), - mode0_ped+m] <- FALSE - } # for m - } - } # nrow(FS) > 0 - } # for p2 - #and for the individuals of which only one parent is known and has hapdata - HS <- ped[(!is.na(ped[, 2]) & ped[, 2] == par1 & - (is.na(ped[, 3]) | ped[, 3] %in% parentsNodata)) | - (!is.na(ped[, 3]) & ped[, 3] == par1 & - (is.na(ped[, 2]) | ped[, 2] %in% parentsNodata)),, drop=FALSE] - # includes all indiv with par1 as one parent and the other parent - # unknown or with missing data - if (nrow(HS) > 0) { - # only the HS indivs without known second parent must be added to - # the totals, the others were already counted - OneParent <- HS[rowSums(is.na(HS[, 2:3, drop=FALSE])) == 1,, drop=FALSE] - parents[p1, 3] <- parents[p1, 3] + nrow(OneParent) #totprogeny - parents[p1, 4] <- #prog_SNPdata - parents[p1, 4] + - sum(!is.na(colSums(SNPdos[, colnames(SNPdos) %in% OneParent[, 1], - drop=FALSE]))) - parents[p1, 5] <- #prog_hapdata - parents[p1, 5] + - sum(!is.na(hapdos[1, colnames(hapdos) %in% OneParent[, 1]])) - # for the whole HS (all indiv with only one known parent plus - # all indiv with two parents where the second parent has missing haplodata) - # we calculate the nr that match with this parent (which was not done - # earlier) - HShad <- hapdos[, colnames(hapdos) %in% HS[, 1], drop=FALSE] - if (parents[p1, 2] && ncol(HShad) > 0 ) { - du <- duplicated(HShad, MARGIN=2) - uniqcomb <- which(!du) - #sum the freqs per set of duplicates: - uhx <- c(uniqcomb, length(du)+1) - uniqfreq <- uhx[-1] - uhx[-length(uhx)] - #now we have the inferred haplotype combinations and their frequencies; - #next we determine how many of these are expected without and with DR. - #we have only the expected gametes of par1; a hapcomb in HS can only - #occur if there is at least one gamete of par1 that has less than or - #equal the nr of copies of all haplotypes - for (m in 1:2) { #modes: 1 = no DR, 2 = with DR - matchParents <- logical(ncol(HShad)) - names(matchParents) <- colnames(HShad) - expgam <- apply(pargamlist[[par1]][[m]]$hapcomb, - MARGIN=2, FUN=tabulate, nbins=nrow(HShad)) - for (hc in seq_along(uniqcomb)) { - indiv <- colnames(HShad)[uhx[hc]:(uhx[hc+1]-1)] - smeq <- function(x, y) { all(x <= y) } - matchcols <- apply(expgam, MARGIN=2, - FUN=smeq, HShad[, uniqcomb[hc]]) #0, 1 or more TRUE - if (any(matchcols)) { - peddat[rownames(peddat) %in% indiv, mode0_ped+m] <- TRUE - parents[p1, mode0_par+m] <- - parents[p1, mode0_par+m] + as.integer(uniqfreq[hc]) - } - } # for hc - #set the noDR or withDR column of peddat to FALSE for those - #FS indiv that have a non-NA FShad and are still NA in peddat: - indiv <- colnames(HShad)[!is.na(HShad[1,])] - peddat[rownames(peddat) %in% indiv & is.na(peddat[, mode0_ped+m]), - mode0_ped+m] <- FALSE - } # for m - } - } # nrow(HS) > 0 - } # for p1 + parents[p1, 5] <- #prog_hapdata + parents[p1, 5] + + sum(!is.na(hapdos[1, colnames(hapdos) %in% OneParent[, 1]])) + # for the whole HS (all indiv with only one known parent plus + # all indiv with two parents where the second parent has missing haplodata) + # we calculate the nr that match with this parent (which was not done + # earlier) + HShad <- hapdos[, colnames(hapdos) %in% HS[, 1], drop=FALSE] + if (parents[p1, 2] && ncol(HShad) > 0 ) { + du <- duplicated(HShad, MARGIN=2) + uniqcomb <- which(!du) + #sum the freqs per set of duplicates: + uhx <- c(uniqcomb, length(du)+1) + uniqfreq <- uhx[-1] - uhx[-length(uhx)] + #now we have the inferred haplotype combinations and their frequencies; + #next we determine how many of these are expected without and with DR. + #we have only the expected gametes of par1; a hapcomb in HS can only + #occur if there is at least one gamete of par1 that has less than or + #equal the nr of copies of all haplotypes + for (m in 1:2) { #modes: 1 = no DR, 2 = with DR + matchParents <- logical(ncol(HShad)) + names(matchParents) <- colnames(HShad) + expgam <- apply(pargamlist[[par1]][[m]]$hapcomb, + MARGIN=2, FUN=tabulate, nbins=nrow(HShad)) + for (hc in seq_along(uniqcomb)) { + indiv <- colnames(HShad)[uhx[hc]:(uhx[hc+1]-1)] + smeq <- function(x, y) { all(x <= y) } + matchcols <- apply(expgam, MARGIN=2, + FUN=smeq, HShad[, uniqcomb[hc]]) #0, 1 or more TRUE + if (any(matchcols)) { + peddat[rownames(peddat) %in% indiv, mode0_ped+m] <- TRUE + parents[p1, mode0_par+m] <- + parents[p1, mode0_par+m] + as.integer(uniqfreq[hc]) + } + } # for hc + #set the noDR or withDR column of peddat to FALSE for those + #FS indiv that have a non-NA FShad and are still NA in peddat: + indiv <- colnames(HShad)[!is.na(HShad[1,])] + peddat[rownames(peddat) %in% indiv & is.na(peddat[, mode0_ped+m]), + mode0_ped+m] <- FALSE + } # for m + } + } # nrow(HS) > 0 + } # for p1 + } # nrow(hapdos) > 0 list(ped=peddat, parents=parents) } #pedigreeHapdosCheck_OneHaploblock @@ -2371,6 +2397,7 @@ pedigreeHapdosCheck <- function(ped, SNPdosages, haploblocks, names(hapdosF1results)[hbdone])) for (h in seq_along(hbdone)) { hb <- hbdone[h] + print(paste(h, hb)) pedchk <- pedigreeHapdosCheck_OneHaploblock( ped=ped, SNPdos=SNPdosages[rownames(SNPdosages) %in% haploblocks[[hb]],], @@ -2384,7 +2411,7 @@ pedigreeHapdosCheck <- function(ped, SNPdosages, haploblocks, list(ped_arr=pedarr, parents_arr=pararr) } #pedigreeHapdosCheck -#'@title Add dropped rows bach to haplotype dosage matrix +#'@title Add dropped rows back to haplotype dosage matrix #'@description Haplotype dosage matrices generated with the dropUnused=TRUE #'lack some haplotypes that are required by pedigreeHapdosCheck; this #'function adds them back. @@ -2392,7 +2419,7 @@ pedigreeHapdosCheck <- function(ped, SNPdosages, haploblocks, #'@param hapdos a haplotype dosage matrix as returned by a.o. haplotypeDosagesF1. #'In particular the rownames are assumed to have all the same length: #'they are composed of the haploblock name to which the padded haplotype number -#'is appended, without separator +#'is appended, separated by "_" #'@param nhap the total number of haplotypes for the haploblock:\cr #'nhap = 2^nSNP with nSNP the number of SNPs in the haploblock #'@return a matrix similar to hapdos with rows for the dropped haplotypes @@ -2442,16 +2469,17 @@ expandHapdos <- function(hapdos, nhap) { #'columns:\cr #' * nSNP: the number of SNPs in the haploblock\cr #' * nhap: the number of different haplotypes assigned over all individuals -#' (NA if no soltion was found for this haplotype; the reason for that is +#' (NA if no solution was found for this haplotype; the reason for that is #' listed in the first column of item messages of the return value)\cr #' * for each F1 population a set of 4 columns: parSNPdat (0, 1 or 2: the #' number of parents with complete SNP data), done (0=FALSE or 1=TRUE) #' indicating if a solution for the F1 was found based on polysomic #' inheritance), nonNA (the number of F1 progeny with complete SNP data) and #' ok (the number of F1 progeny with assigned haplotype combinations; this is -#' less than the nonNA value if the same SNP genotype can be produced with +#' less than the nonNA value if the same F1 SNP genotype can be produced with #' different combinations of haplotypes that are all compatible with the -#' parental haplotype combinations)\cr +#' parental haplotype combinations) or if some F1 SNP genotypes cannot be +#' produced by the parental haplotype combinations\cr #' * for "rest" (all individuals that are not part of the F1's or their #' parents) and "all" (all individuals) there are also columns nonNA and ok, #' similar to those for the F1 populations\cr @@ -2498,10 +2526,15 @@ overviewByF1 <- function(haploblocks, F1, PF1, hapdosF1results) { ovw[hb, 4*pop + 1] <- sum(!is.na(hapdosF1results[[hb]]$SNPdids[ names(hapdosF1results[[hb]]$SNPdids) %in% F1[[pop]] ])) - # ok: how many F1 indiv have an inferred haplotype dosages? - ovw[hb, 4*pop + 2] <- - sum(!is.na(hapdosF1results[[hb]]$hapdos[1, - colnames(hapdosF1results[[hb]]$hapdos) %in% F1[[pop]] ])) + # ok: how many F1 indiv have inferred haplotype dosages? + print(hb) + if (nrow(hapdosF1results[[hb]]$hapdos) == 0) { + ovw[hb, 4*pop + 2] <- 0 + } else { + ovw[hb, 4*pop + 2] <- + sum(!is.na(hapdosF1results[[hb]]$hapdos[1, + colnames(hapdosF1results[[hb]]$hapdos) %in% F1[[pop]] ])) + } } # for pop #data for "rest": the non-F1, non-F1-parents indiv if (is.null(rest)) @@ -2510,14 +2543,22 @@ overviewByF1 <- function(haploblocks, F1, PF1, hapdosF1results) { ovw[hb, lastF1col + 1] <- # nonNArest: how many rest indiv with complete SNP data sum(!is.na(hapdosF1results[[hb]]$SNPdids[ names(hapdosF1results[[hb]]$SNPdids) %in% rest])) - ovw[hb, lastF1col + 2] <- # okrest: how many rest indiv with inferred haplotypes - sum(!is.na(hapdosF1results[[hb]]$hapdos[1, - colnames(hapdosF1results[[hb]]$hapdos) %in% rest])) + if (nrow(hapdosF1results[[hb]]$hapdos) == 0) { + ovw[hb, lastF1col + 2] <- 0 + } else { + ovw[hb, lastF1col + 2] <- # okrest: how many rest indiv with inferred haplotypes + sum(!is.na(hapdosF1results[[hb]]$hapdos[1, + colnames(hapdosF1results[[hb]]$hapdos) %in% rest])) + } #data for all indiv together: ovw[hb, lastF1col + 3] <- # nonNAall # how any indiv with complete SNP data sum(!is.na(hapdosF1results[[hb]]$SNPdids)) - ovw[hb, lastF1col + 4] <- #okall: how many indiv with inferred haplotype dosages + if (nrow(hapdosF1results[[hb]]$hapdos) == 0) { + ovw[hb, lastF1col + 4] <- 0 + } else { + ovw[hb, lastF1col + 4] <- #okall: how many indiv with inferred haplotype dosages sum(!is.na(hapdosF1results[[hb]]$hapdos[1,])) + } } # message == "" } # for hb list(ovw=ovw, messages=messages) @@ -2609,4 +2650,29 @@ calcStatistics <- function(pedchk, ovwF1) { calcSNPsHaptable <- function(ovwF1){ table(ovwF1$ovw[ ,1], ovwF1$ovw[, 2], useNA="ifany", dnn=c("SNPs", "haplotypes")) -} +} # calcSNPsHaptable + +#'@title convert a haploblock-defining data frame to a list +#'@description convert a haploblock-defining data frame to a list as needed +#'by solveAllHaploblocks +#'@usage haploblock_df2list(df, mrkcol, hbcol) +#'@param df a data.frame with at least the following two columns +#'@param mrkcol the name or number of the column with the markers +#'@param hbcol the name or number of the column with the haploblocks +#'@details function solveAllHaploblocks needs a list where each item is a +#'vector of all markers in one haploblock. This function produces such a list +#'from a data.frame where the markers are in one column and the haploblocks +#'in another column. The markers and haploblocks columns may be character, +#'factor or numeric, and the columns may be indicated by name or number. +#'@return the desired list +#'@export +haploblock_df2list <- function(df, mrkcol, hbcol) { + uhb <- unique(df[, hbcol]) + if (is.factor(df[, mrkcol])) df[, mrkcol] <- as.character(df[, mrkcol]) + lst <- list() + for (hb in seq_along(uhb)) { + lst[[hb]] <- df[df[, hbcol]==uhb[hb], mrkcol] + } + names(lst) <- as.character(uhb) + lst +} # haploblock_df2list