diff --git a/DESCRIPTION b/DESCRIPTION index 25a2a96213192f7d1a7fca9e529c9286007f6718..a2bda099b4a5eda0da0215af8e04050c1057296c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,7 +5,8 @@ Version: 1.0.0 Author: Roeland E. Voorrips Maintainer: Roeland E. Voorrips <roeland.voorrips@wur.nl> Description: Infer the genetic composition of individuals - in terms of haplotype dosages for a haploblock, based on SNP dosages, + in terms of haplotype dosages for a haploblock, based + on bi-allelic marker dosages, for any ploidy level. License: GPL-2 Encoding: UTF-8 diff --git a/R/PolyHaplotyper.R b/R/PolyHaplotyper.R index 2b96e6162655f7d5d41c66b7d7705c29e556d854..5f71acad06d980dd77c751e9e746ff9f491a9650 100644 --- a/R/PolyHaplotyper.R +++ b/R/PolyHaplotyper.R @@ -61,14 +61,38 @@ #'@importFrom utils combn flush.console read.table write.table #'@importFrom stats chisq.test pchisq pbinom qbinom -# documentation of the objects in Data/PolyHaplotyper_demo.RData: -# +# documentation of the objects in PolyHaplotyper_demo.RData: + #'@title dosages of SNP alleles #'@description A data.frame with the allele dosages for 30 SNPs in 661 -#'isamples +#'samples; used in vignette, stored in PolyHaplotyper_demo.RData +#'@docType data +#'@keywords datasets +#'@name demo_snpdos +#'@format a data.frame with 30 rows (SNPs) and 663 columns. The first +#'two columns contain the SNP name and contig; the remaining columns +#'contain the allele dosages (integers in 0..6 or NA) of the SNPs in +#'661 samples +NULL + +#'@title pedigree +#'@description A data.frame with a pedigree; used in vignette, stored in +#'PolyHaplotyper_demo.RData +#'@docType data +#'@keywords datasets +#'@name demo_ped +#'@format a data.frame with 661 rows (samples) and +#'4 columns: genotype, mother, father, sample_nr +NULL + +# documentation of the objects in PolyHaplotyper_small.RData: + +#'@title dosages of SNP alleles +#'@description A matrix with the allele dosages for 8 SNPs in 661 +#'samples; used in manuals, stored in PolyHaplotyper_small.RData #'@docType data #'@keywords datasets -#'@name snpdos +#'@name phdos #'@format a data.frame with 30 rows (SNPs) and 663 columns. The first #'two columns contain the SNP name and contig; the remaining columns #'contain the allele dosages (integers in 0..6 or NA) of the SNPs in @@ -76,14 +100,56 @@ NULL #'@title pedigree -#'@description A data.frame with a pedigree +#'@description A data.frame with a pedigree; used in manuals, stored in +#'PolyHaplotyper_small.RData #'@docType data #'@keywords datasets -#'@name ped +#'@name phped #'@format a data.frame with 661 rows (samples) and #'4 columns: genotype, mother, father, sample_nr NULL +#'@title List of markers per haploblock +#'@description A list with for each haploblock the names of the markers it +#'contains; used in manuals, stored in PolyHaplotyper_small.RData +#'@docType data +#'@keywords datasets +#'@name phblocks +#'@format a list with 2 character vectors +NULL + +#'@title parents of FS families +#'@description A matrix with the names of the parents of each FS family; +#'used in manuals, stored in PolyHaplotyper_small.RData +#'@docType data +#'@keywords datasets +#'@name phpar +#'@format a matrix with 4 rows, 1 per FS family, and 2 columns, 1 for each +#'parent +NULL + +#'@title members of FS families +#'@description A list with for each FS family the names of the individuals it +#'contains; used in manuals, stored in PolyHaplotyper_small.RData +#'@docType data +#'@keywords datasets +#'@name phFS +#'@format a list with 4 vectors, each with the (here numeric) names of +#'the FS members +NULL + +#'@title haplotyping results +#'@description A list with the results of function inferHaplotypes; used in +#'manuals, stored in PolyHaplotyper_small.RData +#'@docType data +#'@keywords datasets +#'@name phresults +#'@format a list with 2 elements, one per haploblock; each element itself +#'a list with multiple components +NULL + +# Start of function definitions + #'@title get all haplotypes for the given markers #'@description Given a set of bi-allelic (SNP) marker names, generate all #'possible haplotypes @@ -94,6 +160,9 @@ NULL #'haplotypes in rows. 0: haplotype contains the non-dosage-counted marker #'allele (the reference allele); 1: haplotype contains the dosage-counted #'(alternative) marker allele. The colnames are the marker names. +#'@examples +#'# show the 8 possible haplotypes with 3 bi-allelic markers: +#'allHaplotypes(mrknames=c("mrkA", "mrkB", "mrkC")) #'@export allHaplotypes <- function(mrknames) { lst <- list() @@ -106,7 +175,7 @@ allHaplotypes <- function(mrknames) { eg } #allHaplotypes -getahcdir <- function(ahcdir) { +getahcdir <- function(ahcdir=NULL) { # for internal use only; get ahcdir from (1) param ahcdir or (2) working dir # result ends always with "/" so a filename can be pasted onto it if (is.null(ahcdir)) ahcdir <- getwd() else { @@ -132,8 +201,11 @@ loadAllHapcombLists <- function(ploidy, nmrk, ahcdir) { if (!is.list(lst) || length(lst) < 1) return(FALSE) if (!all(sapply(lst, class) == "list")) return(FALSE) if (length(lst[[length(lst)]]) == 0) return(FALSE) # in highest nmrk at least one mrkdid should be present - cl <- lapply(lst, FUN=function(x) sapply(x, class)) - if (!all(unlist(cl) == "matrix")) return(FALSE) + # cl <- lapply(lst, FUN=function(x) sapply(x, class)) + # if (!all(unlist(cl) == "matrix")) return(FALSE) + # due to change in class of matrix in R version 4: + cl <- lapply(lst, FUN=function(x) sapply(x, inherits, "matrix")) + if (!all(unlist(cl))) return(FALSE) cl <- lapply(lst, FUN=function(x) sapply(x, nrow)) if (!all(unlist(cl) == ploidy)) return(FALSE) lnames <- names(lst[[length(lst)]]) @@ -197,29 +269,29 @@ loadAllHapcombLists <- function(ploidy, nmrk, ahcdir) { ahcinfo } # loadAllHapcombLists -#'@title get all haplotype combinations that result in the given marker -#'dosages -#'@description get all haplotype combinations that result in the given -#'marker dosages -#'@usage getAllHapcomb(mrkDosage=NULL, mrkdid=NULL, nmrk, ahcinfo) -#'@param mrkDosage an integer vector with the dosages of each marker in one -#'individual; all dosages must be in 0:ahcinfo$ploidy or NA -#'Either mrkDosage or mrkdid must be specified, not both -#'@param mrkdid marker dosage ID: in combination with nmrk and ploidy specifying -#'the mrkDosage. Can be NA or an integer, numeric or character which must -#'specify an integer > 0 -#'Either mrkDosage or mrkdid must be specified, not both -#'@param nmrk the number of markers (should match mrkDosage if that is specified) -#'@param ahcinfo a list as returned by loadAllHapcombLists -#'@details Each column of the return value represents one combination of -#'haplotypes that yields the observed marker dosages. If any of the marker -#'dosages, or mrkdid, are NA a matrix with 0 columns is returned.\cr -#'This function makes use of precalculated lists ahcinfo$ahccompletelist -#'and ahcinfo$ahclist; the mrkdid must be present in one of them. -#'@return an integer matrix with <ploidy> rows and one column for each haplotype -#'combination that matches the marker dosages; the haplotype numbers -#'within a column are in increasing order -#'@export +# not exported function (relies on ahcinfo) +#_#@title get all haplotype combinations that result in the given marker +#_#'dosages +#_#'@description get all haplotype combinations that result in the given +#_#'marker dosages +#_#'@usage getAllHapcomb(mrkDosage=NULL, mrkdid=NULL, nmrk, ahcinfo) +#_#'@param mrkDosage an integer vector with the dosages of each marker in one +#_#'individual; all dosages must be in 0:ahcinfo$ploidy or NA +#_#'Either mrkDosage or mrkdid must be specified, not both +#_#'@param mrkdid marker dosage ID: in combination with nmrk and ploidy specifying +#_#'the mrkDosage. Can be NA or an integer, numeric or character which must +#_#'specify an integer > 0 +#_#'Either mrkDosage or mrkdid must be specified, not both +#_#'@param nmrk the number of markers (should match mrkDosage if that is specified) +#_#'@param ahcinfo a list as returned by loadAllHapcombLists +#_#'@details Each column of the return value represents one combination of +#_#'haplotypes that yields the observed marker dosages. If any of the marker +#_#'dosages, or mrkdid, are NA a matrix with 0 columns is returned.\cr +#_#'This function makes use of precalculated lists ahcinfo$ahccompletelist +#_#'and ahcinfo$ahclist; the mrkdid must be present in one of them. +#_#'@return an integer matrix with <ploidy> rows and one column for each haplotype +#_#'combination that matches the marker dosages; the haplotype numbers +#_#'within a column are in increasing order getAllHapcomb <- function(mrkDosage=NULL, mrkdid=NULL, nmrk, ahcinfo) { # mrkDosage has already been checked at start of inferHaplotypes, # mrkdid are computed @@ -324,47 +396,49 @@ getHapcombCount_1mrk <- function(mrkdosage, ploidy, nhap) { #'= 2^nmrk. The shorter vector of ploidy and nmrk is recycled. #'@return a vector with the number of possible haplotype combinations for #'each pair of ploidy and nmrk values +#'@examples +#'totHapcombCount(ploidy=4, nmrk=c(1:8)) #'@export totHapcombCount <- function(ploidy, nmrk) { nhap <- 2^nmrk choose(nhap + ploidy - 1, ploidy) } #totHapcombCount -#'@title generate all haplotype combinations -#'@description generate a list which contains for each marker dosage combination -#'at a given ploidy all matching haplotype combinations -#'@usage completeAllHaploComb(ploidy, nmrk, savesec=1800, printsec=60, -#'fname=NULL) -#'@param ploidy ploidy (may be even or odd) -#'@param nmrk number of markers in the haploblock -#'@param savesec default 1800: number of seconds between successive saves of -#'the intermediate results -#'@param printsec default 60: number of seconds between printout of -#'the current set of haplotypes (the last two are always equal at the time of -#'printing). NA or 0 suppresses printing. -#'@param fname filename (to which the extension .RData will be added) to store -#'the results. Intermediate saves will go to fname + the current set of -#'haplotypes; these intermediate files are temporary and will be deleted.\cr -#'If NULL (default), fname will be set to e.g. ahc_4x_nmrk6, where 4 is the -#'ploidy and nmrk = 6 -#'@details This function is used by build_ahccompletelist and would not -#'normally be called by the user -#'@examples -#'#Generate an ahccompletelist for a given ploidy, for 1 .. nmrk markers: -#'\dontrun{ -#'ahccompletelist <- list(1:nmrk) -#'for (n in seq_len(nmrk)) { -#' ahccompletelist[[n]] <- -#' completeAllHaploComb(ploidy=ploidy, nmrk=n) -#' save(ahccompletelist, file=paste0("ahccompletelist_", ploidy, "x.RData")) -#'} -#'Note that this takes lots of time and memory already for -#'ploidy 4, nmrk = 7 and ploidy 6, nmrk = 5 -#'} -#'@return a list with for each mrkdid (each combination of marker dosages) a -#'matrix with one row per haplotype and one column per haplotype combination, -#'containing the dosages of the haplotypes in each haplotype combination. -#'@export +# not exported function - users should use build_ahccompletelist() +#_#'@title generate all haplotype combinations +#_#'@description generate a list which contains for each marker dosage combination +#_#'at a given ploidy all matching haplotype combinations +#_#'@usage completeAllHaploComb(ploidy, nmrk, savesec=1800, printsec=60, +#_#'fname=NULL) +#_#'@param ploidy ploidy (may be even or odd) +#_#'@param nmrk number of markers in the haploblock +#_#'@param savesec default 1800: number of seconds between successive saves of +#_#'the intermediate results +#_#'@param printsec default 60: number of seconds between printout of +#_#'the current set of haplotypes (the last two are always equal at the time of +#_#'printing). NA or 0 suppresses printing. +#_#'@param fname filename (to which the extension .RData will be added) to store +#_#'the results. Intermediate saves will go to fname + the current set of +#_#'haplotypes; these intermediate files are temporary and will be deleted.\cr +#_#'If NULL (default), fname will be set to e.g. ahc_4x_nmrk6, where 4 is the +#_#'ploidy and nmrk = 6 +#_#'@details This function is used by build_ahccompletelist and would not +#_#'normally be called by the user +#_#'@examples +#_#'# Generate an ahccompletelist for a given ploidy, for 1 .. nmrk markers: +#_#'\dontrun{ +#_#'ahccompletelist <- list(1:nmrk) +#_#'for (n in seq_len(nmrk)) { +#_#' ahccompletelist[[n]] <- +#_#' completeAllHaploComb(ploidy=ploidy, nmrk=n) +#_#' save(ahccompletelist, file=paste0("ahccompletelist_", ploidy, "x.RData")) +#_#'} +#_#'Note that this takes lots of time and memory already for +#_#'ploidy 4, nmrk = 7 and ploidy 6, nmrk = 5 +#_#'} +#_#'@return a list with for each mrkdid (each combination of marker dosages) a +#_#'matrix with one row per haplotype and one column per haplotype combination, +#_#'containing the dosages of the haplotypes in each haplotype combination. completeAllHaploComb <- function(ploidy, nmrk, savesec=1800, printsec=60, fname=NULL) { if (is.null(fname)) fname <- paste0("ahc_", ploidy, "x_nmrk", nmrk) @@ -459,8 +533,7 @@ completeAllHaploComb <- function(ploidy, nmrk, savesec=1800, printsec=60, #'An ahccompletelist reduces the processing time of inferHaplotypes enormously #'but takes a long time to build. This should therefore be done when #'PolyHaplotyper will be used multiple times.\cr -#'If an ahccompletelist file already exists in the working directory, or else -#'in the directory specified by options("PolyHaplotyper_ahcdir"), this file is +#'If an ahccompletelist file already exists in the working directory, this file is #'used as starting point; if maxmrk is larger than the length of the existing #'list only the additional items are calculated.\cr #'If this function crashes (due to exceeding the memory limits of the computer @@ -468,9 +541,7 @@ completeAllHaploComb <- function(ploidy, nmrk, savesec=1800, printsec=60, #'contains an intact version of the ahccompletelist for the last finished #'marker count. Also, the temporary file #'"buildAHCcompletelist_<datetime>"_"<nmrk>.RData" contains a part of the -#'list item for the number of markers being calculated at that moment. This -#'file is produced by function completeAllHaploComb, which does the actual -#'computations.\cr +#'list item for the number of markers being calculated at that moment.\cr #'Note that this function takes lots of time and memory already for #'ploidy 4 / maxmrk 7 and ploidy 6 / maxmrk 5, and is not practicable above #'ploidy 4 / maxmrk 8, ploidy 6 / maxmrk 6, ploidy 8 / maxmrk 5. Also, the @@ -479,18 +550,23 @@ completeAllHaploComb <- function(ploidy, nmrk, savesec=1800, printsec=60, #'not be taken larger than necessary. #'@return NULL (invisible); the actual result is a file in the working directory #'named ahccompletelist_<ploidy>x.RData +#'@examples +#'\donttest{ +#'# this example will create a file ahccompletelist_4x.RData +#'# in the working directory: +#'# for ploidy=4x and 1 - 4 markers: +#'build_ahccompletelist(ploidy=4, maxmrk=4, overwrite=FALSE) +#'} #'@export build_ahccompletelist <- function(ploidy, maxmrk, savesec=1800, printsec=300, overwrite, shorten=FALSE) { fname <- paste0("ahccompletelist_", ploidy, "x.RData") - tmpfname <- paste0("buildAHCcompletelist_",Sys.time(),"_") + tmpfname <- paste0("buildAHCcompletelist_", gsub("[^0-9]", "", Sys.time()), "_") ahccompletelist <- NULL - ad <- getwd() if (missing(overwrite) || length(overwrite) != 1 || is.na(overwrite) || !is.logical(overwrite)) stop("overwrite must be FALSE or TRUE") if (file.exists(fname)) { # in working directory - otherloc <- FALSE x <- load(fname) # contains ahccompletelist if (length(x) != 1 || x !="ahccompletelist") stop(paste("the existing file", fname, "does not contain (only) ahccompletelist")) @@ -500,22 +576,16 @@ build_ahccompletelist <- function(ploidy, maxmrk, savesec=1800, printsec=300, return(invisible(NULL)) } if (!overwrite) - stop(paste0("file ", fname, "already exists in working directory; aborted")) + stop(paste("file", fname, "already exists in working directory; aborted")) } else { - ad <- getahcdir() - otherloc <- file.path(ad) != file.path(getwd()) - if (file.exists(paste0(ad, fname))) { - load(paste0(ad, fname)) # contains ahccompletelist - } else { - ahccompletelist <- vector(0, mode="list") - } + ahccompletelist <- vector(0, mode="list") } newmaxmrk <- ifelse(shorten, maxmrk, length(ahccompletelist)) if (length(ahccompletelist) >= maxmrk) { ahccompletelist <- ahccompletelist[1:newmaxmrk] - file.rename(from=fname, to=paste0(fname,".backup")) + if (file.exists(fname)) file.rename(from=fname, to=paste0(fname,".backup")) save(ahccompletelist, file=fname, version=2) - file.remove(paste0(fname,".backup")) + if (file.exists(paste0(fname,".backup"))) file.remove(paste0(fname,".backup")) } else { if (totHapcombCount(ploidy=ploidy, nmrk=maxmrk) > 2e9) { ans <- readline("this will result in a very large list and may crash the computer; proceed (y/n)? ") @@ -523,21 +593,20 @@ build_ahccompletelist <- function(ploidy, maxmrk, savesec=1800, printsec=300, stop("aborted") } for (nmrk in (length(ahccompletelist)+1):maxmrk) { - cat(paste0("starting block size ", nmrk, " markers/n")) - file.rename(from=fname, to=paste0(fname,".backup")) + cat(paste0("starting block size ", nmrk, " markers\n")) + if (file.exists(fname)) file.rename(from=fname, to=paste0(fname,".backup")) ahccompletelist[[nmrk]] <- completeAllHaploComb(ploidy=ploidy, nmrk=nmrk, savesec=savesec, printsec=printsec, fname=paste0(tmpfname, nmrk)) save(ahccompletelist, file=fname, version=2) - cat(paste0("saved ", fname, "for block sizes up to ", nmrk, "markers\n")) - file.remove(paste0(tmpfname, nmrk, ".RData")) #left by completeAllHaploComb - file.remove(paste0(fname,".backup")) + cat(paste("saved", fname, "for block sizes up to", nmrk, "markers\n")) + fn <- paste0(tmpfname, nmrk, ".RData") + if (file.exists(fn)) file.remove(fn) #left by completeAllHaploComb + fn <- paste0(fname,".backup") + if (file.exists(fn)) file.remove(fn) } } - if (otherloc) - message(paste0("The new ", fname, "is located in the working directory,\n", - "but options()$PolyHaplotyper_ahcdir still points to the old version\n")) invisible(NULL) } #buildAHCcompletelist @@ -558,6 +627,16 @@ build_ahccompletelist <- function(ploidy, maxmrk, savesec=1800, printsec=300, #'from allhap #'@details if hapdos contains NA values, all values in the corresponding #'column of the result will also be NA +#'@examples +#'# get a matrix of all haplotypes with the 3 specified markers: +#'ah <- allHaplotypes(mrknames=c("mrkA", "mrkB", "mrkC")) +#'# specify haplotype dosages of 4 tetraploid individuals, +#'# only the 3 occurring haplotypes (1, 5 and 6) are given: +#'haplodosg <- +#' matrix(c(1,2,1, 4,0,0, 0,4,0, 0,0,4), nrow=3, +#' dimnames=list(paste0("demohap_", c(1,5,6)), paste0("indiv", 1:4))) +#'# calculate the corresponding marker (SNP) dosages: +#'hapdos2mrkdos(hapdos=haplodosg, allhap=ah) #'@export hapdos2mrkdos <- function(hapdos, allhap) { if (is.vector(hapdos)) hapdos <- as.matrix(hapdos) @@ -568,6 +647,8 @@ hapdos2mrkdos <- function(hapdos, allhap) { } #hapdos2mrkdos hapcomb2mrkdos <- function(hapcomb, allhap) { + # similar to hapdos2mrkdos, but with haplotype combinations specified as + # <ploidy> haplotype id's instead of the dosages of each haplotype if (is.vector(hapcomb)) hapcomb <- as.matrix(hapcomb) if (!is.matrix(allhap)) stop("allhap must be a matrix") if (!all(hapcomb %in% seq_len(nrow(allhap)))) @@ -630,6 +711,14 @@ split_hapnames <- function(hapnames) { #'@param hbname the haploblock name (only used when hapdos has 0 rows) #'@return a matrix similar to hapdos with rows for the dropped haplotypes #'re-inserted, with the correct rownames and containing only 0 and NA values +#'@examples +#'# specify haplotype dosages of 4 tetraploid individuals, +#'# only the 3 occurring haplotypes (1, 5 and 6) are given: +#'haplodosg <- matrix(c(1,2,1, 4,0,0, 0,4,0, 0,0,4), nrow=3, +#' dimnames=list(paste0("demohap_", c(1,5,6)), paste0("indiv", 1:4))) +#'# add the rows for the absent haplotypes, assuming the haploblock consists +#'# of 3 markers, so 8 haplotypes are possible: +#'expandHapdos(hapdos=haplodosg, nhap=8) #'@export expandHapdos <- function(hapdos, nhap, hbname="x") { if (!is.matrix(hapdos)) stop("invalid hapdos, must be matrix") @@ -672,6 +761,15 @@ expandHapdos <- function(hapdos, nhap, hbname="x") { #'@return a sorted vector of length ploidy with all haplotype numbers present, #'or a matrix where each column is such a vector, with the same #'colnames as hapdos +#'@examples +#'# specify haplotype dosages of 4 tetraploid individuals, +#'# only the 3 occurring haplotypes (1, 5 and 6) are given: +#'haplodosg <- matrix(c(1,2,1, 4,0,0, 0,4,0, 0,0,4), nrow=3, +#' dimnames=list(paste0("demohap_", c(1,5,6)), paste0("indiv", 1:4))) +#'# usage with hapdos as matrix: +#'hapdos2hapcomb(hapdos=haplodosg, ploidy=4) +#'# usage with hapdos as vector: +#'hapdos2hapcomb(hapdos=haplodosg[, 1], ploidy=4) #'@export hapdos2hapcomb <- function(hapdos, ploidy) { fn <- function(x, hapnrs) unlist(mapply(rep, hapnrs, x)) @@ -711,6 +809,13 @@ hapdos2hapcomb <- function(hapdos, ploidy) { #'@return matrix with nhap rows (one row for each possible haplotype) and as many #'columns as in hapmat, giving the dosages of each haplotype in each column #'of hapmat. +#'@examples +#'# specify haplotype combinations of 2 individuals: +#'haplocomb <- matrix(c(1,5,5,6, 5,6,6,6), ncol=2, +#' dimnames=list(NULL, c("indiv1", "indiv2"))) +#'# convert to dosage matrix, +#'# assuming haploblock has 3 markers, so 8 possible haplotypes: +#'hapcomb2hapdos(hapcomb=haplocomb, nhap=8) #'@export hapcomb2hapdos <- function(hapcomb, nhap) { if (!is.matrix(hapcomb)) hapcomb <- matrix(hapcomb, ncol=1) @@ -828,33 +933,33 @@ findcomb_wReplacement <- function(n, k, p, check=TRUE) { delimiterpos2comb(delimpos, k) } # findcomb_wReplacement -#'@title Find the index number of a combination-with-replacement -#'@description Find the index number of a combination-with-replacement assuming -#'the combinations are ordered -#'@usage findpos_wReplacement(n, comb, check=TRUE) -#'@param n a single integer > 0: the number of different object types -#'(which are numbered 1 to n) available -#'@param comb an integer vector representing the combination of objects; -#'sorted from small to large; all items must be in 1:n -#'@param check if TRUE (default) the input data are checked against -#'incorrect length or values and an eror is generated if needed; also -#'comb is sorted first. If FALSE and n<=0 or comb not sorted correctly -#'or any elements of comb not in 1:n an incorrect value -#'may be returned without warning. -#'@details Assume that all possible combinations-with-replacement are ordered like -#'(for n=3, k=3) 111, 112, 113, 122, 123, 133, 222, 223, 233, 333 (there are -#'choose(n+k-1, k) such combinations). This function returns the index number -#'of a given combination in this sequence.\cr -#'If comb == integer(0), i.e. an empty combination, the index number is 1 -#'as there is just one way of selecting 0 elements from any collection.\cr -#'This function relies on integer accuracy. Above 2^31-1 (2.1e9) R uses doubles -#'instead of integers and at some point exact -#'accuracy is lost. This can be overcome by using integer64 from package bit64 -#'for the intermediate (ix) and returned results (and an integer64 - adapted -#'version of choose) - the maximum value is then 2^63-1 or 9.2e18. -#'@return an integer: the position (index number) of the given combination, -#'1-based. -#'@export +# don't export this function +#_#'@title Find the index number of a combination-with-replacement +#_#'@description Find the index number of a combination-with-replacement assuming +#_#'the combinations are ordered +#_#'@usage findpos_wReplacement(n, comb, check=TRUE) +#_#'@param n a single integer > 0: the number of different object types +#_#'(which are numbered 1 to n) available +#_#'@param comb an integer vector representing the combination of objects; +#_#'sorted from small to large; all items must be in 1:n +#_#'@param check if TRUE (default) the input data are checked against +#_#'incorrect length or values and an eror is generated if needed; also +#_#'comb is sorted first. If FALSE and n<=0 or comb not sorted correctly +#_#'or any elements of comb not in 1:n an incorrect value +#_#'may be returned without warning. +#_#'@details Assume that all possible combinations-with-replacement are ordered like +#_#'(for n=3, k=3) 111, 112, 113, 122, 123, 133, 222, 223, 233, 333 (there are +#_#'choose(n+k-1, k) such combinations). This function returns the index number +#_#'of a given combination in this sequence.\cr +#_#'If comb == integer(0), i.e. an empty combination, the index number is 1 +#_#'as there is just one way of selecting 0 elements from any collection.\cr +#_#'This function relies on integer accuracy. Above 2^31-1 (2.1e9) R uses doubles +#_#'instead of integers and at some point exact +#_#'accuracy is lost. This can be overcome by using integer64 from package bit64 +#_#'for the intermediate (ix) and returned results (and an integer64 - adapted +#_#'version of choose) - the maximum value is then 2^63-1 or 9.2e18. +#_#'@return an integer: the position (index number) of the given combination, +#_#'1-based. findpos_wReplacement <- function(n, comb, check=TRUE) { #this function is used to assign an ID (index nr) to haplotype combinations #for easy comparing and countig in solveOneBlock @@ -921,6 +1026,15 @@ mrkdidfun <- function(dos, ploidy) { #'@return a vector of marker dosage IDs, one for each column of mrkDosage: #'each a number in 1:((ploidy+1)^nrow(mrkDosage)), NA for each column in #'dosages where any of the dosages are NA +#'@examples +#'# dosages of 3 markers in 3 tetraploid individuals: +#'mrkdosg <- +#' matrix(c(1,2,2, 4,0,0, 3,0,2), nrow=3, +#' dimnames=list(c("mrkA", "mrkB", "mrkC"), c("indiv1", "indiv2", "indiv3"))) +#'# get the "marker dosage IDs": +#'dids <- mrkdos2mrkdid(mrkDosage=mrkdosg, ploidy=4) +#'# convert dids back to marker dosages: +#'mrkdid2mrkdos(dosageIDs=dids, nmrk=3, ploidy=4, mrknames=c("mrkA", "mrkB", "mrkC")) #'@export mrkdos2mrkdid <- function(mrkDosage, indiv=NULL, ploidy, check=TRUE) { if (check) { @@ -942,6 +1056,16 @@ mrkdos2mrkdid <- function(mrkDosage, indiv=NULL, ploidy, check=TRUE) { #'@return a matrix with in columns the marker dosages corresponding to the #'marker dosageIDs, with these mrkdids as colnames, and one row per marker, #'with marker names as rownames if mrknames are specified +#' +#'@examples +#'# dosages of 3 markers in 3 tetraploid individuals: +#'mrkdosg <- +#' matrix(c(1,2,2, 4,0,0, 3,0,2), nrow=3, +#' dimnames=list(c("mrkA", "mrkB", "mrkC"), c("indiv1", "indiv2", "indiv3"))) +#'# get the "marker dosage IDs": +#'dids <- mrkdos2mrkdid(mrkDosage=mrkdosg, ploidy=4) +#'# convert dids back to marker dosages: +#'mrkdid2mrkdos(dosageIDs=dids, nmrk=3, ploidy=4, mrknames=c("mrkA", "mrkB", "mrkC")) #'@export mrkdid2mrkdos <- function(dosageIDs, nmrk, ploidy, mrknames=NULL) { mrkdos <- matrix(NA_integer_, nrow=nmrk, ncol=length(dosageIDs)) @@ -999,6 +1123,9 @@ haplofrqMinMax <- function(hapdos) { #'if 0 (default) the largest value in x will be used #'@return a character vector representing the values of x left-padded with 0's #'to the length of integer maxx or of max(x) +#'@examples +#'padded(c(21, 1, 121, NA, 0)) +#'padded(c(21, 1, 121, NA, 0), maxx=1000) #'@export padded <- function(x, maxx=0) { formatC(x, width=nchar(max(x, maxx, na.rm=TRUE)), flag="0") @@ -1085,6 +1212,17 @@ hapcomb_from_IHlist <- function(ihl, mrkdids, ahcinfo, nocombs=FALSE) { #'selected rows in order of markers (or all columns or rows in the original #'order if indiv or markers are not specified), with #'names of individuals as column names, marker names as row names +#'@examples +#'mrkdos <- data.frame( +#' marker=paste0("mrk", 1:3), +#' indiv1=c(1, 2, 2), +#' indiv2=c(4, 0, 0), +#' indiv3=c(3, 0, 2)) +#'# use all rows and columns: +#'checkmrkDosage(mrkDosage=mrkdos, ploidy=4) +#'# use only first and last row and column and change the order: +#'checkmrkDosage(mrkDosage=mrkdos, ploidy=4, indiv=c("indiv3", "indiv1"), +#' markers=c("mrk3", "mrk1")) #'@export checkmrkDosage <- function(mrkDosage, ploidy, indiv=NULL, markers=NULL, generateMarkernames=TRUE) { @@ -3545,7 +3683,8 @@ addNewMrkdids <- function(newMrkdids, ahcinfo, progress) { #'for haploblocks from 1 up to some maximum number of markers. This list can be #'computed with function build_ahccompletelist. If this list is not available #'(or is some haploblocks contain more markers than the list), the ahc for -#'the (extra) marker +#'the (extra) marker.\cr +#'See the PolyHaplotyper vignette for an illustrated explanation. #'@return a list with for each haploblock one item that itself is a list #'with items:\cr #'message; if this is "" the haploblock is processed and further @@ -3586,6 +3725,15 @@ addNewMrkdids <- function(newMrkdids, ahcinfo, progress) { #' models in case none was selected\cr #'If for new combinations of marker dosages the possible haplotype combinations #'have to be calculated, an ahclist file is written to ahcdir +#'@examples +#'\donttest{ +#'# this example takes about 1 minute to run: +#'data(PolyHaplotyper_small) +#'results <- inferHaplotypes(mrkDosage=phdos, ploidy=6, +#'haploblock=phblocks, parents=phpar, FS=phFS) +#'names(results) +#'names(results[[1]]) +#'} #'@export inferHaplotypes <- function(mrkDosage, indiv=NULL, ploidy, haploblock, parents=NULL, FS=NULL, @@ -3671,6 +3819,12 @@ inferHaplotypes <- function(mrkDosage, indiv=NULL, ploidy, #'combinations result in the same marker dosages)\cr #'usedhap: a matrix with the dosage (0 or 1) of each marker in each of the #'used haplotypes; haplotype nrs in columns, markers in rows +#'@examples +#'data(PolyHaplotyper_small) +#'# show the results of the first FS family in the first haploblock: +#'showOneFS(FSnr=1, hbresults=phresults[[1]], mrkDosage=phdos, +#' FS=phFS, parents=phpar) +#' #'@export showOneFS <- function(FSnr, hbresults, mrkDosage, FS, parents) { if (!all(colnames(hbresults$hapdos) %in% colnames(mrkDosage))) @@ -3789,6 +3943,11 @@ getHaploblockResults <- function(hapresults, haploblock) { #'of dropUnused does not affect this function #'@return an array with all possible haplotypes. The haplotypes are in columns, #'with the haplotype numbers as colnames; the markers are in rows. +#'@examples +#'data(PolyHaplotyper_small) +#'# show the composition of all possible haplotypes with the markers +#'# in the first haploblock: +#'allhap(hapresults=phresults, haploblock=1) #'@export allhap <- function(hapresults, haploblock) { hapresults <- getHaploblockResults(hapresults, haploblock) @@ -3797,18 +3956,6 @@ allhap <- function(hapresults, haploblock) { r } # allhap -# getHapnrs <- function(hapnames) { -# # get the haplotype numbers present in hapnames, -# # assuming that all hapnames have equal length and the haplotype number -# # is a zero-padded string suffixed to a common haploblock name -# # separated by "_" -# # #Used to get the haplotypes present in a hapdos matrix produced -# # with dropUnused=TRUE -# pos <- unlist(gregexpr("_", hapnames[1])) -# pos <- pos[length(pos)] + 1 -# as.integer(substr(hapnames, pos, nchar(hapnames[1]))) -# } #getHapnrs - #'@title Find all used (inferred) haplotypes #'@description Find all haplotypes for a haploblock that were inferred to be #'present in the population (i.e. all haplotypes used for haplotyping any of @@ -3825,6 +3972,11 @@ allhap <- function(hapresults, haploblock) { #'@return an array with the haplotypes that are used in the #'population. The haplotypes are in columns, with the haplotype numbers as #'colnames; the markers are in rows. +#'@examples +#'data(PolyHaplotyper_small) +#'# show the composition of haplotypes inferred to be present +#'# in the first haploblock: +#'usedhap(hapresults=phresults, haploblock=1) #'@export usedhap <- function(hapresults, haploblock) { hapresults <- getHaploblockResults(hapresults, haploblock) @@ -3844,7 +3996,7 @@ usedhap <- function(hapresults, haploblock) { #'of a set of replicates #'@param solveConflicts if TRUE (default) and there are conflicting dosage #'assignments between replicates for the same marker, the one with highest -#'frequency is used, provided the total freq of other dosages <= 1 OR +#'frequency is used, provided the total freq of other dosages = 1 OR #'<= 10\% of the frequency of the most frequent dosage. If #'solveConflicts is FALSE and there are conflicting dosages, the consensus for #'that marker will be NA @@ -3856,6 +4008,23 @@ usedhap <- function(hapresults, haploblock) { #'is retained; this column (the first in its set as specified in replist) #'now has the consensus scores over all replicates. Also, if mrkDosage was a #'data.frame, it is converted into a matrix. +#'@examples +#'# construct a dosage matrix with some missing data: +#'dosmat <- +#' matrix(c(rep(c(3,0,1),3), rep(c(1,1,2),4)), nrow=3, +#' dimnames=list(c("mrk1","mrk2","mrk3"), +#' c("a1","a2","a3", "b1","b2","b3","b4"))) +#'ix <- matrix(c(1,1, 3,1, 2,2, 1,3, 2,4, 1,5, 2,6), ncol=2, byrow=TRUE) +#'dosmat[ix] <- NA +#'dosmat +#'# define 2 sets of replicates: +#'reps <- list(c("a1","a2","a3"), c("b1","b2","b3","b4")) +#'# merge: +#'mergeReplicates(mrkDosage=dosmat, replist=reps) +#'# introduce a conflicting dosage: +#'dosmat[3,2] <- 2 +#'# merge: +#'mergeReplicates(mrkDosage=dosmat, replist=reps) #'@export mergeReplicates <- function(mrkDosage, replist, solveConflicts=TRUE) { mrkDosage <- checkmrkDosage(mrkDosage) @@ -3874,24 +4043,24 @@ mergeReplicates <- function(mrkDosage, replist, solveConflicts=TRUE) { mrkDosage } #mergeReplicates -#'@title get consensus marker dosages from one or more samples -#'@description get consensus marker dosages from one or more samples -#'@usage getConsensusmrkDosage(mrkDosage, indiv, solveConflicts=TRUE) -#'@param mrkDosage a dosage matrix with markers in rows and individuals in columns. -#'row names are marker names, column names are individual names. -#'@param indiv character vector with the names of the samples from which to -#'obtain consensus scores -#'@param solveConflicts if TRUE (default) and there are conflicting dosage -#'assignments between the samples for the same marker, the one with highest -#'frequency is used, provided the total freq of other dosages <= 1 OR -#'<= 10\% of the frequency of the maximum dosage. If -#'solveConflicts is FALSE and there are conflicting dosages, the consensus for -#'that marker will be NA -#'@details getConsensusmrkDosage is primarily used by mergeReplicates -#'but can also be used by itself -#'@return a vector with one consensus dosage for each row of mrkDosage, and the -#'row names of mrkDosage as names -#'@export +# not exported: user function is mergeReplicates +#_#'@title get consensus marker dosages from one or more samples +#_#'@description get consensus marker dosages from one or more samples +#_#'@usage getConsensusmrkDosage(mrkDosage, indiv, solveConflicts=TRUE) +#_#'@param mrkDosage a dosage matrix with markers in rows and individuals in columns. +#_#'row names are marker names, column names are individual names. +#_#'@param indiv character vector with the names of the samples from which to +#_#'obtain consensus scores +#_#'@param solveConflicts if TRUE (default) and there are conflicting dosage +#_#'assignments between the samples for the same marker, the one with highest +#_#'frequency is used, provided the total freq of other dosages <= 1 OR +#_#'<= 10\% of the frequency of the maximum dosage. If +#_#'solveConflicts is FALSE and there are conflicting dosages, the consensus for +#_#'that marker will be NA +#_#'@details getConsensusmrkDosage is primarily used by mergeReplicates +#_#'but can also be used by itself +#_#'@return a vector with one consensus dosage for each row of mrkDosage, and the +#_#'row names of mrkDosage as names getConsensusmrkDosage <- function(mrkDosage, indiv, solveConflicts=TRUE) { indiv <- as.character(indiv) if (is.null(colnames(mrkDosage))) @@ -3924,18 +4093,18 @@ getConsensusmrkDosage <- function(mrkDosage, indiv, solveConflicts=TRUE) { mindos } #getConsensusmrkDosage -#'@title get all FS haplotype combinations expected from two parental haplotype -#'combinations -#'@description get all FS haplotype combinations expected from two parental -#'haplotype combinations, no considering DR, and without their frequencies -#'@usage getFScombs(parcomb) -#'@param parcomb matrix with one column for each parent and <ploidy> rows, -#'giving the haplotype combinations of the parents -#'@return a matrix with <ploidy> rows and one column for each -#'haplotype combination that can be generated from these two parents, -#'assuming polysomic inheritance with no double reduction.\cr -#'The colnames are NULL: haplotype combinations are not named. -#'@export +# not exported +#_#'@title get all FS haplotype combinations expected from two parental haplotype +#_#'combinations +#_#'@description get all FS haplotype combinations expected from two parental +#_#'haplotype combinations, no considering DR, and without their frequencies +#_#'@usage getFScombs(parcomb) +#_#'@param parcomb matrix with one column for each parent and <ploidy> rows, +#_#'giving the haplotype combinations of the parents +#_#'@return a matrix with <ploidy> rows and one column for each +#_#'haplotype combination that can be generated from these two parents, +#_#'assuming polysomic inheritance with no double reduction.\cr +#_#'The colnames are NULL: haplotype combinations are not named. getFScombs <- function(parcomb) { gamP1 <- unique(getAllGametes(parcomb[, 1]), MARGIN=2) gamP2 <- unique(getAllGametes(parcomb[, 2]), MARGIN=2) @@ -4002,6 +4171,13 @@ getHapcombFreq <- function(HapcombMatrix) { #'to right, first on row 1, then on row 2 etc\cr #'freq: a vector of length ncol(hapcomb), with for each gamete in hapcomb its #'frequency (0.0 ... 1.0) +#'@examples +#'# specify combination of haplotypes in a tetraploid parent: +#'hapcomb <- c(2,2,5,6) # 2 copies of haplotype 2, 1 each of 5 and 6 +#'# gamete frequencies without double reduction: +#'getGameteFreqs(parhac=hapcomb, DRrate=0) +#'# gamete frequencies with 5\% double reduction: +#'getGameteFreqs(parhac=hapcomb, DRrate=0.05) #'@export getGameteFreqs <- function(parhac, DRrate) { noDR <- getHapcombFreq(getAllGametes(parhac)) @@ -4077,13 +4253,19 @@ getGameteFreqs <- function(parhac, DRrate) { #'(each with a frequency of 0.16) #'@return a list of 2 elements:\cr #'FShac: a matrix with one column per unique FS haplotype combination and -#'the same row count as parhac, giving the FS haplotype combinations. -#'The order of the columns in the matrix is the "natural" order. There are +#'the same row count as parhac, giving the FS haplotype combinations. There are #'no duplicated columns but several (not necessarily adjacent) columns #'may correspond to the same mrkdid\cr #'freq: a vector of length ncol(hapcomb), with for each FS haplotype combination #'in hapcomb its frequency (0.0 ... 1.0). The colnames are NULL: haplotype #'combinations are not named. +#'@examples +#'# specify combinations of haplotypes in two tetraploid parents: +#'hapcomb <- matrix(c(2,2,5,6, 1,2,5,5), ncol=2) +#'# FS frequencies without double reduction: +#'getFSfreqs(parhac=hapcomb, DRrate=0) +#'# FS frequencies with 5\% double reduction: +#'getFSfreqs(parhac=hapcomb, DRrate=0.05) #'@export getFSfreqs <- function(parhac, DRrate) { gamP1 <- getGameteFreqs(parhac[, 1], DRrate) @@ -4118,7 +4300,10 @@ getFSfreqs_gamfrq <- function(gamP1, gamP2) { uhx <- c(uniqcomb, length(du)+1) for (u in seq_along(uniqfreq)) uniqfreq[u] <- sum(FSfreq[uhx[u]:(uhx[u+1]-1)]) - list(FShac=FScomb[, uniqcomb, drop=FALSE], freq=uniqfreq) + #remove the duplicated columns and the column names: + FScomb <- FScomb[, uniqcomb, drop=FALSE] + colnames(FScomb) <- NULL + list(FShac=FScomb, freq=uniqfreq) } #getFSfreqs_gamfrq checkFS_TwoParents <- function(pargamlist, FShac) { @@ -4447,6 +4632,15 @@ pedigreeHapCheck_OneHaploblock <- function(ped, parentnames, mrkDosage, #'Both ped_arr and parents_arr contain all haploblocks in haploblock, also #'those skipped because of too many markers and those without any haplotyped #'individuals. These can be excluded by excluding them from the haploblock list. +#'@examples +#'data(PolyHaplotyper_small) +#'phchk <- pedigreeHapCheck(ped=phped, mrkDosage=phdos, haploblock=phblocks, +#' hapresults=phresults) +#'# show the top of the ped_arr for haploblock 1: +#'phchk$ped_arr[1:6,,1] +#'# show the top of the parents_arr for haploblock 1: +#'phchk$parents_arr[1:6,,1] + #'@export pedigreeHapCheck <- function(ped, mrkDosage, haploblock, hapresults) { @@ -4564,6 +4758,10 @@ pedigreeHapCheck <- function(ped, mrkDosage, haploblock, #' can indicate a failure to find a solution for the FS family but may also #' describe less significant problems, such as some progeny with unexpected #' marker dosages etc. +#'@examples +#'data(PolyHaplotyper_small) +#'overviewByFS(haploblock=phblocks, parents=phpar, FS=phFS, +#' hapresults=phresults) #'@export overviewByFS <- function(haploblock, parents, FS, hapresults) { hbnames <- names(haploblock) @@ -4795,6 +4993,13 @@ calcFSstats <- function(ovwFS, haploblocks) { #'once, even if some parents produced more than one FS family), which are not #'included in any of the other rows; therefore "all" is not equal to the sum of #'the rows above. +#'@examples +#'data(PolyHaplotyper_small) +#'phchk <- pedigreeHapCheck(ped=phped, mrkDosage=phdos, haploblock=phblocks, +#' hapresults=phresults) +#'phovw <- overviewByFS(haploblock=phblocks, parents=phpar, FS=phFS, +#' hapresults=phresults) +#'calcStatistics(pedchk=phchk, ovwFS=phovw) #'@export calcStatistics <- function(pedchk, ovwFS, indiv, haploblocks) { if (missing(indiv)) indiv <- NULL @@ -4832,6 +5037,13 @@ calcStatistics <- function(pedchk, ovwFS, indiv, haploblocks) { #'The column with haplotype count NA (if any) shows the haploblocks for which #'no haplotype solution was found (the reason for that would usually be found #'in column 1 of ovwFS$messages) +#'@examples +#'data(PolyHaplotyper_small) +#'phovw <- overviewByFS(haploblock=phblocks, parents=phpar, FS=phFS, +#' hapresults=phresults) +#'# in this small dataset there are only 2 haploblocks, each with 4 markers: +#'calcMrkHaptable(ovwFS=phovw) +#'# in both haploblocks 5 haplotypes are inferred #'@export calcMrkHaptable <- function(ovwFS){ table(ovwFS$ovw[ ,1], ovwFS$ovw[, 2], useNA="ifany", @@ -4854,6 +5066,12 @@ calcMrkHaptable <- function(ovwFS){ #'in another column. The markers and haploblocks columns may be character, #'factor or numeric, and the columns may be indicated by name or number. #'@return the desired list +#'@examples +#'df1 <- data.frame( +#' marker=paste0("mrk",1:9), +#' block=LETTERS[c(1,2,3,2,3,1,1,2,2)], +#' extracol=runif(9)) +#'haploblock_df2list(df=df1, mrkcol="marker", hbcol=2) #'@export haploblock_df2list <- function(df, mrkcol, hbcol, sorted=TRUE) { uhb <- unique(df[, hbcol]) @@ -4902,6 +5120,11 @@ haploblock_df2list <- function(df, mrkcol, hbcol, sorted=TRUE) { #'The contents of file fname can be pasted into the input data box of #'the ShesisPlus web interface at http://shesisplus.bio-x.cn/SHEsis.html #'@return a data.frame in the described ShesisPlus input format +#'@examples +#'data(PolyHaplotyper_small) +#'SSPin <- make.ShesisPlus.input(mrkDosage=phdos, markers=phblocks[[1]], +#' ploidy=6) +#'SSPin[1:6,1:8] #'@export make.ShesisPlus.input <- function (mrkDosage, indiv=NULL, markers, ploidy, phenotype=0, fname="") { @@ -4926,10 +5149,11 @@ make.ShesisPlus.input <- function (mrkDosage, indiv=NULL, markers, #'@title Read the haplotyping results from the ShesisPlus output #'@description Read the haplotyping results from the ShesisPlus output -#'@usage read.ShesisPlus.output(fname, order.by=c("", "hapnr", "count")[2]) -#'@param fname filename of a text file with ShesisPlus output (as copied from +#'@usage read.ShesisPlus.output(SSPout, order.by=c("", "hapnr", "count")[2]) +#'@param SSPout filename of a text file with ShesisPlus output (as copied from #'the web page produced by running ShesisPlus web interface -#'at http://shesisplus.bio-x.cn/SHEsis.html) +#'at http://shesisplus.bio-x.cn/SHEsis.html); or a character vector with the +#'same output #'@param order.by how to order the rows of the hapstat data.frame. "hapnr" means #'ordering by haplotype number, "count" means ordering by decreasing Total.count, #'anything else results in no reordering. By default order by hapnr. @@ -4943,12 +5167,39 @@ make.ShesisPlus.input <- function (mrkDosage, indiv=NULL, markers, #'according to an email from Zhiqiang Li of 13-04-2020) BETA: Regression coefficient, #'SE: Standard error, R2: Regression r-squared, T: t-distribution statistics, #'P: p-value -#' +#'@examples +#'# we give a typical SSP output as character vector; instead we could also +#'# give the name of a text file +#'SSPout <- c( +#' " Please cite:", +#' "", +#' " Shen, J. et al. SHEsisPlus, a toolset for genetic studies ...", +#' " Shi, Y. et al. SHEsis, a powerful software platform ...", +#' " Li, Z. et al. A partition-ligation-combination-subdivision ...", +#' "", +#' "if you find this tool useful in your research. Thanks!", +#' "", +#' "Haplotype Analysis:", +#' "", +#' "Haplotypes with frequency <0.03 are ignored.", +#' "Loci chosen for haplotype analysis: m1, m2, m3, m4", +#' "Haplotype Total count Beta SE R2 T p", +#' "1122 2232 0.002 0.01 1.01e-04 0.252 0.8", +#' "1222 230 -0.019 0.017 0.002 -1.15 0.25", +#' "1221 152 -0.01 0.024 2.94e-04 -0.43 0.667", +#' "2222 288 0.008 0.02 2.93e-04 0.429 0.667", +#' "1121 142 -0.009 0.022 2.81e-04 -0.42 0.674" +#') +#'read.ShesisPlus.output(SSPout) #'@export -read.ShesisPlus.output <- function(fname, order.by=c("", "hapnr", "count")[2]) { - order.by <- order.by[1] +read.ShesisPlus.output <- function(SSPout, order.by=c("", "hapnr", "count")[2]) { if (missing(order.by) || is.null(order.by) || is.na(order.by)) order.by <- "" - txt <- readLines(fname) + order.by <- order.by[1] + fromfile <- length(SSPout)==1 && file.exists(SSPout) + if (!fromfile && !(is.character(SSPout) && length(SSPout)>1)) + stop("SSPout must be a text file name or a character vector") + if (fromfile) con <- file(SSPout) else con=textConnection(SSPout) + txt <- readLines(con) i <- grep("Loci chosen for haplotype analysis:", txt, value=FALSE) markernames <- character(0) if (length(i) == 1) { @@ -4965,8 +5216,10 @@ read.ShesisPlus.output <- function(fname, order.by=c("", "hapnr", "count")[2]) { # find out if there is an LD section after the statistics: LD <- grep("Linkage Disequilibrium Analysis:", txt, value=FALSE) if (length(LD) > 0) nrows <- LD[1] - i - 1 else nrows <- -1 - hapstat <- read.table(fname, header=TRUE, sep="\t", skip=i-1, nrows=nrows, + if (fromfile) con <- file(SSPout) else con <- textConnection(SSPout) + hapstat <- read.table(con, header=TRUE, sep="\t", skip=i-1, nrows=nrows, na.strings=c("NA", "-NA", "nan", "-nan", "")) + close(con) if (order.by == "count") hapstat <- hapstat[order(hapstat$Total.count, decreasing=TRUE),] # check length of haplotypes: nc <- nchar(hapstat$Haplotype) @@ -4993,7 +5246,7 @@ read.ShesisPlus.output <- function(fname, order.by=c("", "hapnr", "count")[2]) { #'@description convert PolyHaplotyper marker data (input) to SATlotyper #'format for a single haploblock. #'@usage make.SATlotyper.input(mrkDosage, indiv=NULL, markers, -#'ploidy, phenotype=0, fname="") +#'ploidy, phenotype=0, fname) #'@param mrkDosage matrix or data.frame of allele dosages; same as input for #'inferHaplotypes. Markers are in rows, individuals in columns, each cell has #'a marker dosage. All marker dosages must be in 0:ploidy or NA @@ -5006,14 +5259,19 @@ read.ShesisPlus.output <- function(fname, order.by=c("", "hapnr", "count")[2]) { #'of the columns of dosmat; default 0 #'@param fname filename of a *.csv output file: this will contain the data #'in the SATlotyper format (the saved data.frame is also the return value). -#'If "" (default) no file is written +#'If "" no file is written #'@return a data.frame in the SATlotyper input format: a header row with #'"Genotype" and the marker names, and one row per individual with the #'individual name plus for each marker the genotype as a sorted #'string of <ploidy> A's and B's, or <ploidy> N's +#'@examples +#'data(PolyHaplotyper_small) +#'SATin <- make.SATlotyper.input(mrkDosage=phdos, markers=phblocks[[1]], +#' ploidy=6, fname="") +#'head(SATin) #'@export make.SATlotyper.input <- function (mrkDosage, indiv=NULL, markers, - ploidy, phenotype=0, fname="") { + ploidy, phenotype=0, fname) { sep <- "\t" # separator of the columns in the output file dm <- checkmrkDosage(mrkDosage, ploidy=ploidy, markers=markers) # also converts to matrix result <- matrix(NA_character_, nrow=ncol(dm), ncol=nrow(dm), @@ -5031,12 +5289,14 @@ make.SATlotyper.input <- function (mrkDosage, indiv=NULL, markers, for (i in seq_len(ncol(result))) { result[, i] <- sapply(dm[i,], FUN=dos2str, ploidy=ploidy) } - con <- file(fname) - writeLines(paste("Genotype", paste(colnames(result), collapse=sep), sep=sep), - con=con, sep="\n") - close(con) - write.table(result, file=fname, col.names=FALSE, row.names=TRUE, quote=FALSE, - sep=sep, append=TRUE) + if (length(fname)==1 && !is.na(fname) && fname != "") { + con <- file(fname) + writeLines(paste("Genotype", paste(colnames(result), collapse=sep), sep=sep), + con=con, sep="\n") + close(con) + write.table(result, file=fname, col.names=FALSE, row.names=TRUE, quote=FALSE, + sep=sep, append=TRUE) + } result } #make.SATlotyper.input @@ -5180,7 +5440,6 @@ read.SATlotyper.output <- function(fname, output="", #'$result: screen output from SATlotyper; this #'contains some extra info not present in the output xml file)\cr #'The main result is the outfile -#' #'@export run.SATlotyper <- function(path_to_SATlotyper, infile, outfile, SAT_solver="sat4j.conf") { @@ -5216,13 +5475,18 @@ run.SATlotyper <- function(path_to_SATlotyper, infile, outfile, #'@param ploidy single integer: the ploidy level #'@param fname filename of a tab-separated output file: this will contain the #'data in Happy-inf format (the saved data.frame is also the return value). -#'If "" (default) no file is written +#'If "" no file is written #'@return a data.frame in the Happy-inf input format: a header row with #'"SNPID", "block" and names of the individuals and one row per marker, with #'only the individuals, markers and blocks as specified. #'#'SNPID has the marker names, "Block" the haploblock names. All markers #'of a haploblock are in contiguous rows. Missing dosages #'are represented by "NA" +#'@examples +#'data(PolyHaplotyper_small) +#'HAPin <- make.Happyinf.input(mrkDosage=phdos, haploblock=phblocks, +#' ploidy=6, fname="") +#'HAPin[,1:8] #'@export make.Happyinf.input <- function (mrkDosage, indiv=NULL, haploblock, ploidy, fname) { @@ -5234,9 +5498,11 @@ make.Happyinf.input <- function (mrkDosage, indiv=NULL, haploblock, check.names=FALSE, stringsAsFactors=FALSE) rownames(result) <- NULL - if (fname != "") write.table(result, file=fname, - row.names=FALSE, col.names=TRUE, - sep="\t", na="NA", quote=FALSE) + if (length(fname)==1 && !is.na(fname) && fname != "") { + write.table(result, file=fname, + row.names=FALSE, col.names=TRUE, + sep="\t", na="NA", quote=FALSE) + } invisible(result) } #make.Happyinf.input @@ -5417,6 +5683,11 @@ compareHapresults <- function(haploblock, hapresultsA, hapresultsB) { #'match (TRUE if the non-missing marker dosages match the haplotype dosages, #'FALSE is there is a conflict, NA if mrkNA and/or hapNA are TRUE) #'Each element is itself a list with elements: +#'@examples +#'data(PolyHaplotyper_small) +#'chmd <- compareHapMrkDosages(mrkDosage=phdos, hapresults=phresults) +#'# show results for first haploblock, first 8 individuals: +#'chmd[1, 1:8,] #'@export compareHapMrkDosages <- function(mrkDosage, hapresults) { if (!all(sapply(hapresults, diff --git a/data/PolyHaplotyper_demo.RData b/data/PolyHaplotyper_demo.RData index 324cec872e681f2e61fa6edfcc8ec637c4d9bbe0..bb3cc335d66662bf31e304301bcf7fd0cf108ef2 100644 Binary files a/data/PolyHaplotyper_demo.RData and b/data/PolyHaplotyper_demo.RData differ diff --git a/tests/testthat/test_1_supporting_functions.R b/tests/testthat/test_1_supporting_functions.R index 3f25f8159aa0fe944ef7979cb97742e68ecb1841..347d59be5ead944c32ca69d74e8d88bb630d6abf 100644 --- a/tests/testthat/test_1_supporting_functions.R +++ b/tests/testthat/test_1_supporting_functions.R @@ -1,34 +1,28 @@ context("tests of supporting functions") -wd <- getwd() -#wd <- setwd("m:/Polyploids/PolyHaplotyper/data") - -test_that("load the demo data") { - result <- data(demo) - #result <- load("demo.RData") - expect_equal(result, c("snpdos", "ped")) - expect_equal(dim(snpdos), c(28, 661)) +test_that("load the demo data", { + result <- data(PolyHaplotyper_demo) + expect_equal(dim(snpdos), c(30, 663)) + expect_equal(dim(ped), c(661, 4)) expect_equal(names(ped), c("genotype", "mother", "father", "sample_nr")) -} +}) -test_that("test several functions") { +test_that("test several functions", { result <- allHaplotypes(mrknames=letters[1:4]) expect_equal(dim(result), c(16, 4)) + expect_equal(colnames(result), letters[1:4]) expect_true(all(result[nrow(result),] == 1)) expect_equal(result[1:2, ncol(result)], c(0, 1)) result <- mrkdos2mrkdid( mrkDosage=matrix(c(1,2,3,4,4,3,2,1), ncol=2, dimnames=list(letters[1:4], c("ind1", "ind2"))), ploidy=4) - expect_equal(result, c(195, 587)) + expect_equal(result, setNames(c(195, 587), c("ind1", "ind2"))) parhac <- matrix(c(1,1,2,2,5,6,1,1,1,1,1,3), ncol=2, dimnames=list(NULL, c("P1", "P2"))) - result <- getF1freqs(parhac=parhac, nhap=8, DRrate=0.1) - expect_equal(names(result), c("F1hac", "freq")) - expect_equal(dim(result$F1hac), c(6, 54)) + result <- getFSfreqs(parhac=parhac, DRrate=0.1) + expect_equal(names(result), c("FShac", "freq")) + expect_equal(dim(result$FShac), c(6, 54)) expect_equal(sum(result$freq), 1, tolerance=1e-6) -} - -test_that("inferHaplotypes without F1 populations") { +}) -} diff --git a/vignettes/PolyHaplotyper.Rmd b/vignettes/PolyHaplotyper.Rmd index 045afc5634777324991664f2c1669cb9bfff4d14..03d6bae68386303c50ce603dddb2fea1c8a69e8d 100644 --- a/vignettes/PolyHaplotyper.Rmd +++ b/vignettes/PolyHaplotyper.Rmd @@ -36,9 +36,11 @@ are available, or if the parents have missing data. # Input data Load the package and demo data: ```{r} +rm(list=ls()) # clear existing data from memory library(PolyHaplotyper) data(PolyHaplotyper_demo) -# PolyHaplotyper_demo contains data.frames snpdos (SNP dosages) and ped (pedigree) +# show the demo data: +ls() ``` Two inputs are necessary for the assignment of haplotypes: a matrix or data.frame with the dosages of the markers, and a list indicating which markers @@ -72,11 +74,11 @@ The markers must be assigned to haploblocks which are stored as a list. In the example the haploblock information is contained in an extra column "contig" in the marker dosage data: ```{r} -snpdos[1:6, 1:8] +demo_snpdos[1:6, 1:8] ``` We convert the haploblock information to list format: ```{r} -hblist <- haploblock_df2list(snpdos, mrkcol=1, hbcol=2) +hblist <- haploblock_df2list(demo_snpdos, mrkcol=1, hbcol=2) # number of markers in each haploblock: sapply(hblist, length) ``` @@ -86,12 +88,12 @@ This small example has data for 7 haploblocks, each with 4 or 5 markers. The marker data must be supplied as a matrix or data.frame, with markers in rows and individuals in columns, with each cell containing the dosage (0 ... ploidy) of the target marker allele. Our data are in data.frame -snpdos, which, as we saw above, contains an extra column "contig" defining the -haploblocks. We remove the contig column from snpdos to get it in the required -format: +demo_snpdos, which, as we saw above, contains an extra column "contig" defining +the haploblocks. We remove the contig column from demo_snpdos to get it in the +required format: ```{r} -snpdos <- snpdos[, -2] -snpdos[1:6, 1:8] +demo_snpdos <- demo_snpdos[, -2] +demo_snpdos[1:6, 1:8] ``` The SNP dosages are all in the range 0...6, as these demo data are obtained @@ -102,10 +104,10 @@ The pedigree must be a data.frame or matrix with in the first 3 columns the names of the individual, its mother and its father. Parents may be missing (NA) but individuals may not, and there may not be more than one line for the same individual. -Additional columns may be present. In our case ped has a fourth column +Additional columns may be present. In our case demo_ped has a fourth column containing the sample nr: ```{r} -head(ped) +head(demo_ped) ``` In this example several individuals are represented by more than one line, because here the pedigree is also used to specify the (sometimes duplicated) @@ -114,46 +116,47 @@ In order to remove the duplicates from the pedigree we build a list with the samples representing each replicated individual. ```{r} # create a list of of replicates: -tb <- table(ped$genotype) +tb <- table(demo_ped$genotype) replist <- list() for (dup in names(tb[tb>1])) { - replist[[dup]] <- ped$sample_nr[ped$genotype == dup] + replist[[dup]] <- demo_ped$sample_nr[demo_ped$genotype == dup] } ``` We remove the replicated individuals from the pedigree, keeping only the first row for each set of duplicates: ```{r} -dim(ped) +dim(demo_ped) for (dup in seq_along(replist)) { dupsamp <- as.character(replist[[dup]])[-1] - ped <- ped[!(ped$sample_nr %in% dupsamp),] + demo_ped <- demo_ped[!(demo_ped$sample_nr %in% dupsamp),] } -dim(ped) +dim(demo_ped) ``` We see that 30 lines containing duplicates have been discarded. -In snpdos, the data.frame containing the SNP dosages, the column names are the -sample names, and duplicated samples are still present. First we merge the +In demo_snpdos, the data.frame containing the SNP dosages, the column names are +the sample names, and duplicated samples are still present. First we merge the dosage data of the replicates for each individual, again using replist (the list of replicates): ```{r} # merge all the duplicated samples: -dim(snpdos) -snpdos <- mergeReplicates(mrkDosage=snpdos, replist=replist, - solveConflicts=TRUE) -dim(snpdos) +dim(demo_snpdos) +demo_snpdos <- mergeReplicates(mrkDosage=demo_snpdos, replist=replist, + solveConflicts=TRUE) +dim(demo_snpdos) ``` -Function mergeReplicates has converted the snpdos data.frame to a matrix, +Function mergeReplicates has converted the demo_snpdos data.frame to a matrix, with the rownames taken from the column "marker". For each individual present in multiple replicates only the first column name is retained, and the merged data are the consensus scores over all replicates. We see that -snpdos now has 31 less columns: 1 was the removed "marker" column and +demo_snpdos now has 31 less columns: 1 was the removed "marker" column and 30 were columns for the duplicated samples, corresponding to the 30 duplicates removed from the pedigree. Now we must change the sample numbers to the corresponding individual names: ```{r} -colnames(snpdos) <- ped$genotype[match(colnames(snpdos), ped$sample_nr)] +colnames(demo_snpdos) <- + demo_ped$genotype[match(colnames(demo_snpdos), demo_ped$sample_nr)] ``` @@ -170,8 +173,9 @@ parents # find the FS individuals by looking in the pedigree for their mother and father: FS <- list() for (p in seq_len(nrow(parents))) { - FS[[p]] <- ped$genotype[!is.na(ped$mother) & ped$mother==parents[p, 1] & - !is.na(ped$father) & ped$father==parents[p, 2]] + FS[[p]] <- + demo_ped$genotype[!is.na(demo_ped$mother) & demo_ped$mother==parents[p, 1] & + !is.na(demo_ped$father) & demo_ped$father==parents[p, 2]] } # sizes of the FS families: sapply(FS, length) @@ -184,7 +188,7 @@ To perform the haplotyping for all haploblocks in one go we use function inferHaplotypes: ```{r results="hide"} -results <- inferHaplotypes(mrkDosage=snpdos, ploidy=6, +results <- inferHaplotypes(mrkDosage=demo_snpdos, ploidy=6, haploblock=hblist, parents=parents, FS=FS) ``` @@ -222,8 +226,8 @@ function usedhap is often more useful; this shows only the haplotypes that are present in at least one individual. Matrix hapdos has the dosages of each haplotype in each individual, in the same -layout as the marker dosages in snpdos. Only the haplotypes that occur in the -population are shown: in this case haplotypes 1, 3, 7, 8 and 11 (appended to +layout as the marker dosages in demo_snpdos. Only the haplotypes that occur in +the population are shown: in this case haplotypes 1, 3, 7, 8 and 11 (appended to the haploblock name). For each individual the haplotype dosages sum to the ploidy (6). Non-haplotyped individuals have NA dosages for all haplotypes. @@ -235,7 +239,7 @@ dosages: ```{r} results[[1]]$imputedGeno[, 1:8] -snpdos[hblist[[1]], colnames(results[[1]]$imputedGeno)[1:8]] +demo_snpdos[hblist[[1]], colnames(results[[1]]$imputedGeno)[1:8]] ``` The other items in the results for each haploblock are mostly intended for @@ -282,7 +286,7 @@ Next we check each individual for a match between the possible gametes of its parents and its own haplotype composition, using function pedigreeHapdosCheck: ```{r} -pedchk <- pedigreeHapCheck(ped=ped, mrkDosage=snpdos, +pedchk <- pedigreeHapCheck(ped=demo_ped, mrkDosage=demo_snpdos, haploblock=hblist, hapresults=results) #show part of ped_arr for haploblock 1: @@ -342,7 +346,8 @@ haplotypes is shown. ```{r} # show the segregation for FS number 1 (FSnr=1) and # haploblock number 1 (hbresults=results[[1]]) -showOneFS(FSnr=1, hbresults=results[[1]], mrkDosage=snpdos, FS=FS, parents=parents) +showOneFS(FSnr=1, hbresults=results[[1]], mrkDosage=demo_snpdos, + FS=FS, parents=parents) ``` The result of showOneFS is a list with 3 elements. The first two are matrices that show the segregation in terms of marker dosages and haplotype dosages, @@ -401,12 +406,10 @@ The ahccompletelist files can be computed using function build_ahccompletelist: ```{r eval=FALSE} build_ahccompletelist(ploidy=6, maxmrk=5, overwrite=FALSE) ``` -Apart from the file ahccompletelist_6x.RData this will also generate -separate files for each number of markers, with names ahc_6x_nmrk1.RData ... -ahc_6x_nmrk5.RData; these can be deleted. -Alternatively the following ahccompletelists are available upon request -from roeland.voorrips@wur.nl. + +Alternatively the following ahccompletelists are (currently) available upon +request from roeland.voorrips@wur.nl: ploidy 4, up to 7 markers: ahccompletelist_4x.RData, 41 MB ploidy 4, up to 8 markers: ahccompletelist_4x.RData, 769 MB ploidy 6, up to 5 markers: ahccompletelist_6x.RData, 9 MB