diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100755 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml new file mode 100755 index 0000000..0f2fe08 --- /dev/null +++ b/.github/workflows/R-CMD-check.yaml @@ -0,0 +1,52 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: R-CMD-check + +permissions: read-all + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel-1'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true + build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index 1d31672..66d5d24 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,53 @@ -.Rhistory -.Rproj.user -*.Rproj -..Rcheck/ +# CUSTOM +.Rprofile +renv/ + +# History files +.Rhistory +.Rapp.history + +# Session Data files +.RData +.RDataTmp + +# User-specific files +.Ruserdata + +# Example code in package build process +*-Ex.R + +# Output files from R CMD build +/*.tar.gz + +# Output files from R CMD check +/*.Rcheck/ + +# RStudio files +.Rproj.user/ + +# produced vignettes +vignettes/*.html +vignettes/*.pdf + +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth + +# knitr and R markdown default cache directories +*_cache/ +/cache/ + +# Temporary files created by R markdown +*.utf8.md +*.knit.md + +# R Environment Variables +.Renviron + +# pkgdown site +docs/ + +# translation temp files +po/*~ + +# RStudio Connect folder +rsconnect/ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml old mode 100644 new mode 100755 diff --git a/DESCRIPTION b/DESCRIPTION index eccbca8..a79314d 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,18 +1,18 @@ -Package: usedist -Type: Package -Title: Distance Matrix Utilities -Version: 0.4.0.9000 -Authors@R: person("Kyle", "Bittinger", email = "kylebittinger@gmail.com", - role = c("aut", "cre")) -Description: Functions to re-arrange, extract, and work with distances. -License: GPL-3 -Encoding: UTF-8 -LazyData: true -RoxygenNote: 7.2.3 -Suggests: - testthat, - tibble, - tidyr (>= 1.0.0), - rlang, - future.apply, - ggplot2 +Package: usedist +Type: Package +Title: Distance Matrix Utilities +Version: 0.4.0.9000 +Authors@R: person("Kyle", "Bittinger", email = "kylebittinger@gmail.com", + role = c("aut", "cre")) +Description: Functions to re-arrange, extract, and work with distances. +License: GPL-3 +Encoding: UTF-8 +LazyData: true +RoxygenNote: 7.3.1 +Suggests: + future.apply, + rlang, + tibble, + tidyr (>= 1.0.0), + testthat, + ggplot2 diff --git a/NAMESPACE b/NAMESPACE index 7db2e05..9a0bf69 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,7 +10,3 @@ export(dist_subset) export(dist_to_centroids) export(pivot_to_matrix) export(pivot_to_numeric_matrix) -importFrom(rlang,as_name) -importFrom(rlang,ensym) -importFrom(tibble,column_to_rownames) -importFrom(tidyr,pivot_wider) diff --git a/NEWS.md b/NEWS.md old mode 100644 new mode 100755 diff --git a/R/centroid.R b/R/centroid.R old mode 100644 new mode 100755 index cd5a521..53fc1bb --- a/R/centroid.R +++ b/R/centroid.R @@ -1,195 +1,196 @@ -#' Compute distances from each item to group centroids -#' -#' @param d A distance matrix object of class \code{dist}. -#' @param g A factor representing the groups of items in \code{d}. -#' @param squared If \code{TRUE}, return the squared distance to group -#' centroids. -#' @return A data frame with distances to the group centroids: -#' -#' \describe{ -#' \item{Item}{ -#' A character vector of item labels from the dist object, or an integer -#' vector of item locations if labels are not present.} -#' \item{CentroidGroup}{ -#' The group for which the centroid distance is given. The column type -#' should match that of the argument g (the \code{unique} function is used -#' to generate this column).} -#' \item{CentroidDistance}{ -#' Inferred distance from the item to the centroid position of the -#' indicated group.}} -#' -#' @details -#' This function computes the distance from each item to the centroid positions -#' of groups defined in the argument \code{g}. This is accomplished without -#' determining the centroid positions directly; see the documentation for -#' \code{\link{dist_between_centroids}} for details on this procedure. -#' -#' If the distance can't be represented in a Euclidean space, the -#' \code{CentroidDistance} is set to \code{NaN}. See the documentation for -#' \code{\link{dist_between_centroids}} for further details. -#' -#' @export -dist_to_centroids <- function (d, g, squared = FALSE) { - d <- stats::as.dist(d) - d2 <- d ** 2 - items <- attr(d, "Labels") - # Use numeric index for items if the distance matrix has no labels - items <- if (is.null(items)) 1:attr(d, "Size") else items - group_items <- tapply(items, g, c) - group_sizes <- lapply(group_items, length) - group_d2s <- lapply(group_items, function (x) dist_subset(d2, x)) - within_group_sums <- lapply(group_d2s, sum) - df <- expand.grid(Item=items, CentroidGroup=unique(g), stringsAsFactors = F) - dist_to_group_centroid <- function (idx2, group) { - idx1 <- group_items[[group]] - n1 <- group_sizes[[group]] - sum1 <- within_group_sums[[group]] - sum12 <- sum(as.matrix(d2)[idx1, idx2]) - term1 <- sum1 / (n1 ** 2) - term12 <- sum12 / n1 - result_squared <- term12 - term1 - if (squared) { - result_squared - } else { - is_negative <- result_squared < 0 - if (any(is_negative)) { - msg <- paste0( - "When computing distance to centroids, negative values were ", - "produced before taking a square root. ", - "This happens because the distances cannot be represented in a ", - "Euclidean coordinate system. ", - "These distances are being returned as NaN. ", - "Alternately, you may set `squared = TRUE` to return the squared ", - "distances. In this case, you will never get NaN, but you might ", - "receive negative numbers for the squared distance.") - warning(msg) - result <- numeric(length(result_squared)) - result[!is_negative] <- sqrt(result_squared[!is_negative]) - result[is_negative] <- NaN - result - } else { - sqrt(result_squared) - } - } - } - df$CentroidDistance <- mapply( - dist_to_group_centroid, df$Item, df$CentroidGroup) - df -} - -#' Compute the distance between group centroids -#' -#' @param d A distance matrix object of class \code{dist}. -#' @param idx1 A vector of items in group 1. -#' @param idx2 A vector of items in group 2. -#' @param squared If \code{TRUE}, return the squared distance between centroids. -#' @return The distance between group centroids (see details). -#' -#' @details -#' If you have a distance matrix, and the objects are partitioned into groups, -#' you might like to know the distance between the group centroids. The -#' centroid of each group is simply the center of mass for the group. -#' -#' It is possible to infer the distance between group centroids directly from -#' the distances between items in each group. The \code{adonis} test in the -#' ecology package \code{vegan} takes advantage of this approach to carry out -#' an ANOVA-like test on distances. -#' -#' The approach rests on the assumption that the objects occupy some -#' high-dimensional Euclidean space. However, we do not have to actually -#' create the space to find the distance between centroids. Based on the -#' assumption that such a space exists, we can use an algebraic formula to -#' perform the computation. -#' -#' The formulas for this were presented by Apostol and Mnatsakanian in 2003, -#' though we need to re-arrange equation 28 in their paper to get the value -#' we want: -#' -#' \deqn{| c_1 - c_2 | = \sqrt{ -#' \frac{1}{n_1 n_2} \sum_{(1,2)} - -#' \frac{1}{n_1^2} \sum_{(1)} - -#' \frac{1}{n_2^2} \sum_{(2)}},} -#' -#' where \eqn{n_1} is the number of samples in group 1, \eqn{\sum_{(1)}} is the -#' sum of squared distances between items in group 1, and \eqn{\sum_{(1,2)}} is -#' the sum of squared distances between items in group 1 and those in group 2. -#' -#' Sometimes, the distance between centroids is not a real number, because it -#' is not possible to create a space where this distance exists. Mathematically, -#' we get a negative number underneath the square root in the equation above. -#' If this happens, the function returns \code{NaN}. If you'd like to have -#' access to this value, you can set \code{squared = TRUE} to return the -#' squared distance between centroids. In this case, you will never get -#' \code{NaN}, but you might receive negative numbers in your result. -#' -#' @references Apostol, T.M. and Mnatsakanian, M.A. Sums of squares of distances -#' in m-space. Math. Assoc. Am. Monthly 110, 516 (2003). -#' -#' @export -dist_between_centroids <- function (d, idx1, idx2, squared = FALSE) { - if (is.logical(idx1)) { - n1 <- sum(idx1) - } else { - n1 <- length(idx1) - } - if (is.logical(idx2)) { - n2 <- sum(idx2) - } else { - n2 <- length(idx2) - } - d2 <- d ** 2 - sum1 <- sum(dist_subset(d2, idx1)) - sum2 <- sum(dist_subset(d2, idx2)) - sum12 <- sum(as.matrix(d2)[idx1, idx2]) - term1 <- sum1 / (n1 ** 2) - term2 <- sum2 / (n2 ** 2) - term12 <- sum12 / (n1 * n2) - result_squared <- term12 - term1 - term2 - if (squared) { - result_squared - } else { - is_negative <- result_squared < 0 - if (any(is_negative)) { - msg <- paste0( - "When computing distance between centroids, negative values were ", - "produced before taking a square root. ", - "This happens because the distances cannot be represented in a ", - "Euclidean coordinate system. ", - "These distances are being returned as NaN. ", - "Alternately, you may set `squared = TRUE` to return the squared ", - "distances. In this case, you will never get NaN, but you might ", - "receive negative numbers for the squared distance.") - warning(msg) - result <- numeric(length(result_squared)) - result[!is_negative] <- sqrt(result_squared[!is_negative]) - result[is_negative] <- NaN - result - } else { - sqrt(result_squared) - } - } -} - -#' Make a new distance matrix of centroid distances between multiple groups -#' @param d A distance matrix object of class \code{dist}. -#' @param g A factor representing the groups of items in \code{d}. -#' @param squared If \code{TRUE}, return the squared distance between centroids. -#' @return A distance matrix of distances between the group centroids. -#' @export -dist_multi_centroids <- function (d, g, squared = FALSE) { - group_idxs <- tapply(seq_along(g), g, c, simplify = FALSE) - centroid_distance_from_groups <- function (gg) { - g1 <- gg[1] - g2 <- gg[2] - idx1 <- group_idxs[[g1]] - idx2 <- group_idxs[[g2]] - dist_between_centroids(d, idx1, idx2, squared = squared) - } - dc <- utils::combn(names(group_idxs), 2, centroid_distance_from_groups) - attr(dc, "Size") <- length(names(group_idxs)) - attr(dc, "Labels") <- names(group_idxs) - attr(dc, "Diag") <- FALSE - attr(dc, "Upper") <- FALSE - class(dc) <- "dist" - dc -} +#' Compute distances from each item to group centroids +#' +#' @param d A distance matrix object of class \code{dist}. +#' @param g A factor representing the groups of items in \code{d}. +#' @param squared If \code{TRUE}, return the squared distance to group +#' centroids. +#' @return A data frame with distances to the group centroids: +#' +#' \describe{ +#' \item{Item}{ +#' A character vector of item labels from the dist object, or an integer +#' vector of item locations if labels are not present.} +#' \item{CentroidGroup}{ +#' The group for which the centroid distance is given. The column type +#' should match that of the argument g (the \code{unique} function is used +#' to generate this column).} +#' \item{CentroidDistance}{ +#' Inferred distance from the item to the centroid position of the +#' indicated group.}} +#' +#' @details +#' This function computes the distance from each item to the centroid positions +#' of groups defined in the argument \code{g}. This is accomplished without +#' determining the centroid positions directly; see the documentation for +#' \code{\link{dist_between_centroids}} for details on this procedure. +#' +#' If the distance can't be represented in a Euclidean space, the +#' \code{CentroidDistance} is set to \code{NaN}. See the documentation for +#' \code{\link{dist_between_centroids}} for further details. +#' +#' @export +dist_to_centroids <- function (d, g, squared = FALSE) { + d <- stats::as.dist(d) + d2 <- d ** 2 + items <- attr(d, "Labels") + # Use numeric index for items if the distance matrix has no labels + items <- if (is.null(items)) 1:attr(d, "Size") else items + group_items <- tapply(items, g, c) + group_sizes <- lapply(group_items, length) + group_d2s <- lapply(group_items, function (x) dist_subset(d2, x)) + within_group_sums <- lapply(group_d2s, sum) + df <- expand.grid(Item=items, CentroidGroup=unique(g), stringsAsFactors = F) + dist_to_group_centroid <- function (idx2, group) { + idx1 <- group_items[[group]] + n1 <- group_sizes[[group]] + sum1 <- within_group_sums[[group]] + sum12 <- sum(as.matrix(d2)[idx1, idx2]) + term1 <- sum1 / (n1 ** 2) + term12 <- sum12 / n1 + result_squared <- term12 - term1 + if (squared) { + result_squared + } else { + is_negative <- result_squared < 0 + if (any(is_negative)) { + msg <- paste0( + "When computing distance to centroids, negative values were ", + "produced before taking a square root. ", + "This happens because the distances cannot be represented in a ", + "Euclidean coordinate system. ", + "These distances are being returned as NaN. ", + "Alternately, you may set `squared = TRUE` to return the squared ", + "distances. In this case, you will never get NaN, but you might ", + "receive negative numbers for the squared distance.") + warning(msg) + result <- numeric(length(result_squared)) + result[!is_negative] <- sqrt(result_squared[!is_negative]) + result[is_negative] <- NaN + result + } else { + sqrt(result_squared) + } + } + } + df$CentroidDistance <- mapply( + dist_to_group_centroid, df$Item, df$CentroidGroup) + df +} + +#' Compute the distance between group centroids +#' +#' @param d A distance matrix object of class \code{dist}. +#' @param idx1 A vector of items in group 1. +#' @param idx2 A vector of items in group 2. +#' @param squared If \code{TRUE}, return the squared distance between centroids. +#' @return The distance between group centroids (see details). +#' +#' @details +#' If you have a distance matrix, and the objects are partitioned into groups, +#' you might like to know the distance between the group centroids. The +#' centroid of each group is simply the center of mass for the group. +#' +#' It is possible to infer the distance between group centroids directly from +#' the distances between items in each group. The \code{adonis} test in the +#' ecology package \code{vegan} takes advantage of this approach to carry out +#' an ANOVA-like test on distances. +#' +#' The approach rests on the assumption that the objects occupy some +#' high-dimensional Euclidean space. However, we do not have to actually +#' create the space to find the distance between centroids. Based on the +#' assumption that such a space exists, we can use an algebraic formula to +#' perform the computation. +#' +#' The formulas for this were presented by Apostol and Mnatsakanian in 2003, +#' though we need to re-arrange equation 28 in their paper to get the value +#' we want: +#' +#' \deqn{| c_1 - c_2 | = \sqrt{ +#' \frac{1}{n_1 n_2} \sum_{(1,2)} - +#' \frac{1}{n_1^2} \sum_{(1)} - +#' \frac{1}{n_2^2} \sum_{(2)}},} +#' +#' where \eqn{n_1} is the number of samples in group 1, \eqn{\sum_{(1)}} is the +#' sum of squared distances between items in group 1, and \eqn{\sum_{(1,2)}} is +#' the sum of squared distances between items in group 1 and those in group 2. +#' +#' Sometimes, the distance between centroids is not a real number, because it +#' is not possible to create a space where this distance exists. Mathematically, +#' we get a negative number underneath the square root in the equation above. +#' If this happens, the function returns \code{NaN}. If you'd like to have +#' access to this value, you can set \code{squared = TRUE} to return the +#' squared distance between centroids. In this case, you will never get +#' \code{NaN}, but you might receive negative numbers in your result. +#' +#' @references Apostol, T.M. and Mnatsakanian, M.A. Sums of squares of distances +#' in m-space. Math. Assoc. Am. Monthly 110, 516 (2003). +#' +#' @export +dist_between_centroids <- function (d, idx1, idx2, squared = FALSE) { + if (is.logical(idx1)) { + n1 <- sum(idx1) + } else { + n1 <- length(idx1) + } + if (is.logical(idx2)) { + n2 <- sum(idx2) + } else { + n2 <- length(idx2) + } + d2 <- d ** 2 + sum1 <- sum(dist_subset(d2, idx1)) + sum2 <- sum(dist_subset(d2, idx2)) + sum12 <- sum(as.matrix(d2)[idx1, idx2]) + term1 <- sum1 / (n1 ** 2) + term2 <- sum2 / (n2 ** 2) + term12 <- sum12 / (n1 * n2) + result_squared <- term12 - term1 - term2 + if (squared) { + result_squared + } else { + is_negative <- result_squared < 0 + if (any(is_negative)) { + msg <- paste0( + "When computing distance between centroids, negative values were ", + "produced before taking a square root. ", + "This happens because the distances cannot be represented in a ", + "Euclidean coordinate system. ", + "These distances are being returned as NaN. ", + "Alternately, you may set `squared = TRUE` to return the squared ", + "distances. In this case, you will never get NaN, but you might ", + "receive negative numbers for the squared distance.") + warning(msg) + result <- numeric(length(result_squared)) + result[!is_negative] <- sqrt(result_squared[!is_negative]) + result[is_negative] <- NaN + result + } else { + sqrt(result_squared) + } + } +} + +#' Make a new distance matrix of centroid distances between multiple groups +#' @param d A distance matrix object of class \code{dist}. +#' @param g A factor representing the groups of items in \code{d}. +#' @param squared If \code{TRUE}, return the squared distance between centroids. +#' @return A distance matrix of distances between the group centroids. +#' +#' @export +dist_multi_centroids <- function (d, g, squared = FALSE) { + group_idxs <- tapply(seq_along(g), g, c, simplify = FALSE) + centroid_distance_from_groups <- function (gg) { + g1 <- gg[1] + g2 <- gg[2] + idx1 <- group_idxs[[g1]] + idx2 <- group_idxs[[g2]] + dist_between_centroids(d, idx1, idx2, squared = squared) + } + dc <- utils::combn(names(group_idxs), 2, centroid_distance_from_groups) + attr(dc, "Size") <- length(names(group_idxs)) + attr(dc, "Labels") <- names(group_idxs) + attr(dc, "Diag") <- FALSE + attr(dc, "Upper") <- FALSE + class(dc) <- "dist" + dc +} diff --git a/R/dist.R b/R/dist.R index 0396c8b..a0130f7 100755 --- a/R/dist.R +++ b/R/dist.R @@ -1,174 +1,161 @@ -#' usedist: a package for working with distance matrices in R -#' -#' In usedist, we provide a number of functions to help with distance matrix -#' objects, such as those produced by the \code{dist} function. Some functions -#' are geared towards making or altering distance matrix objects. Others -#' relate to groups of items in the distance matrix. They provide access to -#' within- or between-group distances, or use these distances to infer the -#' distance to group centroids. -#' -#' @docType package -#' @name usedist -NULL - -#' Set the names/labels of a \code{dist} object. -#' -#' @param d A distance matrix object of class \code{dist}. -#' @param nm New labels for the rows/columns. -#' @return A distance matrix with new row/column labels. -#' @export -#' @examples -#' m4 <- matrix(1:16, nrow=4, dimnames=list(LETTERS[1:4])) -#' dm4 <- dist(m4) -#' dist_setNames(dm4, LETTERS[9:12]) -dist_setNames <- function (d, nm) { - # Convert to matrix so errors are generated on assignment - # if nm does not contain the same number of elements as d - dm <- as.matrix(d) - dimnames(dm) <- list(nm, nm) - stats::as.dist(dm) -} - -#' Retrieve distances from a \code{dist} object. -#' -#' @param d A distance matrix object of class \code{dist}. -#' @param idx1,idx2 Indices specifying the distances to extract. -#' @return A vector of distances. -#' @export -#' @examples -#' m4 <- matrix(1:16, nrow=4, dimnames=list(LETTERS[1:4])) -#' dm4 <- dist(m4) -#' dist_get(dm4, "A", "C") -#' dist_get(dm4, "A", c("A", "B", "C", "D")) -#' dist_get(dm4, c("A", "B", "C"), c("B", "D", "B")) -dist_get <- function (d, idx1, idx2) { - d <- stats::as.dist(d) - if (is.character(idx1)) { - idx1 <- match(idx1, attr(d, "Labels")) - } - if (is.character(idx2)) { - idx2 <- match(idx2, attr(d, "Labels")) - } - n <- attr(d, "Size") - if (any(is.na(idx1) | (idx1 < 1) | (idx1 > n))) { - stop("idx1 out of range") - } - if (any(is.na(idx2) | (idx2 < 1) | (idx2 > n))) { - stop("idx2 out of range") - } - i <- pmin(idx1, idx2) - j <- pmax(idx1, idx2) - # Zeros are eliminated from index vectors - # Need to fill with NA if i and j are equal - idx <- ifelse(i == j, NA, n*(i-1) - i*(i-1)/2 + j-i) - ifelse(i == j, 0, d[idx]) -} - -#' Extract parts of a \code{dist} object. -#' -#' Extract a subset of values from a distance matrix. This function also works -#' to re-arrange the rows of a distance matrix, if they are provided in the -#' desired order. -#' -#' @param d A distance matrix object of class \code{dist}. -#' @param idx Indices specifying the subset of distances to extract. -#' @return A distance matrix. -#' @export -#' @examples -#' m4 <- matrix(1:16, nrow=4, dimnames=list(LETTERS[1:4])) -#' dm4 <- dist(m4) -#' dist_subset(dm4, c("A", "B", "C")) -#' dist_subset(dm4, c("D", "C", "B", "A")) -dist_subset <- function (d, idx) { - stats::as.dist(as.matrix(d)[idx, idx]) -} - -#' Create a data frame of distances between groups of items. -#' -#' @param d A distance matrix object of class \code{dist}. -#' @param g A factor representing the groups of objects in \code{d}. -#' @return A data frame with 6 columns: -#' \describe{ -#' \item{Item1, Item2}{The items being compared.} -#' \item{Group1, Group2}{The groups to which the items belong.} -#' \item{Label}{A convenient label for plotting or comparison.} -#' \item{Distance}{The distance between Item1 and Item2.}} -#' @export -#' @examples -#' m4 <- matrix(1:16, nrow=4, dimnames=list(LETTERS[1:4])) -#' dm4 <- dist(m4) -#' g4 <- rep(c("Control", "Treatment"), each=2) -#' dist_groups(dm4, g4) -dist_groups <- function(d, g) { - d <- stats::as.dist(d) - g <- as.factor(g) - dsize <- attr(d, "Size") - if (length(g) != dsize) { - stop( - "Length of grouping vector (g) must equal number of observations in ", - "dist object (d)") - } - dlabels <- attr(d, "Labels") - idxs <- utils::combn(dsize, 2) - idx1 <- idxs[1,] - idx2 <- idxs[2,] - - # For the between group labels, we need to keep the groups in factor order. - # Here, we record the level of the group to use for the first and second - # parts of the label. - level1 <- levels(g)[pmin(as.numeric(g[idx1]), as.numeric(g[idx2]))] - level2 <- levels(g)[pmax(as.numeric(g[idx1]), as.numeric(g[idx2]))] - - data.frame( - Item1 = if (is.null(dlabels)) idx1 else dlabels[idx1], - Item2 = if (is.null(dlabels)) idx2 else dlabels[idx2], - Group1 = g[idx1], - Group2 = g[idx2], - Label = factor(ifelse( - level1 == level2, - paste("Within", level1), - paste("Between", level1, "and", level2))), - Distance = dist_get(d, idx1, idx2), - stringsAsFactors = FALSE) -} - -#' Make a distance matrix using a custom distance function -#' -#' @param x A matrix of observations, one per row -#' @param distance_fcn A function used to compute the distance between two -#' rows of the data matrix. The two rows will be passed as the first and -#' second arguments to \code{distance_fcn}. -#' @param ... Additional arguments passed to \code{distance_fcn}. -#' @return A \code{dist} object containing the distances between rows of the -#' data matrix. -#' @details We do not set the \code{call} or \code{method} attributes of the -#' \code{dist} object. -#' @export -#' @examples -#' x <- matrix(sin(1:30), nrow=5) -#' rownames(x) <- LETTERS[1:5] -#' manhattan_distance <- function (v1, v2) sum(abs(v1 - v2)) -#' dist_make(x, manhattan_distance) -dist_make <- function (x, distance_fcn, ...) { - distance_from_idxs <- function (idxs) { - i1 <- idxs[1] - i2 <- idxs[2] - distance_fcn(x[i1,], x[i2,], ...) - } - size <- nrow(x) - ##future::plan(future::multicore) should we assume the users will do this on their end? - if (is.element("future.apply", loadedNamespaces())) { - d <- future.apply::future_apply(utils::combn(size, 2), 2, distance_from_idxs) - } else { - d <- apply(utils::combn(size, 2), 2, distance_from_idxs) - } - attr(d, "Size") <- size - xnames <- rownames(x) - if (!is.null(xnames)) { - attr(d, "Labels") <- xnames - } - attr(d, "Diag") <- FALSE - attr(d, "Upper") <- FALSE - class(d) <- "dist" - d -} +#' Set the names/labels of a \code{dist} object. +#' +#' @param d A distance matrix object of class \code{dist}. +#' @param nm New labels for the rows/columns. +#' @return A distance matrix with new row/column labels. +#' @export +#' @examples +#' m4 <- matrix(1:16, nrow=4, dimnames=list(LETTERS[1:4])) +#' dm4 <- dist(m4) +#' dist_setNames(dm4, LETTERS[9:12]) +dist_setNames <- function (d, nm) { + # Convert to matrix so errors are generated on assignment + # if nm does not contain the same number of elements as d + dm <- as.matrix(d) + dimnames(dm) <- list(nm, nm) + stats::as.dist(dm) +} + +#' Retrieve distances from a \code{dist} object. +#' +#' @param d A distance matrix object of class \code{dist}. +#' @param idx1,idx2 Indices specifying the distances to extract. +#' @return A vector of distances. +#' @export +#' @examples +#' m4 <- matrix(1:16, nrow=4, dimnames=list(LETTERS[1:4])) +#' dm4 <- dist(m4) +#' dist_get(dm4, "A", "C") +#' dist_get(dm4, "A", c("A", "B", "C", "D")) +#' dist_get(dm4, c("A", "B", "C"), c("B", "D", "B")) +dist_get <- function (d, idx1, idx2) { + d <- stats::as.dist(d) + if (is.character(idx1)) { + idx1 <- match(idx1, attr(d, "Labels")) + } + if (is.character(idx2)) { + idx2 <- match(idx2, attr(d, "Labels")) + } + n <- attr(d, "Size") + if (any(is.na(idx1) | (idx1 < 1) | (idx1 > n))) { + stop("idx1 out of range") + } + if (any(is.na(idx2) | (idx2 < 1) | (idx2 > n))) { + stop("idx2 out of range") + } + i <- pmin(idx1, idx2) + j <- pmax(idx1, idx2) + # Zeros are eliminated from index vectors + # Need to fill with NA if i and j are equal + idx <- ifelse(i == j, NA, n*(i-1) - i*(i-1)/2 + j-i) + ifelse(i == j, 0, d[idx]) +} + +#' Extract parts of a \code{dist} object. +#' +#' Extract a subset of values from a distance matrix. This function also works +#' to re-arrange the rows of a distance matrix, if they are provided in the +#' desired order. +#' +#' @param d A distance matrix object of class \code{dist}. +#' @param idx Indices specifying the subset of distances to extract. +#' @return A distance matrix. +#' @export +#' @examples +#' m4 <- matrix(1:16, nrow=4, dimnames=list(LETTERS[1:4])) +#' dm4 <- dist(m4) +#' dist_subset(dm4, c("A", "B", "C")) +#' dist_subset(dm4, c("D", "C", "B", "A")) +dist_subset <- function (d, idx) { + stats::as.dist(as.matrix(d)[idx, idx]) +} + +#' Create a data frame of distances between groups of items. +#' +#' @param d A distance matrix object of class \code{dist}. +#' @param g A factor representing the groups of objects in \code{d}. +#' @return A data frame with 6 columns: +#' \describe{ +#' \item{Item1, Item2}{The items being compared.} +#' \item{Group1, Group2}{The groups to which the items belong.} +#' \item{Label}{A convenient label for plotting or comparison.} +#' \item{Distance}{The distance between Item1 and Item2.}} +#' @export +#' @examples +#' m4 <- matrix(1:16, nrow=4, dimnames=list(LETTERS[1:4])) +#' dm4 <- dist(m4) +#' g4 <- rep(c("Control", "Treatment"), each=2) +#' dist_groups(dm4, g4) +dist_groups <- function(d, g) { + d <- stats::as.dist(d) + g <- as.factor(g) + dsize <- attr(d, "Size") + if (length(g) != dsize) { + stop( + "Length of grouping vector (g) must equal number of observations in ", + "dist object (d)") + } + dlabels <- attr(d, "Labels") + idxs <- utils::combn(dsize, 2) + idx1 <- idxs[1,] + idx2 <- idxs[2,] + + # For the between group labels, we need to keep the groups in factor order. + # Here, we record the level of the group to use for the first and second + # parts of the label. + level1 <- levels(g)[pmin(as.numeric(g[idx1]), as.numeric(g[idx2]))] + level2 <- levels(g)[pmax(as.numeric(g[idx1]), as.numeric(g[idx2]))] + + data.frame( + Item1 = if (is.null(dlabels)) idx1 else dlabels[idx1], + Item2 = if (is.null(dlabels)) idx2 else dlabels[idx2], + Group1 = g[idx1], + Group2 = g[idx2], + Label = factor(ifelse( + level1 == level2, + paste("Within", level1), + paste("Between", level1, "and", level2))), + Distance = dist_get(d, idx1, idx2), + stringsAsFactors = FALSE) +} + +#' Make a distance matrix using a custom distance function +#' +#' @param x A matrix of observations, one per row +#' @param distance_fcn A function used to compute the distance between two +#' rows of the data matrix. The two rows will be passed as the first and +#' second arguments to \code{distance_fcn}. +#' @param ... Additional arguments passed to \code{distance_fcn}. +#' @return A \code{dist} object containing the distances between rows of the +#' data matrix. +#' @details We do not set the \code{call} or \code{method} attributes of the +#' \code{dist} object. +#' @export +#' @examples +#' x <- matrix(sin(1:30), nrow=5) +#' rownames(x) <- LETTERS[1:5] +#' manhattan_distance <- function (v1, v2) sum(abs(v1 - v2)) +#' dist_make(x, manhattan_distance) +dist_make <- function (x, distance_fcn, ...) { + distance_from_idxs <- function (idxs) { + i1 <- idxs[1] + i2 <- idxs[2] + distance_fcn(x[i1,], x[i2,], ...) + } + size <- nrow(x) + ##future::plan(future::multicore) should we assume the users will do this on their end? + if (is.element("future.apply", loadedNamespaces())) { + d <- future.apply::future_apply(utils::combn(size, 2), 2, distance_from_idxs) + } else { + d <- apply(utils::combn(size, 2), 2, distance_from_idxs) + } + attr(d, "Size") <- size + xnames <- rownames(x) + if (!is.null(xnames)) { + attr(d, "Labels") <- xnames + } + attr(d, "Diag") <- FALSE + attr(d, "Upper") <- FALSE + class(d) <- "dist" + d +} diff --git a/R/long_format.R b/R/long_format.R index 77d945c..6c14173 100755 --- a/R/long_format.R +++ b/R/long_format.R @@ -1,84 +1,80 @@ -#' Convert a data frame in long format to a matrix -#' -#' @param data A data frame in long format. -#' @param rows_from The column indicating the row of the matrix. -#' @param cols_from The column indicating the column of the matrix. -#' @param values_from The column indicating the value to be placed inside the -#' matrix. -#' @param fill The value to use for missing combinations of rows and columns. -#' @param obs_col,feature_col,value_col The same as \code{rows_from}, -#' \code{cols_from}, and \code{values_from}, respectively. -#' -#' @importFrom tidyr pivot_wider -#' @importFrom tibble column_to_rownames -#' @importFrom rlang as_name ensym -#' -#' @details -#' The parameters \code{rows_from}, \code{cols_from}, and \code{values_from} -#' should be provided as bare column names. -#' -#' This function requires the packages \code{tidyr}, \code{rlang}, and -#' \code{tibble} to be installed. If they are not installed, the function will -#' generate an error, with a message to install the appropriate packages. -#' -#' @export -#' @examples -#' longdata <- data.frame( -#' sample_id = paste0("Sample", c(1, 1, 1, 2, 2, 3, 3)), -#' feature_id = paste0("Feature", c(1, 2, 3, 1, 2, 2, 3)), -#' counts = c(132, 41, 7, 56, 11, 929, 83)) -#' pivot_to_matrix(longdata, sample_id, feature_id, counts) -pivot_to_matrix <- function (data, rows_from, cols_from, values_from, fill = 0) { - check_pkg_functions( - c("rlang", "as_name"), c("rlang", "ensym"), c("tidyr", "pivot_wider"), - c("tibble", "column_to_rownames")) - - values_fill <- list(fill) - names(values_fill) <- rlang::as_name(rlang::ensym(values_from)) - data_wide <- tidyr::pivot_wider( - data, - id_cols = {{ rows_from }}, - names_from = {{ cols_from }}, - values_from = {{ values_from }}, - values_fill = values_fill) - data_wide <- tibble::column_to_rownames( - data_wide, rlang::as_name(rlang::ensym(rows_from))) - as.matrix(as.data.frame(data_wide)) -} - -#' @describeIn pivot_to_matrix Specialized version for numeric values. -#' Deprecated; use \code{pivot_to_matrix} instead. -#' @export -pivot_to_numeric_matrix <- function (data, obs_col, feature_col, value_col) { - warning("Deprecated - please use pivot_to_matrix() instead.") - pivot_to_matrix( - data, {{ obs_col }}, {{ feature_col }}, {{ value_col }}, fill = 0) -} - -check_pkg_functions <- function (...) { - to_check <- list(...) - messages <- vapply( - to_check, error_message_for_pkg_function, FUN.VALUE = "a") - messages <- unique(messages[!is.na(messages)]) - if (length(messages) > 0) { - combined_messages <- paste( - "The following packages or functions are not available:", - messages, sep = " ", collapse = " ") - stop(combined_messages, call. = FALSE) - } -} - -error_message_for_pkg_function <- function (x) { - pkg <- x[1] - fcn <- x[2] - pkg_is_installed <- requireNamespace(pkg, quietly = TRUE) - if (!pkg_is_installed) { - return(paste("Package", pkg, "is not installed.")) - } - fcn_is_available <- exists(fcn, where=asNamespace(pkg), mode="function") - if (!fcn_is_available) { - return(paste( - "Package", pkg, "is installed but function", fcn, "is not available.")) - } - NA_character_ -} +#' Convert a data frame in long format to a matrix +#' +#' @param data A data frame in long format. +#' @param rows_from The column indicating the row of the matrix. +#' @param cols_from The column indicating the column of the matrix. +#' @param values_from The column indicating the value to be placed inside the +#' matrix. +#' @param fill The value to use for missing combinations of rows and columns. +#' @param obs_col,feature_col,value_col The same as \code{rows_from}, +#' \code{cols_from}, and \code{values_from}, respectively. +#' +#' @details +#' The parameters \code{rows_from}, \code{cols_from}, and \code{values_from} +#' should be provided as bare column names. +#' +#' This function requires the packages \code{tidyr}, \code{rlang}, and +#' \code{tibble} to be installed. If they are not installed, the function will +#' generate an error, with a message to install the appropriate packages. +#' +#' @export +#' @examples +#' longdata <- data.frame( +#' sample_id = paste0("Sample", c(1, 1, 1, 2, 2, 3, 3)), +#' feature_id = paste0("Feature", c(1, 2, 3, 1, 2, 2, 3)), +#' counts = c(132, 41, 7, 56, 11, 929, 83)) +#' pivot_to_matrix(longdata, sample_id, feature_id, counts) +pivot_to_matrix <- function (data, rows_from, cols_from, values_from, fill = 0) { + check_pkg_functions( + c("rlang", "as_name"), c("rlang", "ensym"), c("tidyr", "pivot_wider"), + c("tibble", "column_to_rownames")) + + values_fill <- list(fill) + names(values_fill) <- rlang::as_name(rlang::ensym(values_from)) + data_wide <- tidyr::pivot_wider( + data, + id_cols = {{ rows_from }}, + names_from = {{ cols_from }}, + values_from = {{ values_from }}, + values_fill = values_fill) + data_wide <- tibble::column_to_rownames( + data_wide, rlang::as_name(rlang::ensym(rows_from))) + as.matrix(as.data.frame(data_wide)) +} + +#' @describeIn pivot_to_matrix Specialized version for numeric values. +#' Deprecated; use \code{pivot_to_matrix} instead. +#' @export +pivot_to_numeric_matrix <- function (data, obs_col, feature_col, value_col) { + warning("Deprecated - please use pivot_to_matrix() instead.") + pivot_to_matrix( + data, {{ obs_col }}, {{ feature_col }}, {{ value_col }}, fill = 0) +} + +check_pkg_functions <- function (...) { + to_check <- list(...) + messages <- vapply( + to_check, error_message_for_pkg_function, FUN.VALUE = "a") + messages <- unique(messages[!is.na(messages)]) + if (length(messages) > 0) { + combined_messages <- paste( + "The following packages or functions are not available:", + messages, sep = " ", collapse = " ") + stop(combined_messages, call. = FALSE) + } +} + +error_message_for_pkg_function <- function (x) { + pkg <- x[1] + fcn <- x[2] + pkg_is_installed <- requireNamespace(pkg, quietly = TRUE) + if (!pkg_is_installed) { + return(paste("Package", pkg, "is not installed.")) + } + fcn_is_available <- exists(fcn, where=asNamespace(pkg), mode="function") + if (!fcn_is_available) { + return(paste( + "Package", pkg, "is installed but function", fcn, "is not available.")) + } + NA_character_ +} diff --git a/R/usedist-package.R b/R/usedist-package.R new file mode 100755 index 0000000..8794ab2 --- /dev/null +++ b/R/usedist-package.R @@ -0,0 +1,10 @@ +#' usedist: a package for working with distance matrices in R +#' +#' In usedist, we provide a number of functions to help with distance matrix +#' objects, such as those produced by the \code{dist} function. Some functions +#' are geared towards making or altering distance matrix objects. Others +#' relate to groups of items in the distance matrix. They provide access to +#' within- or between-group distances, or use these distances to infer the +#' distance to group centroids. +#' @keywords internal +"_PACKAGE" \ No newline at end of file diff --git a/README.Rmd b/README.Rmd index bb5c8f0..2932df4 100755 --- a/README.Rmd +++ b/README.Rmd @@ -1,241 +1,240 @@ ---- -output: github_document ---- - - - -```{r, echo = FALSE} -knitr::opts_chunk$set( - fig.path = "tools/readme/" -) -``` - -```{r echo=FALSE, message=FALSE} -devtools::load_all() -set.seed(0) -``` - -# usedist - -This package provides useful functions for distance matrix objects in R. - - -[![R-CMD-check](https://github.com/kylebittinger/usedist/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kylebittinger/usedist/actions/workflows/R-CMD-check.yaml) -[![Codecov test coverage](https://codecov.io/gh/kylebittinger/usedist/branch/master/graph/badge.svg)](https://app.codecov.io/gh/kylebittinger/usedist?branch=master) - - -## Installation - -You can install usedist from github with: - -```{r eval=FALSE} -# install.packages("devtools") -devtools::install_github("kylebittinger/usedist") -``` - -## Introduction to distance matrices in R - -In R, the `dist()` function is used to compute a distance matrix. But the -result you get back isn't really a matrix, it's a `"dist"` object. Under -the hood, the `"dist"` object is stored as a simple vector. When it's -printed out, R knows how to make it look like a matrix. Let's make a distance -object representing the distances between six rows of data. - -Here is our data matrix, `X`: - -```{r} -X <- matrix(rnorm(30), nrow=6) -rownames(X) <- c("A", "B", "C", "D", "E", "F") -X -``` - -And here is our `"dist"` object, `d`, representing the distance between rows of -`X`: - -```{r} -d <- dist(X) -d -``` - -These `"dist"` objects are great, but R does not provide a set of functions to -work with them conveniently. That's where the `usedist` package comes in. - -## Working with "dist" objects - -The `usedist` package provides some basic functions for altering or selecting -distances from a `"dist"` object. - -```{r eval=FALSE} -library(usedist) -``` - -To start, we can make a new `"dist"` object, containing the distances between -rows B, C, F, and D. Our new object contains the rows *in the order we -specified*: - -```{r} -dist_subset(d, c("B", "C", "F", "D")) -``` - -This is especially helpful when arranging a distance matrix to match a data -frame, for instance with the `adonis()` function in `vegan`. - -We can extract distances between specified pairs of rows. For example, -we'll pull out the distances for rows A-to-D, B-to-E, and C-to-F. To extract -specific distance values, we use `dist_get()`. This function takes two vectors -of row labels: one vector for the rows of origin, and another for the rows of -destination. - -```{r} -origin_row <- c("A", "B", "C") -destination_row <- c("D", "E", "F") -dist_get(d, origin_row, destination_row) -``` - -If rows are arranged in groups, we might like to have a data frame listing the -distances alongside the groups for each pair of rows. The `dist_groups()` -function makes a data frame from the groups, and also adds in a nice label that -you might use for plots. - -```{r} -item_groups <- rep(c("Control", "Treatment"), each=3) -dist_groups(d, item_groups) -``` - -You might have your own distance function that you'd like to use, beyond the -options available in `dist()` or `vegan::vegdist()`. For example, the RMS -distance is kind of like the Euclidean distance, but you take the mean of the -squared differences instead of the sum inside the square root. Let's define the -distance function: - -```{r} -rms_distance <- function (r1, r2) { - sqrt(mean((r2- r1) ^ 2)) -} -``` - -Then, we can pass it to `dist_make()` to create a new distance matrix of RMS -distances. - -```{r} -dist_make(X, rms_distance) -``` - -## Centroid functions - -The `usedist` package contains functions for computing the distance to group -centroid positions. This is accomplished without finding the location of the -centroids themselves, though it is assumed that some high-dimensional Euclidean -space exists where the centroids can be situated. References for the formulas -used can be found in the function documentation. - -To illustrate, let's create a set of points in 2-dimensional space. Four -points will be centered around the origin, and four around the point (3, 0). - -```{r centroid_example} -pts <- data.frame( - x = c(-1, 0, 0, 1, 2, 3, 3, 4), - y = c(0, 1, -1, 0, 0, 1, -1, 0), - Item = LETTERS[1:8], - Group = rep(c("Control", "Treatment"), each=4)) - -library(ggplot2) -ggplot(pts, aes(x=x, y=y)) + - geom_point(aes(color=Group)) + - geom_text(aes(label=Item), hjust=1.5) + - coord_equal() -``` - -Our goal is to figure out distances for the group centroids using only the -distances between points. First, we need to put the data in matrix format. - -```{r} -pts_matrix <- as.matrix(pts[,c("x", "y")]) -rownames(pts_matrix) <- pts$Item -``` - -Now, we'll compute the point-to-point distances with `dist()`. - -```{r} -pts_distances <- dist(pts_matrix) -pts_distances -``` - -The function `dist_between_centroids()` will calculate the distance between -the centroids of the two groups. Here, we expect to get a distance of 3. - -```{r} -dist_between_centroids( - pts_distances, c("A", "B", "C", "D"), c("E", "F", "G", "H")) -``` - -The function is only using the distance matrix; it doesn't know where the -individual points are in space. - -We can use another function, `dist_to_centroids()`, to calculate the distance -from each individual point to the group centroids. Again, this works without -knowing the point locations, only the distances between points. In our example, -the distances within the Control group and within the Treatment group should -all be equal to 1. - -```{r} -dist_to_centroids(pts_distances, pts$Group) -``` - -You can use the Pythagorean theorem to check that the other distances are -correct. The distance between point "G" and the centroid for the *Control* -group should be sqrt(3^2^ + 1^2^) = sqrt(10) = 3.162278. - -## Long format data - -Many times, the data is not stored as a matrix, but is represented in "long" -format as a data frame. In this case, one column of the data frame gives the -row label for the matrix, another indicates the column label, and a -third provides the value. To get a real data matrix, we have to "pivot" the -data frame and convert to matrix form. Because this is such a common operation, -`usedist` includes a convenience function, `pivot_to_matrix()`. - -Here is an example of data in long format: - -```{r} -data_long <- data.frame( - row_id = c("A", "A", "A", "B", "B", "C", "C"), - column_id = c("x", "y", "z", "x", "y", "y", "z"), - value = rpois(7, 12)) -data_long -``` - -The data table has no value for row "B" and column "z". By default, a -value of 0 is filled in for missing combinations when we convert to matrix -format. Here is how we convert: - -```{r} -data_matrix <- pivot_to_matrix(data_long, row_id, column_id, value) -data_matrix -``` - -Note that we provide bare column names in the call to -`pivot_to_matrix()`. This function requires some extra packages to -be installed. They are listed as suggestions for `usedist`. If the -additional packages are not installed on your system, you'll get an error -message with the missing packages listed. - -The matrix format is what we need for distance calculations. If you want to -convert from long format and use a custom distance function, you can combine -`pivot_to_matrix()` with `dist_make()`: - -```{r} -dist_make(data_matrix, rms_distance) -``` - -## Parallelization - -Distance calculations can get computationally expensive with large sample sizes. -With the installation of future.apply package, you cna compute the distances in -parallel to save time. - -```{r message=F, warning=F} -library(future.apply) -future::plan(future::multisession) -dist_make(data_matrix, rms_distance) -``` +--- +output: github_document +--- + + + +```{r, echo = FALSE} +knitr::opts_chunk$set( + fig.path = "tools/readme/" +) +``` + +```{r echo=FALSE, message=FALSE} +devtools::load_all() +set.seed(0) +``` + +# usedist + +This package provides useful functions for distance matrix objects in R. + + +[![R-CMD-check](https://github.com/kylebittinger/usedist/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kylebittinger/usedist/actions/workflows/R-CMD-check.yaml) + + +## Installation + +You can install usedist from github with: + +```{r eval=FALSE} +# install.packages("devtools") +devtools::install_github("kylebittinger/usedist") +``` + +## Introduction to distance matrices in R + +In R, the `dist()` function is used to compute a distance matrix. But the +result you get back isn't really a matrix, it's a `"dist"` object. Under +the hood, the `"dist"` object is stored as a simple vector. When it's +printed out, R knows how to make it look like a matrix. Let's make a distance +object representing the distances between six rows of data. + +Here is our data matrix, `X`: + +```{r} +X <- matrix(rnorm(30), nrow=6) +rownames(X) <- c("A", "B", "C", "D", "E", "F") +X +``` + +And here is our `"dist"` object, `d`, representing the distance between rows of +`X`: + +```{r} +d <- dist(X) +d +``` + +These `"dist"` objects are great, but R does not provide a set of functions to +work with them conveniently. That's where the `usedist` package comes in. + +## Working with "dist" objects + +The `usedist` package provides some basic functions for altering or selecting +distances from a `"dist"` object. + +```{r eval=FALSE} +library(usedist) +``` + +To start, we can make a new `"dist"` object, containing the distances between +rows B, C, F, and D. Our new object contains the rows *in the order we +specified*: + +```{r} +dist_subset(d, c("B", "C", "F", "D")) +``` + +This is especially helpful when arranging a distance matrix to match a data +frame, for instance with the `adonis()` function in `vegan`. + +We can extract distances between specified pairs of rows. For example, +we'll pull out the distances for rows A-to-D, B-to-E, and C-to-F. To extract +specific distance values, we use `dist_get()`. This function takes two vectors +of row labels: one vector for the rows of origin, and another for the rows of +destination. + +```{r} +origin_row <- c("A", "B", "C") +destination_row <- c("D", "E", "F") +dist_get(d, origin_row, destination_row) +``` + +If rows are arranged in groups, we might like to have a data frame listing the +distances alongside the groups for each pair of rows. The `dist_groups()` +function makes a data frame from the groups, and also adds in a nice label that +you might use for plots. + +```{r} +item_groups <- rep(c("Control", "Treatment"), each=3) +dist_groups(d, item_groups) +``` + +You might have your own distance function that you'd like to use, beyond the +options available in `dist()` or `vegan::vegdist()`. For example, the RMS +distance is kind of like the Euclidean distance, but you take the mean of the +squared differences instead of the sum inside the square root. Let's define the +distance function: + +```{r} +rms_distance <- function (r1, r2) { + sqrt(mean((r2- r1) ^ 2)) +} +``` + +Then, we can pass it to `dist_make()` to create a new distance matrix of RMS +distances. + +```{r} +dist_make(X, rms_distance) +``` + +## Centroid functions + +The `usedist` package contains functions for computing the distance to group +centroid positions. This is accomplished without finding the location of the +centroids themselves, though it is assumed that some high-dimensional Euclidean +space exists where the centroids can be situated. References for the formulas +used can be found in the function documentation. + +To illustrate, let's create a set of points in 2-dimensional space. Four +points will be centered around the origin, and four around the point (3, 0). + +```{r centroid_example} +pts <- data.frame( + x = c(-1, 0, 0, 1, 2, 3, 3, 4), + y = c(0, 1, -1, 0, 0, 1, -1, 0), + Item = LETTERS[1:8], + Group = rep(c("Control", "Treatment"), each=4)) + +library(ggplot2) +ggplot(pts, aes(x=x, y=y)) + + geom_point(aes(color=Group)) + + geom_text(aes(label=Item), hjust=1.5) + + coord_equal() +``` + +Our goal is to figure out distances for the group centroids using only the +distances between points. First, we need to put the data in matrix format. + +```{r} +pts_matrix <- as.matrix(pts[,c("x", "y")]) +rownames(pts_matrix) <- pts$Item +``` + +Now, we'll compute the point-to-point distances with `dist()`. + +```{r} +pts_distances <- dist(pts_matrix) +pts_distances +``` + +The function `dist_between_centroids()` will calculate the distance between +the centroids of the two groups. Here, we expect to get a distance of 3. + +```{r} +dist_between_centroids( + pts_distances, c("A", "B", "C", "D"), c("E", "F", "G", "H")) +``` + +The function is only using the distance matrix; it doesn't know where the +individual points are in space. + +We can use another function, `dist_to_centroids()`, to calculate the distance +from each individual point to the group centroids. Again, this works without +knowing the point locations, only the distances between points. In our example, +the distances within the Control group and within the Treatment group should +all be equal to 1. + +```{r} +dist_to_centroids(pts_distances, pts$Group) +``` + +You can use the Pythagorean theorem to check that the other distances are +correct. The distance between point "G" and the centroid for the *Control* +group should be sqrt(3^2^ + 1^2^) = sqrt(10) = 3.162278. + +## Long format data + +Many times, the data is not stored as a matrix, but is represented in "long" +format as a data frame. In this case, one column of the data frame gives the +row label for the matrix, another indicates the column label, and a +third provides the value. To get a real data matrix, we have to "pivot" the +data frame and convert to matrix form. Because this is such a common operation, +`usedist` includes a convenience function, `pivot_to_matrix()`. + +Here is an example of data in long format: + +```{r} +data_long <- data.frame( + row_id = c("A", "A", "A", "B", "B", "C", "C"), + column_id = c("x", "y", "z", "x", "y", "y", "z"), + value = rpois(7, 12)) +data_long +``` + +The data table has no value for row "B" and column "z". By default, a +value of 0 is filled in for missing combinations when we convert to matrix +format. Here is how we convert: + +```{r} +data_matrix <- pivot_to_matrix(data_long, row_id, column_id, value) +data_matrix +``` + +Note that we provide bare column names in the call to +`pivot_to_matrix()`. This function requires some extra packages to +be installed. They are listed as suggestions for `usedist`. If the +additional packages are not installed on your system, you'll get an error +message with the missing packages listed. + +The matrix format is what we need for distance calculations. If you want to +convert from long format and use a custom distance function, you can combine +`pivot_to_matrix()` with `dist_make()`: + +```{r} +dist_make(data_matrix, rms_distance) +``` + +## Parallelization + +Distance calculations can get computationally expensive with large sample sizes. +With the installation of future.apply package, you can compute the distances in +parallel to save time. + +```{r message=F, warning=F} +library(future.apply) +future::plan(future::multisession) +dist_make(data_matrix, rms_distance) +``` diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 39a380b..d1f0b10 --- a/README.md +++ b/README.md @@ -8,8 +8,6 @@ This package provides useful functions for distance matrix objects in R. [![R-CMD-check](https://github.com/kylebittinger/usedist/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kylebittinger/usedist/actions/workflows/R-CMD-check.yaml) -[![Codecov test -coverage](https://codecov.io/gh/kylebittinger/usedist/branch/master/graph/badge.svg)](https://app.codecov.io/gh/kylebittinger/usedist?branch=master) ## Installation @@ -321,7 +319,7 @@ dist_make(data_matrix, rms_distance) ## Parallelization Distance calculations can get computationally expensive with large -sample sizes. With the installation of future.apply package, you cna +sample sizes. With the installation of future.apply package, you can compute the distances in parallel to save time. ``` r diff --git a/cran-comments.md b/cran-comments.md old mode 100644 new mode 100755 diff --git a/man/dist_between_centroids.Rd b/man/dist_between_centroids.Rd old mode 100644 new mode 100755 diff --git a/man/dist_get.Rd b/man/dist_get.Rd old mode 100644 new mode 100755 diff --git a/man/dist_groups.Rd b/man/dist_groups.Rd old mode 100644 new mode 100755 diff --git a/man/dist_make.Rd b/man/dist_make.Rd old mode 100644 new mode 100755 diff --git a/man/dist_multi_centroids.Rd b/man/dist_multi_centroids.Rd old mode 100644 new mode 100755 diff --git a/man/dist_setNames.Rd b/man/dist_setNames.Rd old mode 100644 new mode 100755 diff --git a/man/dist_subset.Rd b/man/dist_subset.Rd old mode 100644 new mode 100755 diff --git a/man/dist_to_centroids.Rd b/man/dist_to_centroids.Rd old mode 100644 new mode 100755 diff --git a/man/usedist.Rd b/man/usedist-package.Rd old mode 100644 new mode 100755 similarity index 73% rename from man/usedist.Rd rename to man/usedist-package.Rd index 67b6559..c1522b6 --- a/man/usedist.Rd +++ b/man/usedist-package.Rd @@ -1,8 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dist.R +% Please edit documentation in R/usedist-package.R \docType{package} -\name{usedist} +\name{usedist-package} \alias{usedist} +\alias{usedist-package} \title{usedist: a package for working with distance matrices in R} \description{ In usedist, we provide a number of functions to help with distance matrix @@ -12,3 +13,8 @@ relate to groups of items in the distance matrix. They provide access to within- or between-group distances, or use these distances to infer the distance to group centroids. } +\author{ +\strong{Maintainer}: Kyle Bittinger \email{kylebittinger@gmail.com} + +} +\keyword{internal} diff --git a/tests/testthat.R b/tests/testthat.R old mode 100644 new mode 100755 diff --git a/tests/testthat/test_long_format.R b/tests/testthat/test_long_format.R old mode 100644 new mode 100755 diff --git a/tools/readme/centroid_example-1.png b/tools/readme/centroid_example-1.png index ceb5ae9..b2d2ffb 100755 Binary files a/tools/readme/centroid_example-1.png and b/tools/readme/centroid_example-1.png differ diff --git a/usedist.Rproj b/usedist.Rproj new file mode 100755 index 0000000..8d82f1f --- /dev/null +++ b/usedist.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source