Skip to content

Commit 75d279d

Browse files
committed
Merge branch 'yule_i' of https://github.com/chainsawriot/quanteda into chainsawriot-yule_i
2 parents 5e63c6b + 8aabd69 commit 75d279d

File tree

3 files changed

+32
-74
lines changed

3 files changed

+32
-74
lines changed

R/textstat_lexdiv.R

+17-7
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@
3434
#' \item{\code{"K"}:}{Yule's \emph{K} (Yule, 1944, as presented in Tweedie &
3535
#' Baayen, 1998, Eq. 16) is calculated by: \deqn{K = 10^4 \times
3636
#' \left[ -\frac{1}{N} + \sum_{i=1}^{V} f_v(i, N) \left( \frac{i}{N} \right)^2 \right] }}
37+
#'
38+
#' \item{\code{"I"}:}{Yule's \emph{I} (Yule, 1944) is calculated by: \deqn{I = \frac{V^2}{M_2 - V}}
39+
#' \deqn{M_2 = \sum_{i=1}^{V} i^2 * f_v(i, N)}
3740
#'
3841
#' \item{\code{"D"}:}{Simpson's \emph{D} (Simpson 1949, as presented in
3942
#' Tweedie & Baayen, 1998, Eq. 17) is calculated by:
@@ -142,7 +145,7 @@
142145
#' toks <- tokens(corpus_subset(data_corpus_inaugural, Year > 2000))
143146
#' textstat_lexdiv(toks, c("CTTR", "TTR", "MATTR"), MATTR_window = 100)
144147
textstat_lexdiv <- function(x,
145-
measure = c("TTR", "C", "R", "CTTR", "U", "S", "K", "D",
148+
measure = c("TTR", "C", "R", "CTTR", "U", "S", "K", "I","D",
146149
"Vm", "Maas", "MATTR", "MSTTR", "all"),
147150
remove_numbers = TRUE, remove_punct = TRUE,
148151
remove_symbols = TRUE, remove_hyphens = FALSE,
@@ -160,7 +163,7 @@ textstat_lexdiv.default <- function(x, ...) {
160163

161164
#' @export
162165
textstat_lexdiv.dfm <- function(x,
163-
measure = c("TTR", "C", "R", "CTTR", "U", "S", "K", "D",
166+
measure = c("TTR", "C", "R", "CTTR", "U", "S", "K", "I", "D",
164167
"Vm", "Maas", "all"),
165168
remove_numbers = TRUE, remove_punct = TRUE,
166169
remove_symbols = TRUE, remove_hyphens = FALSE,
@@ -204,7 +207,7 @@ textstat_lexdiv.dfm <- function(x,
204207
#' @export
205208
textstat_lexdiv.tokens <-
206209
function(x,
207-
measure = c("TTR", "C", "R", "CTTR", "U", "S", "K", "D",
210+
measure = c("TTR", "C", "R", "CTTR", "U", "S", "K", "I", "D",
208211
"Vm", "Maas", "MATTR", "MSTTR", "all"),
209212
remove_numbers = TRUE, remove_punct = TRUE,
210213
remove_symbols = TRUE, remove_hyphens = FALSE,
@@ -279,7 +282,7 @@ NULL
279282
compute_lexdiv_dfm_stats <- function(x, measure = NULL, log.base = 10) {
280283

281284
n_tokens <- n_types <- TTR <- C <- R <- CTTR <- U <- S <- Maas <-
282-
lgV0 <- lgeV0 <- K <- D <- Vm <- NULL
285+
lgV0 <- lgeV0 <- K <- D <- Vm <- I <- NULL
283286
temp <- data.table(n_tokens = ntoken(x), n_types = ntype(x))
284287

285288
if ("TTR" %in% measure)
@@ -302,9 +305,9 @@ compute_lexdiv_dfm_stats <- function(x, measure = NULL, log.base = 10) {
302305
temp[, S := log(log(n_types, base = log.base), base = log.base) /
303306
log(log(n_tokens, base = log.base), base = log.base)]
304307

305-
# computations for K, D, Vm
308+
# computations for K, D, Vm, I
306309
# produces a list of data.frames that will be used for computing the measures
307-
if (length(intersect(c("K", "D", "Vm"), measure))) {
310+
if (length(intersect(c("K", "D", "Vm", "I"), measure))) {
308311
ViN <- lapply(docnames(x), function(y) {
309312
result <- as.data.frame(table(colSums(x[y, ])), stringsAsFactors = FALSE)
310313
names(result) <- c("i", "ViN")
@@ -317,7 +320,14 @@ compute_lexdiv_dfm_stats <- function(x, measure = NULL, log.base = 10) {
317320

318321
if ("K" %in% measure)
319322
temp[, K := 10 ^ 4 * vapply(ViN, function(y) sum(y$ViN * (y$i / y$n_tokens) ^ 2), numeric(1))]
320-
323+
if ("I" %in% measure) {
324+
M_2 <- vapply(ViN, function(y) sum(y$ViN * y$i^2), numeric(1))
325+
M_1 <- temp$n_types
326+
yule_i <- (M_1 ^ 2) / (M_2 - M_1)
327+
yule_i[yule_i== Inf] <- 0
328+
temp[, I := yule_i]
329+
}
330+
321331
if ("D" %in% measure)
322332
temp[, D := vapply(ViN,
323333
function(y) sum(y$ViN * (y$i / y$n_tokens) * ( (y$i - 1) / (y$n_tokens - 1)) ),

man/textstat_lexdiv.Rd

+4-67
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-textstat_lexdiv.R

+11
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,17 @@ test_that("textstat_lexdiv Maas works correct", {
5454
)
5555
})
5656

57+
test_that("textstat_lexdiv I works correct", {
58+
mydfm <- dfm(c(d1 = "a b c",
59+
d2 = "a a b b c"))
60+
expect_equivalent(
61+
textstat_lexdiv(mydfm, "I")$I[1], 0, tolerance = 0.01
62+
)
63+
expect_equivalent(
64+
textstat_lexdiv(mydfm, "I")$I[2], (3^2) / ((1 + 2 * 2^2) - 3), tolerance = 0.01
65+
)
66+
})
67+
5768
test_that("textstat_lexdiv works with a single document dfm (#706)", {
5869
mytxt <- "one one two one one two one"
5970
mydfm <- dfm(mytxt)

0 commit comments

Comments
 (0)