34
34
# ' \item{\code{"K"}:}{Yule's \emph{K} (Yule, 1944, as presented in Tweedie &
35
35
# ' Baayen, 1998, Eq. 16) is calculated by: \deqn{K = 10^4 \times
36
36
# ' \left[ -\frac{1}{N} + \sum_{i=1}^{V} f_v(i, N) \left( \frac{i}{N} \right)^2 \right] }}
37
+ # '
38
+ # ' \item{\code{"I"}:}{Yule's \emph{I} (Yule, 1944) is calculated by: \deqn{I = \frac{V^2}{M_2 - V}}
39
+ # ' \deqn{M_2 = \sum_{i=1}^{V} i^2 * f_v(i, N)}
37
40
# '
38
41
# ' \item{\code{"D"}:}{Simpson's \emph{D} (Simpson 1949, as presented in
39
42
# ' Tweedie & Baayen, 1998, Eq. 17) is calculated by:
142
145
# ' toks <- tokens(corpus_subset(data_corpus_inaugural, Year > 2000))
143
146
# ' textstat_lexdiv(toks, c("CTTR", "TTR", "MATTR"), MATTR_window = 100)
144
147
textstat_lexdiv <- function (x ,
145
- measure = c(" TTR" , " C" , " R" , " CTTR" , " U" , " S" , " K" , " D" ,
148
+ measure = c(" TTR" , " C" , " R" , " CTTR" , " U" , " S" , " K" , " I " , " D" ,
146
149
" Vm" , " Maas" , " MATTR" , " MSTTR" , " all" ),
147
150
remove_numbers = TRUE , remove_punct = TRUE ,
148
151
remove_symbols = TRUE , remove_hyphens = FALSE ,
@@ -160,7 +163,7 @@ textstat_lexdiv.default <- function(x, ...) {
160
163
161
164
# ' @export
162
165
textstat_lexdiv.dfm <- function (x ,
163
- measure = c(" TTR" , " C" , " R" , " CTTR" , " U" , " S" , " K" , " D" ,
166
+ measure = c(" TTR" , " C" , " R" , " CTTR" , " U" , " S" , " K" , " I " , " D" ,
164
167
" Vm" , " Maas" , " all" ),
165
168
remove_numbers = TRUE , remove_punct = TRUE ,
166
169
remove_symbols = TRUE , remove_hyphens = FALSE ,
@@ -204,7 +207,7 @@ textstat_lexdiv.dfm <- function(x,
204
207
# ' @export
205
208
textstat_lexdiv.tokens <-
206
209
function (x ,
207
- measure = c(" TTR" , " C" , " R" , " CTTR" , " U" , " S" , " K" , " D" ,
210
+ measure = c(" TTR" , " C" , " R" , " CTTR" , " U" , " S" , " K" , " I " , " D" ,
208
211
" Vm" , " Maas" , " MATTR" , " MSTTR" , " all" ),
209
212
remove_numbers = TRUE , remove_punct = TRUE ,
210
213
remove_symbols = TRUE , remove_hyphens = FALSE ,
279
282
compute_lexdiv_dfm_stats <- function (x , measure = NULL , log.base = 10 ) {
280
283
281
284
n_tokens <- n_types <- TTR <- C <- R <- CTTR <- U <- S <- Maas <-
282
- lgV0 <- lgeV0 <- K <- D <- Vm <- NULL
285
+ lgV0 <- lgeV0 <- K <- D <- Vm <- I <- NULL
283
286
temp <- data.table(n_tokens = ntoken(x ), n_types = ntype(x ))
284
287
285
288
if (" TTR" %in% measure )
@@ -302,9 +305,9 @@ compute_lexdiv_dfm_stats <- function(x, measure = NULL, log.base = 10) {
302
305
temp [, S : = log(log(n_types , base = log.base ), base = log.base ) /
303
306
log(log(n_tokens , base = log.base ), base = log.base )]
304
307
305
- # computations for K, D, Vm
308
+ # computations for K, D, Vm, I
306
309
# produces a list of data.frames that will be used for computing the measures
307
- if (length(intersect(c(" K" , " D" , " Vm" ), measure ))) {
310
+ if (length(intersect(c(" K" , " D" , " Vm" , " I " ), measure ))) {
308
311
ViN <- lapply(docnames(x ), function (y ) {
309
312
result <- as.data.frame(table(colSums(x [y , ])), stringsAsFactors = FALSE )
310
313
names(result ) <- c(" i" , " ViN" )
@@ -317,7 +320,14 @@ compute_lexdiv_dfm_stats <- function(x, measure = NULL, log.base = 10) {
317
320
318
321
if (" K" %in% measure )
319
322
temp [, K : = 10 ^ 4 * vapply(ViN , function (y ) sum(y $ ViN * (y $ i / y $ n_tokens ) ^ 2 ), numeric (1 ))]
320
-
323
+ if (" I" %in% measure ) {
324
+ M_2 <- vapply(ViN , function (y ) sum(y $ ViN * y $ i ^ 2 ), numeric (1 ))
325
+ M_1 <- temp $ n_types
326
+ yule_i <- (M_1 ^ 2 ) / (M_2 - M_1 )
327
+ yule_i [yule_i == Inf ] <- 0
328
+ temp [, I : = yule_i ]
329
+ }
330
+
321
331
if (" D" %in% measure )
322
332
temp [, D : = vapply(ViN ,
323
333
function (y ) sum(y $ ViN * (y $ i / y $ n_tokens ) * ( (y $ i - 1 ) / (y $ n_tokens - 1 )) ),
0 commit comments