forked from quanteda/quanteda
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocfreq.Rd
81 lines (73 loc) · 3.06 KB
/
docfreq.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm_weight.R
\name{docfreq}
\alias{docfreq}
\title{Compute the (weighted) document frequency of a feature}
\usage{
docfreq(x, scheme = c("count", "inverse", "inversemax", "inverseprob",
"unary"), smoothing = 0, k = 0, base = 10, threshold = 0,
use.names = TRUE)
}
\arguments{
\item{x}{a \link{dfm}}
\item{scheme}{type of document frequency weighting, computed as
follows, where \eqn{N} is defined as the number of documents in the dfm and
\eqn{s} is the smoothing constant:
\describe{
\item{\code{count}}{\eqn{df_j}, the number of documents for which \eqn{n_{ij} > threshold}}
\item{\code{inverse}}{\deqn{\textrm{log}_{base}\left(s + \frac{N}{k + df_j}\right)}}
\item{\code{inversemax}}{\deqn{\textrm{log}_{base}\left(s + \frac{\textrm{max}(df_j)}{k + df_j}\right)}}
\item{\code{inverseprob}}{\deqn{\textrm{log}_{base}\left(\frac{N - df_j}{k + df_j}\right)}}
\item{\code{unary}}{1 for each feature}
}}
\item{smoothing}{added to the quotient before taking the logarithm}
\item{k}{added to the denominator in the "inverse" weighting types, to
prevent division by zero when a term has a document count of zero}
\item{base}{the base with respect to which logarithms in the inverse document
frequency weightings are computed; default is 10 (see Manning,
Raghavan, and Schütze 2008, p. 123).}
\item{threshold}{numeric value of the threshold \emph{above which} a feature
will be considered in the computation of document frequency. The default is
0, meaning that a feature's document frequency will be the number of
documents in which it occurs more than zero times.}
\item{use.names}{logical; if \code{TRUE} attach feature labels as names of
the resulting numeric vector}
\item{...}{not used}
}
\value{
a numeric vector of document frequencies for each feature
}
\description{
For a \link{dfm} object, returns a (weighted) document frequency for each
term. The default is a simple count of the number of documents in which a
feature occurs more often than a given frequency threshold. (The default
threshold is zero, meaning that any feature occurring at least once in a
document will be counted.)
}
\examples{
dfmat1 <- dfm(data_corpus_inaugural[1:2])
docfreq(dfmat1[, 1:20])
# replication of worked example from
# https://en.wikipedia.org/wiki/Tf-idf#Example_of_tf.E2.80.93idf
dfmat2 <-
matrix(c(1,1,2,1,0,0, 1,1,0,0,2,3),
byrow = TRUE, nrow = 2,
dimnames = list(docs = c("document1", "document2"),
features = c("this", "is", "a", "sample",
"another", "example"))) \%>\%
as.dfm()
dfmat2
docfreq(dfmat2)
docfreq(dfmat2, scheme = "inverse")
docfreq(dfmat2, scheme = "inverse", k = 1, smoothing = 1)
docfreq(dfmat2, scheme = "unary")
docfreq(dfmat2, scheme = "inversemax")
docfreq(dfmat2, scheme = "inverseprob")
}
\references{
Manning, C. D., Raghavan, P., & Schütze, H. (2008).
\emph{Introduction to Information Retrieval}. Cambridge: Cambridge University Press.
\url{https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf}
}
\keyword{dfm}
\keyword{weighting}