dfm_trim.Rd
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm_trim.R
\name{dfm_trim}
\alias{dfm_trim}
\title{Trim a dfm using frequency threshold-based feature selection}
\usage{
dfm_trim(x, min_termfreq = NULL, max_termfreq = NULL,
termfreq_type = c("count", "prop", "rank", "quantile"),
min_docfreq = NULL, max_docfreq = NULL, docfreq_type = c("count",
"prop", "rank", "quantile"), sparsity = NULL,
verbose = quanteda_options("verbose"), ...)
}
\arguments{
\item{x}{a \link{dfm} object}
\item{min_termfreq, max_termfreq}{minimum/maximum values of feature frequencies
across all documents, below/above which features will
be removed}
\item{termfreq_type}{how \code{min_termfreq} and \code{max_termfreq} are
interpreted. \code{"count"} sums the frequencies; \code{"prop"} divides the
term frequencies by the total sum; \code{"rank"} is matched against the
inverted ranking of features in terms of overall frequency, so that 1, 2,
... are the highest and second highest frequency features, and so on;
\code{"quantile"} sets the cutoffs according to the quantiles (see
\code{\link{quantile}}) of term frequencies.}
\item{min_docfreq, max_docfreq}{minimum/maximum values of a feature's document
frequency, below/above which features will be removed}
\item{docfreq_type}{specify how \code{min_docfreq} and \code{max_docfreq} are
interpreted. \code{"count"} is the same as \code{\link{docfreq}(x, scheme
= "count")}; \code{"prop"} divides the document frequencies by the total
sum; \code{"rank"} is matched against the inverted ranking of document
frequency, so that 1, 2, ... are the features with the highest and second
highest document frequencies, and so on; \code{"quantile"} sets the cutoffs
according to the quantiles (see \code{\link{quantile}}) of document
frequencies.}
\item{sparsity}{equivalent to \code{1 - min_docfreq}, included for comparison
with \pkg{tm}}
\item{verbose}{print messages}
\item{...}{not used}
}
\value{
A \link{dfm} reduced in features (with the same number of documents)
}
\description{
Returns a document-feature matrix reduced in size based on document and
term frequency thresholds, usually a minimum frequency, although maximum
frequencies may also be set. Setting a combination of minimum and maximum
frequencies selects features within a range.
Feature selection is implemented by considering features across
all documents: summing their counts for term frequency, or counting the
documents in which they occur for document frequency. Rank and quantile
versions of these are also implemented, for taking the first \eqn{n}
features in descending order of overall counts or document
frequencies, or for setting cutoffs as a quantile of all frequencies.
}
\note{
Trimming a \link{dfm} object is an operation based on the \emph{values}
in the document-feature matrix. To select subsets of a dfm based on the
features themselves (meaning the feature labels from
\code{\link{featnames}}), such as those matching a regular expression or
those appearing in a stopword list, use \code{\link{dfm_select}}.
}
\examples{
(dfmat <- dfm(data_corpus_inaugural[1:5]))
# keep only words occurring >= 10 times and in >= 2 documents
dfm_trim(dfmat, min_termfreq = 10, min_docfreq = 2)
# keep only words occurring >= 10 times and in at least 0.4 of the documents
dfm_trim(dfmat, min_termfreq = 10, min_docfreq = 0.4)
# keep only words occurring <= 10 times and in <= 2 documents
dfm_trim(dfmat, max_termfreq = 10, max_docfreq = 2)
# keep only words occurring <= 10 times and in at most 3/4 of the documents
dfm_trim(dfmat, max_termfreq = 10, max_docfreq = 0.75)
# keep only words occurring at least 5 times in 1000, and in at least 2 of 5 documents
dfm_trim(dfmat, min_docfreq = 0.4, min_termfreq = 0.005, termfreq_type = "prop")
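# an illustrative sketch, not from the original examples: assuming
# min_docfreq is compared against the quantiles of document frequencies
# (as described for docfreq_type), this keeps features whose document
# frequency is at or above the 0.8 quantile
dfm_trim(dfmat, min_docfreq = 0.8, docfreq_type = "quantile")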
# keep only words at or above the 0.2 quantile of term frequency and in <= 2 documents
dfm_trim(dfmat, min_termfreq = 0.2, max_docfreq = 2, termfreq_type = "quantile")
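# a further illustrative sketch: assuming the min/max bounds are compared
# against the inverted ranks described for termfreq_type (1 = most frequent
# feature), this would keep roughly the 20 most frequent features
dfm_trim(dfmat, max_termfreq = 20, termfreq_type = "rank")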
\dontrun{
# compare to removeSparseTerms from the tm package
(dfmattm <- convert(dfmat, "tm"))
tm::removeSparseTerms(dfmattm, 0.7)
dfm_trim(dfmat, min_docfreq = 0.3)
dfm_trim(dfmat, sparsity = 0.7)
}
}
\seealso{
\code{\link{dfm_select}}, \code{\link{dfm_sample}}
}