forked from quanteda/quanteda
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtextstat_frequency.Rd
94 lines (84 loc) · 3.49 KB
/
textstat_frequency.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/textstat_frequency.R
\name{textstat_frequency}
\alias{textstat_frequency}
\title{Tabulate feature frequencies}
\usage{
textstat_frequency(x, n = NULL, groups = NULL, ties_method = c("min",
"average", "first", "random", "max", "dense"), ...)
}
\arguments{
\item{x}{a \link{dfm} object}
\item{n}{(optional) integer specifying the top \code{n} features to be returned,
within group if \code{groups} is specified}
\item{groups}{either: a character vector containing the names of document
variables to be used for grouping; or a factor or object that can be
coerced into a factor equal in length or rows to the number of documents.
See \link{groups} for details.}
\item{ties_method}{character string specifying how ties are treated. See
\code{\link[data.table]{frank}} for details. Unlike that function,
however, the default is \code{"min"}, so that frequencies of 10, 10, 11
would be ranked 1, 1, 3.}
\item{...}{additional arguments passed to \code{\link{dfm_group}}. This can
be useful in passing \code{force = TRUE}, for instance, if you are grouping a
dfm that has been weighted.}
}
\value{
a data.frame containing the following variables:
\describe{
\item{\code{feature}}{(character) the feature}
\item{\code{frequency}}{count of the feature}
\item{\code{rank}}{rank of the feature, where 1 indicates the greatest
frequency}
\item{\code{docfreq}}{document frequency of the feature, as a count (the
number of documents in which this feature occurred at least once)}
\item{\code{group}}{(only if \code{groups} is specified) the label of the group.
If the features have been grouped, then all counts, ranks, and document
frequencies are within group. If \code{groups} is not specified, the \code{group}
column is omitted from the returned data.frame.}
}
\code{textstat_frequency} returns a data.frame of features and
their term and document frequencies within groups.
}
\description{
Produces counts and document frequency summaries of the features in a
\link{dfm}, optionally grouped by a \link{docvars} variable or other supplied
grouping variable.
}
\examples{
# set the random seed before running the examples
set.seed(20)
# small dfm built from three short texts
dfmat1 <- dfm(c("a a b b c d", "a d d d", "a a a"))
# frequencies over the whole dfm, with the default ties_method
textstat_frequency(dfmat1)
# frequencies within two groups (documents 1 and 3 versus document 2),
# shown with two different ties_method settings
textstat_frequency(dfmat1, groups = c("one", "two", "one"), ties_method = "first")
textstat_frequency(dfmat1, groups = c("one", "two", "one"), ties_method = "dense")
# dfm of the Obama subset of data_corpus_inaugural, with punctuation and
# English stopwords removed
dfmat2 <- corpus_subset(data_corpus_inaugural, President == "Obama") \%>\%
dfm(remove_punct = TRUE, remove = stopwords("english"))
tstat1 <- textstat_frequency(dfmat2)
# show the first 10 rows of the result
head(tstat1, 10)
\donttest{
# plot 20 most frequent words
library("ggplot2")
ggplot(tstat1[1:20, ], aes(x = reorder(feature, frequency), y = frequency)) +
geom_point() +
coord_flip() +
labs(x = NULL, y = "Frequency")
# plot relative frequencies by group
# (documents after 2000 are grouped by President, then weighted with scheme prop)
dfmat3 <- data_corpus_inaugural \%>\%
corpus_subset(Year > 2000) \%>\%
dfm(remove = stopwords("english"), remove_punct = TRUE) \%>\%
dfm_group(groups = "President") \%>\%
dfm_weight(scheme = "prop")
# calculate relative frequency by president
tstat2 <- textstat_frequency(dfmat3, n = 10, groups = "President")
# plot frequencies
# (x is the reversed row index as a factor; tick labels are replaced by the feature names)
ggplot(data = tstat2, aes(x = factor(nrow(tstat2):1), y = frequency)) +
geom_point() +
facet_wrap(~ group, scales = "free") +
coord_flip() +
scale_x_discrete(breaks = nrow(tstat2):1,
labels = tstat2$feature) +
labs(x = NULL, y = "Relative frequency")
}
}
\keyword{plot}