forked from quanteda/quanteda
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdfm.Rd
145 lines (124 loc) · 5.89 KB
/
dfm.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm.R
\name{dfm}
\alias{dfm}
\title{Create a document-feature matrix}
\usage{
dfm(x, tolower = TRUE, stem = FALSE, select = NULL, remove = NULL,
dictionary = NULL, thesaurus = NULL, valuetype = c("glob", "regex",
"fixed"), groups = NULL, verbose = quanteda_options("verbose"), ...)
}
\arguments{
\item{x}{character, \link{corpus}, \link{tokens}, or \link{dfm} object}
\item{tolower}{convert all features to lowercase}
\item{stem}{if \code{TRUE}, stem words}
\item{select}{a \link{pattern} of user-supplied features to keep, while
excluding all others. This can be used in lieu of a dictionary if there
are only specific features that a user wishes to keep. To extract only
Twitter usernames, for example, set \code{select = "@*"} and make sure
that \code{remove_twitter = FALSE} as an additional argument passed to
\link{tokens}. Note: \code{select = "^@\\\w+\\\b"} would be the regular
expression version of this matching pattern. The pattern matching type
will be set by \code{valuetype}. See also \code{\link{tokens_remove}}.}
\item{remove}{a \link{pattern} of user-supplied features to ignore, such as
"stop words". To access one possible list (you may supply any list you wish), use
\code{\link{stopwords}()}. The pattern matching type will be set by
\code{valuetype}. See also \code{\link{tokens_select}}. For behaviour of
\code{remove} with \code{ngrams > 1}, see Details.}
\item{dictionary}{a \link{dictionary} object to apply to the tokens when
creating the dfm}
\item{thesaurus}{a \link{dictionary} object that will be applied as if
\code{exclusive = FALSE}. See also \code{\link{tokens_lookup}}. For more
fine-grained control over this and other aspects of converting features
into dictionary/thesaurus keys from pattern matches to values, consider
creating the dfm first, and then applying \code{\link{dfm_lookup}}
separately, or using \code{\link{tokens_lookup}} on the tokenized text
before calling \code{dfm}.}
\item{valuetype}{the type of pattern matching: \code{"glob"} for
"glob"-style wildcard expressions; \code{"regex"} for regular expressions;
or \code{"fixed"} for exact matching. See \link{valuetype} for details.}
\item{groups}{either: a character vector containing the names of document
variables to be used for grouping; or a factor or object that can be
coerced into a factor equal in length or rows to the number of documents.
See \link{groups} for details.}
\item{verbose}{display messages if \code{TRUE}}
\item{...}{additional arguments passed to \link{tokens}; not used when \code{x}
is a \link{dfm}}
}
\value{
a \link{dfm-class} object
}
\description{
Construct a sparse document-feature matrix, from a character, \link{corpus},
\link{tokens}, or even another \link{dfm} object.
}
\details{
The default behaviour for \code{remove}/\code{select} when
constructing ngrams using \code{dfm(x, } \emph{ngrams > 1}\code{)} is to
remove/select any ngram constructed from a matching feature. If you wish
to remove these before constructing ngrams, you will need to first tokenize
the texts, then remove the features to be ignored, then form the ngrams, and
construct the dfm using this modified tokens object. See the code
examples for an illustration.
To select on and match the features of another \link{dfm}, \code{x} must
also be a \link{dfm}.
}
\note{
When \code{x} is a \link{dfm}, \code{groups} provides a convenient and
fast method of combining and refactoring the documents of the dfm according
to the groups.
}
\examples{
## for a corpus
dfmat1 <- corpus_subset(data_corpus_inaugural, Year > 1980)
dfm(dfmat1)
dfm(dfmat1, tolower = FALSE)
# grouping documents by docvars in a corpus
dfm(dfmat1, groups = "President", verbose = TRUE)
# with English stopwords and stemming
dfm(dfmat1, remove = stopwords("english"), stem = TRUE, verbose = TRUE)
# stemming works for both words in ngrams too
dfm("Banking industry", stem = TRUE, ngrams = 2)
# with dictionaries
dfmat2 <- corpus_subset(data_corpus_inaugural, Year > 1900)
dict <- dictionary(list(christmas = c("Christmas", "Santa", "holiday"),
opposition = c("Opposition", "reject", "notincorpus"),
taxing = "taxing",
taxation = "taxation",
taxregex = "tax*",
country = "states"))
dfm(dfmat2, dictionary = dict)
# keeping only stopwords (select retains the matching features)
txt <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with
the newspaper from a boy named Seamus, in his mouth."
corp <- corpus(txt)
# note: "also" is not in the default stopwords("english")
featnames(dfm(corp, select = stopwords("english")))
# for ngrams
featnames(dfm(corp, ngrams = 2, select = stopwords("english"), remove_punct = TRUE))
featnames(dfm(corp, ngrams = 1:2, select = stopwords("english"), remove_punct = TRUE))
# removing stopwords before constructing ngrams
toks1 <- tokens(char_tolower(txt), remove_punct = TRUE)
toks2 <- tokens_remove(toks1, stopwords("english"))
toks3 <- tokens_ngrams(toks2, 2)
featnames(dfm(toks3))
# keep only certain words
dfm(corp, select = "*s") # keep only words ending in "s"
dfm(corp, select = "s$", valuetype = "regex")
# testing Twitter functions
txttweets <- c("My homie @justinbieber #justinbieber shopping in #LA yesterday #beliebers",
"2all the ha8ers including my bro #justinbieber #emabiggestfansjustinbieber",
"Justin Bieber #justinbieber #belieber #fetusjustin #EMABiggestFansJustinBieber")
dfm(txttweets, select = "#*", remove_twitter = FALSE) # keep only hashtags
dfm(txttweets, select = "^#.*$", valuetype = "regex", remove_twitter = FALSE)
# for a dfm
dfmat3 <- dfm(data_corpus_irishbudget2010)
dfmat4 <- dfm(dfmat3,
groups = ifelse(docvars(data_corpus_irishbudget2010, "party") \%in\% c("FF", "Green"),
"Govt", "Opposition"),
tolower = FALSE, verbose = TRUE)
}
\seealso{
\code{\link{dfm_select}}, \link{dfm-class}
}
\keyword{dfm}