man/dfm.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm.R
\name{dfm}
\alias{dfm}
\title{Create a document-feature matrix}
\usage{
dfm(x, tolower = TRUE, stem = FALSE, select = NULL, remove = NULL,
  dictionary = NULL, thesaurus = NULL, valuetype = c("glob", "regex",
  "fixed"), groups = NULL, verbose = quanteda_options("verbose"), ...)
}
\arguments{
\item{x}{character, \link{corpus}, \link{tokens}, or \link{dfm} object}

\item{tolower}{convert all features to lowercase}

\item{stem}{if \code{TRUE}, stem words}

\item{select}{a \link{pattern} of user-supplied features to keep, while
excluding all others.  This can be used in lieu of a dictionary if there
are only specific features that a user wishes to keep. To extract only
Twitter usernames, for example, set \code{select = "@*"} and make sure
that \code{remove_twitter = FALSE} as an additional argument passed to
\link{tokens}.  Note: \code{select = "^@\\\w+\\\b"} would be the regular
expression version of this matching pattern.  The pattern matching type
will be set by \code{valuetype}.  See also \code{\link{tokens_select}}.}

\item{remove}{a \link{pattern} of user-supplied features to ignore, such as
"stop words".  To access one possible list (you may supply any list you wish), use
\code{\link{stopwords}()}.  The pattern matching type will be set by
\code{valuetype}.  See also \code{\link{tokens_remove}}.  For behaviour of
\code{remove} with \code{ngrams > 1}, see Details.}

\item{dictionary}{a \link{dictionary} object to apply to the tokens when
creating the dfm}

\item{thesaurus}{a \link{dictionary} object that will be applied as if
\code{exclusive = FALSE}. See also \code{\link{tokens_lookup}}.  For more
fine-grained control over this and other aspects of converting features
into dictionary/thesaurus keys from pattern matches to values, consider
creating the dfm first, and then applying \code{\link{dfm_lookup}}
separately, or using \code{\link{tokens_lookup}} on the tokenized text
before calling \code{dfm}.}

\item{valuetype}{the type of pattern matching: \code{"glob"} for 
"glob"-style wildcard expressions; \code{"regex"} for regular expressions;
or \code{"fixed"} for exact matching. See \link{valuetype} for details.}

\item{groups}{either: a character vector containing the names of document 
variables to be used for grouping; or a factor or object that can be 
coerced into a factor equal in length or rows to the number of documents. 
See \link{groups} for details.}

\item{verbose}{display messages if \code{TRUE}}

\item{...}{additional arguments passed to \link{tokens}; not used when \code{x}
is a \link{dfm}}
}
\value{
a \link{dfm-class} object
}
\description{
Construct a sparse document-feature matrix from a character, \link{corpus},
\link{tokens}, or even another \link{dfm} object.
}
\details{
The default behaviour for \code{remove}/\code{select} when
  constructing ngrams using \code{dfm(x, } \emph{ngrams > 1}\code{)} is to
  remove/select any ngram constructed from a matching feature.  If you wish
  to remove the matching features before constructing ngrams, you will need
  to first tokenize the texts, then remove the features to be ignored, then
  form the ngrams from the remaining tokens, and finally construct the dfm
  using this modified tokenization object.  See the code examples for an
  illustration.

  To select on and match the features of another \link{dfm}, \code{x} must
  also be a \link{dfm}.
}
\note{
When \code{x} is a \link{dfm}, \code{groups} provides a convenient and
  fast method of combining and refactoring the documents of the dfm according
  to the groups.
}
\examples{
## from a corpus
corp1 <- corpus_subset(data_corpus_inaugural, Year > 1980)
dfm(corp1)
dfm(corp1, tolower = FALSE)

# grouping documents by a document variable of the corpus
dfm(corp1, groups = "President", verbose = TRUE)

# removing English stopwords and stemming the remaining features
dfm(corp1, remove = stopwords("english"), stem = TRUE, verbose = TRUE)
# stemming is applied to both words of an ngram too
dfm("Banking industry", stem = TRUE, ngrams = 2)

# applying a dictionary while creating the dfm
corp2 <- corpus_subset(data_corpus_inaugural, Year > 1900)
dict <- dictionary(list(christmas = c("Christmas", "Santa", "holiday"),
               opposition = c("Opposition", "reject", "notincorpus"),
               taxing = "taxing",
               taxation = "taxation",
               taxregex = "tax*",
               country = "states"))
dfm(corp2, dictionary = dict)


# removing stopwords
txt <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with
             the newspaper from a boy named Seamus, in his mouth."
corp3 <- corpus(txt)
# note: "also" is not in the default stopwords("english"), so it is kept
featnames(dfm(corp3, remove = stopwords("english")))
# for ngrams: any ngram constructed from a stopword is removed (see Details)
featnames(dfm(corp3, ngrams = 2, remove = stopwords("english"), remove_punct = TRUE))
featnames(dfm(corp3, ngrams = 1:2, remove = stopwords("english"), remove_punct = TRUE))

# removing stopwords before constructing ngrams:
# tokenize, drop the stopwords, form the ngrams, then build the dfm
toks1 <- tokens(char_tolower(txt), remove_punct = TRUE)
toks2 <- tokens_remove(toks1, stopwords("english"))
toks3 <- tokens_ngrams(toks2, 2)
featnames(dfm(toks3))

# keep only certain words
dfm(corp3, select = "*s")  # glob pattern: keep only words ending in "s"
dfm(corp3, select = "s$", valuetype = "regex")  # the same selection as a regex

# selecting Twitter hashtags
txttweets <- c("My homie @justinbieber #justinbieber shopping in #LA yesterday #beliebers",
                "2all the ha8ers including my bro #justinbieber #emabiggestfansjustinbieber",
                "Justin Bieber #justinbieber #belieber #fetusjustin #EMABiggestFansJustinBieber")
dfm(txttweets, select = "#*", remove_twitter = FALSE)  # keep only hashtags
dfm(txttweets, select = "^#.*$", valuetype = "regex", remove_twitter = FALSE)

# from an existing dfm: regroup its documents into two groups by party
dfmat3 <- dfm(data_corpus_irishbudget2010)
dfmat4 <- dfm(dfmat3,
            groups = ifelse(docvars(data_corpus_irishbudget2010, "party") \%in\% c("FF", "Green"),
                            "Govt", "Opposition"),
            tolower = FALSE, verbose = TRUE)

}
\seealso{
\code{\link{dfm_select}}, \link{dfm-class}
}
\keyword{dfm}