forked from quanteda/quanteda
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert.Rd
84 lines (73 loc) · 3.02 KB
/
convert.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/convert.R
\name{convert}
\alias{convert}
\title{Convert a dfm to a non-quanteda format}
\usage{
convert(x, to = c("lda", "tm", "stm", "austin", "topicmodels", "lsa",
"matrix", "data.frame", "tripletlist"), docvars = NULL,
omit_empty = TRUE)
}
\arguments{
\item{x}{a \link{dfm} to be converted}
\item{to}{target conversion format: either the name of the package into whose
document-term matrix representation the dfm will be converted, or the name
of a generic output format:
\describe{ \item{\code{"lda"}}{a list with components "documents" and
"vocab" as needed by the function \link[lda]{lda.collapsed.gibbs.sampler} from the
\pkg{lda} package} \item{\code{"tm"}}{a \link[tm]{DocumentTermMatrix} from
the \pkg{tm} package} \item{\code{"stm"}}{the format for the \pkg{stm}
package} \item{\code{"austin"}}{the \code{wfm} format from the
\pkg{austin} package} \item{\code{"topicmodels"}}{the "dtm" format as
used by the \pkg{topicmodels} package}
\item{\code{"lsa"}}{the "textmatrix" format as
used by the \pkg{lsa} package}
\item{\code{"data.frame"}}{a data.frame where each feature is a variable}
\item{\code{"tripletlist"}}{a named "triplet" format list consisting of
\code{document}, \code{feature}, and \code{frequency}}
}}
\item{docvars}{optional data.frame of document variables used as the
\code{meta} information in conversion to the \pkg{stm} package format.
This aids in selecting only the document variables corresponding to
documents with non-zero counts. Only affects the \code{"stm"} format.}
\item{omit_empty}{logical; if \code{TRUE}, omit empty documents and features
from the converted dfm. This is required for some formats (such as STM)
that do not accept empty documents. Only used when \code{to = "lda"} or
\code{to = "topicmodels"}. For \code{to = "stm"}, \code{omit_empty} is
always \code{TRUE}.}
}
\value{
A converted object determined by the value of \code{to} (see above).
See conversion target package documentation for more detailed descriptions
of the return formats.
}
\description{
Convert a quanteda \link{dfm} object to a format usable by other text
analysis packages. The general function \code{convert} provides easy
conversion from a dfm to the document-term representations used by the
text analysis packages for which conversions are defined.
}
\examples{
corp <- corpus_subset(data_corpus_inaugural, Year > 1970)
dfmat1 <- dfm(corp)
# austin's wfm format
identical(dim(dfmat1), dim(convert(dfmat1, to = "austin")))
# stm package format
stmmat <- convert(dfmat1, to = "stm")
str(stmmat)
# triplet list format
tripletmat <- convert(dfmat1, to = "tripletlist")
str(tripletmat)
# illustrate what happens with zero-length documents
dfmat2 <- dfm(c(punctOnly = "!!!", corp[-1]))
rowSums(dfmat2)
str(convert(dfmat2, to = "stm", docvars = docvars(corp)))
\dontrun{
# tm's DocumentTermMatrix format
tmdfm <- convert(dfmat1, to = "tm")
str(tmdfm)
# topicmodels package format
str(convert(dfmat1, to = "topicmodels"))
# lda package format
str(convert(dfmat1, to = "lda"))
}
}