man/convert.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/convert.R
\name{convert}
\alias{convert}
\title{Convert a dfm to a non-quanteda format}
\usage{
convert(x, to = c("lda", "tm", "stm", "austin", "topicmodels", "lsa",
  "matrix", "data.frame", "tripletlist"), docvars = NULL,
  omit_empty = TRUE)
}
\arguments{
\item{x}{a \link{dfm} to be converted}

\item{to}{target conversion format: either the name of the package into whose 
document-term matrix representation the dfm will be converted, or the name of 
a generic R format: 
\describe{ \item{\code{"lda"}}{a list with components "documents" and 
"vocab" as needed by the function \link[lda]{lda.collapsed.gibbs.sampler} from the 
\pkg{lda} package} \item{\code{"tm"}}{a \link[tm]{DocumentTermMatrix} from 
the \pkg{tm} package} \item{\code{"stm"}}{the format for the \pkg{stm} 
package} \item{\code{"austin"}}{the \code{wfm} format from the 
\pkg{austin} package} \item{\code{"topicmodels"}}{the "dtm" format as 
used by the \pkg{topicmodels} package} 
\item{\code{"lsa"}}{the "textmatrix" format as 
used by the \pkg{lsa} package}
\item{\code{"matrix"}}{a base R \code{matrix}} 
\item{\code{"data.frame"}}{a data.frame where each feature is a variable} 
\item{\code{"tripletlist"}}{a named "triplet" format list consisting of 
\code{document}, \code{feature}, and \code{frequency}} 
}}

\item{docvars}{optional data.frame of document variables used as the
\code{meta} information in conversion to the \pkg{stm} package format.
This aids in selecting only the document variables that correspond to
documents with non-zero counts.  Only affects the "stm" format.}

\item{omit_empty}{logical; if \code{TRUE}, omit empty documents and features
from the converted dfm. This is required for some formats (such as STM)
that do not accept empty documents.  Only used when \code{to = "lda"} or
\code{to = "topicmodels"}.  For \code{to = "stm"} format, \code{omit_empty} is
always \code{TRUE}.}
}
\value{
A converted object determined by the value of \code{to} (see above). 
  See conversion target package documentation for more detailed descriptions 
  of the return formats.
}
\description{
Convert a quanteda \link{dfm} object to a format useable by other text
analysis packages.  The general function \code{convert} provides easy
conversion from a dfm to the document-term representations used in all other
text analysis packages for which conversions are defined.
}
\examples{
corp <- corpus_subset(data_corpus_inaugural, Year > 1970)
dfmat1 <- dfm(corp)

# austin's wfm format
identical(dim(dfmat1), dim(convert(dfmat1, to = "austin")))

# stm package format
stmmat <- convert(dfmat1, to = "stm")
str(stmmat)

# triplet list format
tripletmat <- convert(dfmat1, to = "tripletlist")
str(tripletmat)

# illustrate what happens with zero-length documents
dfmat2 <- dfm(c(punctOnly = "!!!", corp[-1]))
rowSums(dfmat2)
str(convert(dfmat2, to = "stm", docvars = docvars(corp)))

\dontrun{
# tm's DocumentTermMatrix format
tmdfm <- convert(dfmat1, to = "tm")
str(tmdfm)

# topicmodels package format
str(convert(dfmat1, to = "topicmodels"))

# lda package format
str(convert(dfmat1, to = "lda"))

}
}