man/corpus_sample.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/corpus_sample.R
\name{corpus_sample}
\alias{corpus_sample}
\title{Randomly sample documents from a corpus}
\usage{
corpus_sample(x, size = ndoc(x), replace = FALSE, prob = NULL,
  by = NULL)
}
\arguments{
\item{x}{a corpus object whose documents will be sampled}

\item{size}{a positive number, the number of documents to select; when used
with groups, either a single number to select from each group, or a vector
equal in length to the number of groups giving the number to select from
each group.  By defining a size larger than the number of documents, it
is possible to \emph{over}sample groups.}

\item{replace}{Should sampling be with replacement?}

\item{prob}{A vector of probability weights for obtaining the documents
being sampled.  Cannot be used together with \code{by}.}

\item{by}{a grouping variable for sampling.  Useful for resampling
sub-document units such as sentences, for instance by specifying
\code{by = "document"}.}
}
\value{
A corpus object containing the sampled documents, drawn from the corpus
  \code{x}.  Without \code{by}, the number of documents equals \code{size};
  with \code{by}, \code{size} applies within each group.  The returned corpus
  object will contain all of the metadata of the original corpus, and the
  same document variables for the documents selected.
}
\description{
Take a random sample of documents of the specified size from a corpus, with
or without replacement.  Works just as \code{\link{sample}} works, applied to
the documents and their associated document-level variables.
}
\examples{
set.seed(2000)
# sampling from a corpus
summary(corpus_sample(data_corpus_inaugural, 5)) 
summary(corpus_sample(data_corpus_inaugural, 10, replace = TRUE))

# sampling sentences within document
corp <- corpus(c(one = "Sentence one.  Sentence two.  Third sentence.",
                      two = "First sentence, doc2.  Second sentence, doc2."))
corpsent <- corpus_reshape(corp, to = "sentences")
texts(corpsent)
texts(corpus_sample(corpsent, replace = TRUE, by = "document"))
}
\keyword{corpus}