forked from quanteda/quanteda
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorpus_sample.Rd
52 lines (47 loc) · 1.98 KB
/
corpus_sample.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/corpus_sample.R
\name{corpus_sample}
\alias{corpus_sample}
\title{Randomly sample documents from a corpus}
\usage{
corpus_sample(x, size = ndoc(x), replace = FALSE, prob = NULL,
by = NULL)
}
\arguments{
\item{x}{a corpus object whose documents will be sampled}
\item{size}{a positive number, the number of documents to select; when used
with groups, the number to select from each group or a vector equal in
length to the number of groups defining the samples to be chosen in each
group category. By defining a size larger than the number of documents, it
is possible to \emph{over}sample groups.}
\item{replace}{Should sampling be with replacement?}
\item{prob}{A vector of probability weights for selecting the documents being
sampled. Cannot be used together with \code{by}.}
\item{by}{a grouping variable for sampling. Useful for resampling
sub-document units such as sentences, for instance by specifying
\code{by = "document"}.}
}
\value{
A corpus object with the number of documents equal to \code{size} (or, when
\code{by} is used, \code{size} documents from each group), drawn from the
corpus \code{x}. The returned corpus object will contain all of the metadata
of the original corpus, and the same document variables for the documents
selected.
}
\description{
Take a random sample of documents of the specified size from a corpus, with
or without replacement. Works just as \code{\link{sample}} does, but applies
to the documents and their associated document-level variables.
}
\examples{
set.seed(2000)
# sampling from a corpus
summary(corpus_sample(data_corpus_inaugural, 5))
summary(corpus_sample(data_corpus_inaugural, 10, replace = TRUE))
# sampling sentences within document
corp <- corpus(c(one = "Sentence one. Sentence two. Third sentence.",
two = "First sentence, doc2. Second sentence, doc2."))
corpsent <- corpus_reshape(corp, to = "sentences")
texts(corpsent)
texts(corpus_sample(corpsent, replace = TRUE, by = "document"))
}
\keyword{corpus}