forked from quanteda/quanteda
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtexts.Rd
81 lines (68 loc) · 2.83 KB
/
texts.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/corpus-methods-quanteda.R
\name{texts}
\alias{texts}
\alias{texts<-}
\alias{as.character.corpus}
\title{Get or assign corpus texts}
\usage{
texts(x, groups = NULL, spacer = "  ")
texts(x) <- value
\method{as.character}{corpus}(x, ...)
}
\arguments{
\item{x}{a \link{corpus} or character object}
\item{groups}{either: a character vector containing the names of document
variables to be used for grouping; or a factor or object that can be
coerced into a factor equal in length or rows to the number of documents.
See \link{groups} for details.}
\item{spacer}{when concatenating texts by using \code{groups}, this will be the
spacing added between texts. (Default is two spaces.)}
\item{value}{character vector of the new texts}
\item{...}{unused}
}
\value{
For \code{texts}, a character vector of the texts in the corpus.

For \code{texts <-}, the corpus with its texts replaced by \code{value}.

\code{as.character(x)} is equivalent to \code{texts(x)}.
}
\description{
Get or replace the texts in a \link{corpus}, with grouping options.
Works for plain character vectors too, if \code{groups} is a factor.
}
\details{
\code{as.character(x)}, where \code{x} is a corpus, is equivalent to
calling \code{texts(x)}.
}
\note{
When \code{groups} is supplied, texts that share the same value of
\code{groups} are concatenated, in no specified order of aggregation.

You are strongly encouraged, as good practice in a text analysis
workflow, \emph{not} to modify the substance of the texts in a corpus.
Rather, this sort of processing is better performed through downstream
operations. For instance, do not lowercase the texts in a corpus, or you
will never be able to recover the original case. Rather, apply
\code{\link{tokens_tolower}} after applying \code{\link{tokens}} to a
corpus, or use the option \code{tolower = TRUE} in \code{\link{dfm}}.
}
\examples{
# character counts of the texts selected by corpus_subset()
nchar(texts(corpus_subset(data_corpus_inaugural, Year < 1806)))
# grouping on a document variable
nchar(texts(corpus_subset(data_corpus_inaugural, Year < 1806), groups = "President"))
# character counts of the first five elements of data_char_ukimmig2010,
# with no grouping applied
nchar(data_char_ukimmig2010[1:5])
# grouping by a factor supplied directly as the groups argument
nchar(texts(data_corpus_inaugural[1:5],
groups = as.factor(data_corpus_inaugural[1:5, "President"])))
# build a two-document corpus containing British spellings
BritCorpus <- corpus(c("We must prioritise honour in our neighbourhood.",
"Aluminium is a valourous metal."))
# replace all texts at once through the assignment form: the three patterns
# are paired with the three replacements (ise to ize, our after n, l or b
# to or, nium to num)
texts(BritCorpus) <-
stringi::stri_replace_all_regex(texts(BritCorpus),
c("ise", "([nlb])our", "nium"),
c("ize", "$1or", "num"),
vectorize_all = FALSE)
# show the corpus texts after the bulk replacement
texts(BritCorpus)
# replace a single text by position, then show the result
texts(BritCorpus)[2] <- "New text number 2."
texts(BritCorpus)
}
\keyword{corpus}