man/texts.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/corpus-methods-quanteda.R
\name{texts}
\alias{texts}
\alias{texts<-}
\alias{as.character.corpus}
\title{Get or assign corpus texts}
\usage{
texts(x, groups = NULL, spacer = "  ")

texts(x) <- value

\method{as.character}{corpus}(x, ...)
}
\arguments{
\item{x}{a \link{corpus} or character object}

\item{groups}{either: a character vector containing the names of document 
variables to be used for grouping; or a factor or object that can be 
coerced into a factor equal in length or rows to the number of documents. 
See \link{groups} for details.}

\item{spacer}{when concatenating texts by using \code{groups}, this will be the 
spacing added between texts.  (Default is two spaces.)}

\item{value}{character vector of the new texts}

\item{...}{unused}
}
\value{
For \code{texts}, a character vector of the texts in the corpus.
  
  For \code{texts<-}, the corpus with its texts replaced by \code{value}.

\code{as.character(x)} is equivalent to \code{texts(x)}.
}
\description{
Get or replace the texts in a \link{corpus}, with grouping options. 
Works for plain character vectors too, if \code{groups} is a factor.
}
\details{
\code{as.character(x)}, where \code{x} is a corpus, is equivalent to
calling \code{texts(x)}.
}
\note{
When \code{groups} is supplied, texts that share the same value of
\code{groups} are concatenated, without any specified order of aggregation.

As a matter of good text analysis practice, you are strongly encouraged 
  \emph{not} to modify the substance of the texts in a corpus. 
  Rather, this sort of processing is better performed through downstream 
  operations.  For instance, do not lowercase the texts in a corpus, or you 
  will never be able to recover the original case.  Rather, apply 
  \code{\link{tokens_tolower}} after applying \code{\link{tokens}} to a
  corpus, or use the option \code{tolower = TRUE} in \code{\link{dfm}}.
}
\examples{
# number of characters in each text of the inaugural corpus subset (Year < 1806)
nchar(texts(corpus_subset(data_corpus_inaugural, Year < 1806)))

# grouping on a document variable
# (texts sharing a value of "President" are concatenated before counting)
nchar(texts(corpus_subset(data_corpus_inaugural, Year < 1806), groups = "President"))

# grouping a character vector using a factor
# ungrouped character counts of data_char_ukimmig2010[1:5]
nchar(data_char_ukimmig2010[1:5])
# character counts after grouping by a factor built from
# data_corpus_inaugural[1:5, "President"]
nchar(texts(data_corpus_inaugural[1:5], 
            groups = as.factor(data_corpus_inaugural[1:5, "President"])))

# replacing all texts of a corpus: build a two-document corpus, then apply
# three regex substitutions ("ise" to "ize", "([nlb])our" to "$1or",
# "nium" to "num") and assign the result back with the texts replacement form
# vectorize_all = FALSE presumably applies every pattern to every text in turn;
# see the stringi documentation to confirm
BritCorpus <- corpus(c("We must prioritise honour in our neighbourhood.", 
                       "Aluminium is a valourous metal."))
texts(BritCorpus) <- 
    stringi::stri_replace_all_regex(texts(BritCorpus),
                                   c("ise", "([nlb])our", "nium"),
                                   c("ize", "$1or", "num"),
                                   vectorize_all = FALSE)
# show the texts after the substitutions
texts(BritCorpus)
# replace only the second text, then show the result
texts(BritCorpus)[2] <- "New text number 2."
texts(BritCorpus)
}
\keyword{corpus}