forked from quanteda/quanteda
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokens_compound.Rd
88 lines (75 loc) · 3.77 KB
/
tokens_compound.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tokens_compound.R
\name{tokens_compound}
\alias{tokens_compound}
\title{Convert token sequences into compound tokens}
\usage{
tokens_compound(x, pattern, concatenator = "_", valuetype = c("glob",
"regex", "fixed"), case_insensitive = TRUE, join = TRUE)
}
\arguments{
\item{x}{an input \link{tokens} object}
\item{pattern}{a character vector, list of character vectors,
\link{dictionary}, or \link{collocations} object. See \link{pattern} for
details.}
\item{concatenator}{the concatenation character that will connect the words
making up the multi-word sequences. The default \code{_} is recommended
since it will not be removed during normal cleaning and tokenization (while
nearly all other punctuation characters, at least those in the Unicode
punctuation class [P], will be removed).}
\item{valuetype}{the type of pattern matching: \code{"glob"} for
"glob"-style wildcard expressions; \code{"regex"} for regular expressions;
or \code{"fixed"} for exact matching. See \link{valuetype} for details.}
\item{case_insensitive}{logical; if \code{TRUE}, ignore case when matching.
When \code{pattern} is a \code{collocations} object, case-sensitive operation
is significantly faster than case-insensitive operation.}
\item{join}{logical; if \code{TRUE}, join overlapping compounds into a single
compound; otherwise, form these separately. See examples.}
}
\value{
A \link{tokens} object in which the token sequences matching
\code{pattern} have been replaced by compound "tokens" joined by the
concatenator.
}
\description{
Replace multi-token sequences with a multi-word, or "compound" token. The
resulting compound tokens will represent a phrase or multi-word expression,
concatenated with \code{concatenator} (by default, the "\code{_}" character)
to form a single "token". This ensures that the sequences will be processed
subsequently as single tokens, for instance in constructing a \link{dfm}.
}
\note{
Patterns to be compounded (naturally) consist of multi-word sequences,
and how these are expected in \code{pattern} is very specific. If the
elements to be compounded are supplied as space-delimited elements of a
character vector, wrap the vector in \code{\link{phrase}}. If the elements
to be compounded are separate elements of a character vector, supply it as
a list where each list element is the sequence of character elements.
See the examples below.
}
\examples{
txt <- "The United Kingdom is leaving the European Union."
toks <- tokens(txt, remove_punct = TRUE)
# character vector - not compounded
tokens_compound(toks, c("United", "Kingdom", "European", "Union"))
# elements separated by spaces - not compounded
tokens_compound(toks, c("United Kingdom", "European Union"))
# list of characters - is compounded
tokens_compound(toks, list(c("United", "Kingdom"), c("European", "Union")))
# elements separated by spaces, wrapped in phrase() - is compounded
tokens_compound(toks, phrase(c("United Kingdom", "European Union")))
# supplied as values in a dictionary (same as list) - is compounded
# (keys do not matter)
tokens_compound(toks, dictionary(list(key1 = "United Kingdom",
key2 = "European Union")))
# pattern as dictionaries with glob matches
tokens_compound(toks, dictionary(list(key1 = c("U* K*"))), valuetype = "glob")
# supplied as collocations - is compounded
colls <- tokens("The new European Union is not the old European Union.") \%>\%
textstat_collocations(size = 2, min_count = 1, tolower = FALSE)
tokens_compound(toks, colls, case_insensitive = FALSE)
# note the differences caused by join = FALSE
compounds <- list(c("the", "European"), c("European", "Union"))
tokens_compound(toks, pattern = compounds, join = TRUE)
tokens_compound(toks, pattern = compounds, join = FALSE)
}