forked from quanteda/quanteda
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokens_lookup.Rd
118 lines (102 loc) · 5.1 KB
/
tokens_lookup.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tokens_lookup.R
\name{tokens_lookup}
\alias{tokens_lookup}
\title{Apply a dictionary to a tokens object}
\usage{
tokens_lookup(x, dictionary, levels = 1:5, valuetype = c("glob",
"regex", "fixed"), case_insensitive = TRUE, capkeys = !exclusive,
exclusive = TRUE, nomatch = NULL, nested_scope = c("key",
"dictionary"), verbose = quanteda_options("verbose"))
}
\arguments{
\item{x}{tokens object to which the dictionary or thesaurus will be applied}
\item{dictionary}{the \link{dictionary}-class object that will be applied to
\code{x}}
\item{levels}{integers specifying the levels of entries in a hierarchical
dictionary that will be applied. The top level is 1, and subsequent levels
describe lower nesting levels. Values may be combined, even if these
levels are not contiguous, e.g. \code{levels = c(1,3)} will collapse the second
level into the first, but record the third level (if present) collapsed
below the first (see examples).}
\item{valuetype}{the type of pattern matching: \code{"glob"} for
"glob"-style wildcard expressions; \code{"regex"} for regular expressions;
or \code{"fixed"} for exact matching. See \link{valuetype} for details.}
\item{case_insensitive}{ignore the case of dictionary values if \code{TRUE}}
\item{capkeys}{if \code{TRUE}, convert dictionary keys to uppercase to distinguish
them from other features}
\item{exclusive}{if \code{TRUE}, remove all features not in dictionary,
otherwise, replace values in dictionary with keys while leaving other
features unaffected}
\item{nomatch}{an optional character naming a new key for tokens that do not
match any dictionary value. If \code{NULL} (default), do not record
unmatched tokens.}
\item{nested_scope}{how to treat matches from different dictionary keys that
are nested. When one value is nested within another, such as "a b" being
nested within "a b c", then \code{tokens_lookup()} will match the longer. When
\code{nested_scope = "key"}, this longer-match priority is applied only
within the key, while \code{"dictionary"} applies it across keys, matching only
the key with the longer pattern, not the matches nested within that longer
pattern from other keys. See Details.}
\item{verbose}{print status messages if \code{TRUE}}
}
\description{
Convert tokens into equivalence classes defined by values of a dictionary
object.
}
\details{
Dictionary values may consist of sequences, and there are different
methods of counting key matches based on values that are nested or that
overlap.
When two different keys in a dictionary are nested matches of one another,
the \code{nested_scope} options provide the choice of matching each key's
values independently (the \code{"key"} option), or just counting the
longest match (the \code{"dictionary"} option). Values that are nested
\emph{within} the same key are always counted as a single match. See the
last example below comparing the \emph{New York} and \emph{New York Times}
for these two different behaviours.
\emph{Overlapping values}, such as \code{"a b"} and \code{"b a"} are
currently always considered as separate matches if they are in different
keys, or as one match if the overlap is within the same key.
\emph{Overlapped}
}
\examples{
toks1 <- tokens(data_corpus_inaugural)
dict1 <- dictionary(list(country = "united states",
law=c("law*", "constitution"),
freedom=c("free*", "libert*")))
dfm(tokens_lookup(toks1, dict1, valuetype = "glob", verbose = TRUE))
dfm(tokens_lookup(toks1, dict1, valuetype = "glob", verbose = TRUE, nomatch = "NONE"))
dict2 <- dictionary(list(country = "united states",
law = c("law", "constitution"),
freedom = c("freedom", "liberty")))
# dfm(applyDictionary(toks1, dict2, valuetype = "fixed"))
dfm(tokens_lookup(toks1, dict2, valuetype = "fixed"))
# hierarchical dictionary example
txt <- c(d1 = "The United States has the Atlantic Ocean and the Pacific Ocean.",
d2 = "Britain and Ireland have the Irish Sea and the English Channel.")
toks2 <- tokens(txt)
dict3 <- dictionary(list(US = list(Countries = c("States"),
oceans = c("Atlantic", "Pacific")),
Europe = list(Countries = c("Britain", "Ireland"),
oceans = list(west = "Irish Sea",
east = "English Channel"))))
tokens_lookup(toks2, dict3, levels = 1)
tokens_lookup(toks2, dict3, levels = 2)
tokens_lookup(toks2, dict3, levels = 1:2)
tokens_lookup(toks2, dict3, levels = 3)
tokens_lookup(toks2, dict3, levels = c(1,3))
tokens_lookup(toks2, dict3, levels = c(2,3))
# show unmatched tokens
tokens_lookup(toks2, dict3, nomatch = "_UNMATCHED")
# nested matching differences
dict4 <- dictionary(list(paper = "New York Times", city = "New York"))
toks4 <- tokens("The New York Times is a New York paper.")
tokens_lookup(toks4, dict4, nested_scope = "key", exclusive = FALSE)
tokens_lookup(toks4, dict4, nested_scope = "dictionary", exclusive = FALSE)
}
\seealso{
\code{\link{tokens_replace}}
}
\keyword{tokens}