forked from quanteda/quanteda
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpattern.Rd
66 lines (61 loc) · 2.69 KB
/
pattern.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/quanteda-documentation.R
\name{pattern}
\alias{pattern}
\title{Pattern for feature, token and keyword matching}
\arguments{
\item{pattern}{a character vector, list of character vectors,
\link{dictionary}, or \link{collocations} object. See \link{pattern} for
details.}
}
\description{
Pattern(s) for use in matching features, tokens, and keywords, interpreted
according to the match type specified by \link{valuetype}.
}
\details{
The \code{pattern} argument is a vector of patterns, including
sequences, to match in a target object, whose match type is specified by
\code{\link{valuetype}}. Note that an empty pattern (\code{""}) will match
"padding" in a \link{tokens} object.
\describe{
\item{\code{character}}{A character vector of token patterns to be selected
or removed. Whitespace is not privileged, so in a character vector
whitespace is interpreted literally. If you wish to consider
whitespace-separated elements as sequences of tokens, wrap the argument in
\code{\link{phrase}}. }
\item{\code{list of character objects}}{If the list elements are character
vectors of length 1, then this is equivalent to a character vector. If
a list element is a character vector of length greater than 1, then
matching will treat its elements as a sequence of patterns, equivalent to
wrapping the argument in \code{\link{phrase}}, except for matching to
\link{dfm} features where this does not apply. }
\item{\code{dictionary}}{Values in \link{dictionary} are used as patterns,
for literal matches. Multi-word values are automatically converted into
phrases, so performing selection or compounding using a dictionary is the
same as wrapping the dictionary in \code{\link{phrase}}. }
\item{\code{collocations}}{Collocations objects created from
\code{\link{textstat_collocations}}, which are treated as phrases
automatically.
}
}
}
\examples{
# these are interpreted literally
(patt1 <- c("president", "white house", "house of representatives"))
# as multi-word sequences
phrase(patt1)
# three single-word patterns
(patt2 <- c("president", "white_house", "house_of_representatives"))
phrase(patt2)
# this is equivalent to phrase(patt1)
(patt3 <- list(c("president"), c("white", "house"),
c("house", "of", "representatives")))
# glob expressions can be used
phrase(patt4 <- c("president?", "white house", "house * representatives"))
# this is equivalent to phrase(patt4)
(patt5 <- list(c("president?"), c("white", "house"), c("house", "*", "representatives")))
# dictionary with multi-word matches
(dict1 <- dictionary(list(us = c("president", "white house", "house of representatives"))))
phrase(dict1)
}
\keyword{internal}