forked from quanteda/quanteda
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokens_ngrams.Rd
88 lines (80 loc) · 3.47 KB
/
tokens_ngrams.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tokens_ngrams.R
\name{tokens_ngrams}
\alias{tokens_ngrams}
\alias{char_ngrams}
\alias{tokens_skipgrams}
\title{Create ngrams and skipgrams from tokens}
\usage{
tokens_ngrams(x, n = 2L, skip = 0L, concatenator = "_")
char_ngrams(x, n = 2L, skip = 0L, concatenator = "_")
tokens_skipgrams(x, n, skip, concatenator = "_")
}
\arguments{
\item{x}{a tokens object, or a character vector, or a list of characters}
\item{n}{integer vector specifying the number of elements to be concatenated
in each ngram. Each element of this vector will define an \eqn{n} in the
\eqn{n}-gram(s) that are produced.}
\item{skip}{integer vector specifying the adjacency skip size for tokens
forming the ngrams, default is 0 for only immediately neighbouring words.
For \code{tokens_skipgrams}, \code{skip} can be a vector of integers, as the
"classic" approach to forming skip-grams is to set skip = \eqn{k} where
\eqn{k} is the distance for which \eqn{k} or fewer skips are used to
construct the \eqn{n}-gram. Thus a "4-skip-n-gram" defined as \code{skip =
0:4} produces results that include 4 skips, 3 skips, 2 skips, 1 skip, and 0
skips (where 0 skips are typical n-grams formed from adjacent words). See
Guthrie et al. (2006).}
\item{concatenator}{character for combining words, default is \code{_}
(underscore) character}
}
\value{
a tokens object consisting of a list of character vectors of ngrams, one
list element per text, or a character vector if called on a simple
character vector
}
\description{
Create a set of ngrams (tokens in sequence) from already tokenized text
objects, with an optional skip argument to form skipgrams. Both the ngram
length and the skip lengths take vectors of arguments to form multiple
lengths or skips in one pass. Implemented in C++ for efficiency.
}
\details{
Normally, these functions will be called through
\code{\link{tokens}(x, ngrams = , ...)}, but these functions are provided
in case a user wants to perform lower-level ngram construction on tokenized
texts.
\code{\link{tokens_skipgrams}} is a wrapper to \code{\link{tokens_ngrams}}
that requires arguments to be supplied for both \code{n} and \code{skip}.
For \eqn{k}-skip skipgrams, set \code{skip} to \code{0:}\eqn{k}, in order
to conform to the definition of skip-grams found in Guthrie et al. (2006): A
\eqn{k} skip-gram is an ngram which is a superset of all ngrams and each
\eqn{(k-i)} skipgram until \eqn{(k-i)==0} (which includes 0 skip-grams).
}
\note{
\code{char_ngrams} is a convenience wrapper for a (non-list)
vector of characters, so named to be consistent with \pkg{quanteda}'s naming
scheme.
}
\examples{
# ngrams
tokens_ngrams(tokens(c("a b c d e", "c d e f g")), n = 2:3)
toks <- tokens(c(text1 = "the quick brown fox jumped over the lazy dog"))
tokens_ngrams(toks, n = 1:3)
tokens_ngrams(toks, n = c(2,4), concatenator = " ")
tokens_ngrams(toks, n = c(2,4), skip = 1, concatenator = " ")
# on character
char_ngrams(letters[1:3], n = 1:3)
# skipgrams
toks <- tokens("insurgents killed in ongoing fighting")
tokens_skipgrams(toks, n = 2, skip = 0:1, concatenator = " ")
tokens_skipgrams(toks, n = 2, skip = 0:2, concatenator = " ")
tokens_skipgrams(toks, n = 3, skip = 0:2, concatenator = " ")
}
\references{
Guthrie, David, Ben Allison, Wei Liu, Louise Guthrie, and Yorick Wilks. 2006.
"\href{https://aclweb.org/anthology/papers/L/L06/L06-1210/}{A Closer
Look at Skip-Gram Modelling}."
}
\author{
Kohei Watanabe (C++) and Ken Benoit (R)
}