% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/textmodel_wordfish.R
\name{textmodel_wordfish}
\alias{textmodel_wordfish}
\title{Wordfish text model}
\usage{
textmodel_wordfish(x, dir = c(1, 2), priors = c(Inf, Inf, 3, 1),
tol = c(1e-06, 1e-08), dispersion = c("poisson", "quasipoisson"),
dispersion_level = c("feature", "overall"), dispersion_floor = 0,
sparse = FALSE, abs_err = FALSE, svd_sparse = TRUE,
residual_floor = 0.5)
}
\arguments{
\item{x}{the dfm on which the model will be fit}
\item{dir}{set global identification by specifying the indexes for a pair of
documents such that \eqn{\hat{\theta}_{dir[1]} < \hat{\theta}_{dir[2]}}.}
\item{priors}{prior precisions for the estimated parameters \eqn{\alpha_i},
\eqn{\psi_j}, \eqn{\beta_j}, and \eqn{\theta_i}, where \eqn{i} indexes
documents and \eqn{j} indexes features}
\item{tol}{tolerances for convergence. The first value is a convergence
threshold for the log-posterior of the model; the second is the tolerance
for the difference in parameter values between iterations of the
conditional maximum likelihood procedure (which conditionally estimates
document-level, then feature-level parameters).}
\item{dispersion}{sets whether the model is fit with a Poisson likelihood
(\code{"poisson"}) or with a quasi-Poisson quasi-likelihood that estimates
a dispersion parameter (\code{"quasipoisson"})}
\item{dispersion_level}{sets the unit level for the dispersion parameter,
options are \code{"feature"} for term-level variances, or \code{"overall"}
for a single dispersion parameter}
\item{dispersion_floor}{constraint for the minimal underdispersion multiplier
in the quasi-Poisson model, used to limit the distorting effect of terms
with very low term or document frequencies that appear to be severely
underdispersed. Default is 0; this only applies when \code{dispersion =
"quasipoisson"}.}
\item{sparse}{specifies whether the dfm should be kept sparse rather than
coerced to a dense matrix. Setting this to \code{TRUE} makes it possible to
handle larger dfm objects (and speeds execution), but it will generate
slightly different results on each run, because the sparse SVD routine used
to initialize the estimates has a stochastic element (see the examples).}
\item{abs_err}{specifies how convergence is assessed}
\item{svd_sparse}{uses SVD to initialize the starting values of \eqn{\theta};
only applies when \code{sparse = TRUE}}
\item{residual_floor}{specifies the threshold for the residual matrix when
computing the SVD; only applies when \code{sparse = TRUE}}
}
\value{
An object of class \code{textmodel_fitted_wordfish}. This is a list
containing: \item{dir}{global identification of the dimension}
\item{theta}{estimated document positions} \item{alpha}{estimated document
fixed effects} \item{beta}{estimated feature marginal effects}
\item{psi}{estimated word fixed effects} \item{docs}{document labels}
\item{features}{feature labels} \item{sigma}{regularization parameter for
betas in Poisson form} \item{ll}{log likelihood at convergence}
\item{se.theta}{standard errors for theta-hats} \item{x}{dfm to which
the model was fit}
}
\description{
Estimate Slapin and Proksch's (2008) "wordfish" Poisson scaling model of
one-dimensional document positions using conditional maximum likelihood.
}
\details{
The returned elements match those of Will Lowe's R implementation of
\code{wordfish} (see the austin package), except that here we have renamed
\code{words} to be \code{features}. (This return list may change.) We have
also followed the practice, begun with Slapin and Proksch's early
implementation of the model, of using a regularization parameter of
\eqn{se(\sigma) = 3}, set through the third element of \code{priors}.
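
For instance (an illustrative sketch, not one of the package's own examples),
the regularization on the \eqn{\beta} parameters can be adjusted through the
third element of \code{priors} while keeping the other elements at their
defaults:
\preformatted{tmod <- textmodel_wordfish(data_dfm_lbgexample, dir = c(1, 5),
                           priors = c(Inf, Inf, 1, 1))}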
}
\note{
In the rare case that the warning "The algorithm did not converge." appears,
removing some documents may solve the problem.
}
\examples{
(tmod1 <- textmodel_wordfish(data_dfm_lbgexample, dir = c(1,5)))
summary(tmod1, n = 10)
coef(tmod1)
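# A sketch (not part of the original examples) of accessing components of the
# fitted object documented in the Value section
head(tmod1$theta)     # estimated document positions
head(tmod1$se.theta)  # their standard errors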
predict(tmod1)
predict(tmod1, se.fit = TRUE)
predict(tmod1, interval = "confidence")
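
# An illustrative sketch (not part of the original examples): reversing the
# dir argument flips the sign of the recovered dimension
tmod1r <- textmodel_wordfish(data_dfm_lbgexample, dir = c(5, 1))
cor(tmod1$theta, tmod1r$theta)  # expected to be close to -1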
\dontrun{
dfmat <- dfm(data_corpus_irishbudget2010)
(tmod2 <- textmodel_wordfish(dfmat, dir = c(6,5)))
(tmod3 <- textmodel_wordfish(dfmat, dir = c(6,5),
    dispersion = "quasipoisson", dispersion_floor = 0))
(tmod4 <- textmodel_wordfish(dfmat, dir = c(6,5),
    dispersion = "quasipoisson", dispersion_floor = .5))
plot(tmod3$phi, tmod4$phi, xlab = "Min underdispersion = 0", ylab = "Min underdispersion = .5",
     xlim = c(0, 1.0), ylim = c(0, 1.0))
plot(tmod3$phi, tmod4$phi, xlab = "Min underdispersion = 0", ylab = "Min underdispersion = .5",
     xlim = c(0, 1.0), ylim = c(0, 1.0), type = "n")
underdispersedTerms <- sample(which(tmod3$phi < 1.0), 5)
which(featnames(dfmat) \%in\% names(topfeatures(dfmat, 20)))
text(tmod3$phi, tmod4$phi, tmod3$features,
     cex = .8, xlim = c(0, 1.0), ylim = c(0, 1.0), col = "grey90")
text(tmod3$phi[underdispersedTerms], tmod4$phi[underdispersedTerms],
     tmod3$features[underdispersedTerms],
     cex = .8, xlim = c(0, 1.0), ylim = c(0, 1.0), col = "black")
if (requireNamespace("austin")) {
tmod5 <- austin::wordfish(quanteda::as.wfm(dfmat), dir = c(6,5))
cor(tmod1$theta, tmod5$theta)
}}
}
\references{
Slapin, J. & Proksch, S.O. (2008).
\href{https://doi.org/10.1111/j.1540-5907.2008.00338.x}{A Scaling Model
for Estimating Time-Series Party Positions from Texts}. \emph{American
Journal of Political Science}, 52(3), 705--722.
Lowe, W. & Benoit, K.R. (2013). \href{http://doi.org/10.1093/pan/mpt002}{Validating
Estimates of Latent Traits from Textual Data Using Human Judgment as a Benchmark}.
\emph{Political Analysis}, 21(3), 298--313.
}
\seealso{
\code{\link{predict.textmodel_wordfish}}
}
\author{
Benjamin Lauderdale, Haiyan Wang, and Kenneth Benoit
}