-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhmm_testing.R
389 lines (306 loc) · 14.8 KB
/
hmm_testing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
##### Sera HMM & Activity Budgets #####
library(tidyverse, quietly = TRUE)
library(lubridate, quietly = TRUE)
library(ggplot2, quietly = TRUE)
library(momentuHMM, quietly = TRUE)
library(overlapping, quietly = TRUE)
library(lsmeans, quietly = TRUE)
source('Sera_functions.R')
# Set environment to EAT
Sys.setenv(TZ="Africa/Nairobi")
rawData <- read.csv('./movdata/Sera_datasets_10Mar21.csv')
meta <- read.csv('./movdata/sera_metadata.csv')
rawData <- merge(rawData, meta, by = 'MovDataID')
##### 1. Clean trajectories #####
#' *Prep data and check fixes
#' *Round fixes to nearest hour
#' *Remove individuals with fix rates <95%
## Prep and check data
sera <- data.prep(rawData, collar.data = c('Class', 'releaseDate', 'cohort'))
meta.table <- sera %>% group_by(MovDataID, CalcID, Class, releaseDate, cohort) %>%
tally()
#View(meta.table)
# viz of raw fix times
p <- ggplot(sera, aes(x = Fixtime, y = minute(Fixtime))) + geom_point() + facet_wrap(.~MovDataID) +
ylab("minutes from 0")
p
# Downsample to clean up immobility alerts and recheck result.
sera <- sera %>%
filter(minute(Fixtime) > 55 | minute(Fixtime) < 5 | between(minute(Fixtime), 25, 35))
#Downsample to 1-hour because wild collars are on hourly schedule
sera <- sera %>%
filter(minute(Fixtime) > 55 | minute(Fixtime) < 5)
# check downsample
p <- ggplot(sera, aes(x = Fixtime, y = minute(Fixtime))) + geom_point() + facet_wrap(.~MovDataID) +
ylab("minutes from 0")
p
## Clean Trajectories
# Set burst function -- no relocs within 4 hours creates a seperate burst
foo <- function(dt) {
return(dt > (6*3600))
}
# regularize traj
system.time({
ele.traj <- traj.func(df = sera, tol = 1, fix.units = "hour", regular = TRUE,
infolocs = sera[,c(1,6:8)])
})
# mean fix rate looks good
1 - mean(ele.traj$summary.df$fixRate)
# check output
# head(ele.traj$tracking.df)
##### 2. HMM dataframe #####
#' HMMs are fit using bursts as ID.
#' Filter out bursts with low fix rates and <500 points.
#' We also take the log step length.
library(momentuHMM)
# create burst filter list that meets HMM specs (>500 points in a burst, <5% missing data)
t <- filter.traj(df = ele.traj$summary.df, n.reloc = 500, p.missing = .05)
hmm.filter <- t$df
# all individuals are still included after filtering out bursts
levels(hmm.filter$id)
# check data loss after filtering out bursts - minimal
1-sum(hmm.filter$nb.reloc)/dim(sera)[1]
# create HMM dataframe
sera.hmm <- filter(ele.traj$tracking.df, burst %in% hmm.filter$burst) %>%
rename(ID = burst) # treat bursts as individuals for fitting
#sera.hmm <- subset(sera.hmm, !is.na(sera.hmm$x))
# prep data for HMM
sera.step <- prepData(sera.hmm)
sera.log.step <- sera.step
sera.log.step$step <- log(sera.log.step$step + 0.001) # add constant for zero steps
##### 3. Fit HMMs #####
# state label
stateNames2 <- c("encamped", "exploratory")
stateNames3 <- c("encamped", "meandering", "dirwalk")
# set distributions
# distGam = list(step = "gamma", angle = "vm") # zero-inflated gamma distributions
distNorm = list(step = "norm", angle = "vm") # normal step distribution since we use log step length
# Null 2-state
par2 <- list(step = c(2, 6, 2, 1),
angle = c(.2,.2))
system.time(
mod2 <- fitHMM(data = sera.log.step, nbStates = 2, dist = distNorm,
Par0 = par2,
retryFits = 1,
stateName = stateNames2,
modelName = "twoStep")
)
#' View model summary and diagnostic plots. Some autocorrelation in the step lengths
mod2
plotPR(mod2)
# Null 3-state
par3 <- list(step = c(1, 4, 6, 1.5, 1, 1),
angle = c(.2,.2,.2))
system.time(
mod3 <- fitHMM(data = sera.log.step, nbStates = 3, dist = distNorm,
Par0 = par3,
retryFits = 1,
stateName = stateNames3,
modelName = "threeState null")
)
# save 3-state model
#saveRDS(mod3, 'mod3_noCovs.RDS')
#mod3 <- readRDS('mod3_noCovs.RDS')
#' View model summary for 3-state model. Still some autocorrelation but not as bad as 2-state. Maybe temp-related?
mod3
plotPR(mod3)
#' To Add - Rose Diagram plots for turning angles
##### Activity Budgets #####
#' Using 3-state model, calculate behavioral states
#' Divide data into synchronized 14-day periods based on orphan cohort 1
#' Assess activity budgets using state-level time density
#' Assess activity time budgets (e.g. % time spent in each state over 14-day periods)
# assign states using viterbi algorithm
sera.hmm$viterbi <- (viterbi(mod3))
sera.hmm$state <- sera.hmm$viterbi
levels(sera.hmm$state) <- c('encamped', 'meandering', 'dirwalk')
# create true start and end dates based on collars
sera <- sera %>%
group_by(CalcID) %>%
mutate(StartDate = min(Fixtime), EndDate = max(Fixtime))
# check sample sizes by cohort
sera.hmm %>% group_by(cohort) %>% tally()
##### Activity Budgets - Time Density #####
#' Divide data into synchronized 14-day periods based on orphan cohort 1
#' Assess activity budgets using state-level time density
# split by cohort - 4 = wild
split <- split(sera.hmm, sera.hmm$cohort)
#as.Date(unique(sera.hmm$releaseDate))
release.date <- as.Date(c("2019-05-02", "2019-11-16", "2020-05-28", "2019-05-30"))
# function to assign periods
date_slice <- function(data, fixtime, release.date, slice.length){
# create a date variable
data$Day <- as.Date(fixtime)
# assign slices based on 14 day periods from release date
data$slice <- as.numeric(data$Day - release.date) %/% slice.length
# record start dates of each slice
data <- data %>% group_by(slice) %>%
mutate(slice.start = min(Day))
}
#' Plot activity over 14-day periods - all dates are relative to the first cohort's release date.
#' Activity time density plots are faceted by relative period, showing pre and post release. Immediate use of exploratory state after release
# set period length
per <- 14
cohort.1 <- date_slice(data = split[[1]], fixtime = split[[1]]$date, release.date = release.date[1], slice.length = per)
plot_budget(cohort.1, facet = slice~viterbi, title = 'cohort 1')
cohort.2 <- date_slice(data = split[[2]], fixtime = split[[2]]$date, release.date = release.date[1], slice.length = per)
plot_budget(cohort.2, facet = slice~viterbi, title = 'cohort 2')
cohort.3 <- date_slice(data = split[[3]], fixtime = split[[3]]$date, release.date = release.date[1], slice.length = per)
plot_budget(cohort.3, facet = slice~viterbi, title = 'cohort 3')
cohort.4 <- date_slice(data = split[[4]], fixtime = split[[4]]$date, release.date = release.date[1], slice.length = per)
plot_budget(cohort.4, facet = slice~viterbi, title = 'wild')
##### Overlap Tests #####
#' Test the overlap in density of activity between each cohort and the wild group using the 14-day time slices.
#' Data are filtered to remove pre-release data from orphan cohorts, and to remove time slices with less than 100 relocations.
library(overlapping)
## overlap test function - used in the for loop
overlap.test <- function(df, comp0, comp1, boot.it = 1000, plot = FALSE) {
require(tidyverse)
require(lubridate)
require(overlapping)
d0 <- dplyr::right_join(df, as.data.frame(comp0))
d1 <- dplyr::right_join(df, as.data.frame(comp1))
d <- list(as.numeric(hour(d0$date)), as.numeric(hour(d1$date)))
# overlap and bootstrap test
t <- overlap(d, plot = plot)
boot <- boot.overlap(d, B = boot.it)
boot$OVboot_stats
return(list(d = df, bootstrap = boot))
}
## Prep cohort dataframes
# filter out pre-release data based on release date of each cohort, and split by relative time slice
cohort.1 <- subset(cohort.1, as.Date(cohort.1$date) >= release.date[1])
cohort.1.split <- split(cohort.1, cohort.1$slice)
cohort.2 <- subset(cohort.2, as.Date(cohort.2$date) >= release.date[2])
cohort.2.split <- split(cohort.2, cohort.2$slice)
cohort.3 <- subset(cohort.3, as.Date(cohort.3$date) >= release.date[2])
cohort.3.split <- split(cohort.3, cohort.3$slice)
# combine to single nested list of orphan cohorts
cohort.all.list <- list(cohort.1.split, cohort.2.split, cohort.3.split) # all cohorts after filtering out pre-release data
# **for wild, must repeat the first 14 day period twice to compare with cohort 1's earlier release date
wild.split <- split(cohort.4, cohort.4$slice)
t <- length(wild.split)
wild.split[[t+1]] <- wild.split[[2]]
wild.split[[t+1]]$slice <- 0
wild.split[[1]] <- wild.split[[2]]
wild.split[[1]]$slice <- 1
# Bind back together and check result
cohort.4 <- do.call(rbind, wild.split)
#plot_budget(cohort.4, facet = slice~viterbi, title = 'wild') # check
# **
# filter out slices with low amounts of data
n <- 100
cohort.all.list <- lapply(cohort.all.list, function(x) Filter(function(y) nrow(y) >= n, x))
## Tactic comp loop
comp.all <- list()
OV_results <- list()
for(k in 1:3){ # orphan cohort loop - to be comped always with wild cohort 4
for(i in 1:3){ # viterbi state
for(j in 1:length(cohort.all.list[[k]])){ # slice loop must update for each cohort
#create comps
#create comps -- !! How to reference the correct slice as it iterates through. Will only work for individual cohorts
comp0 <- list(viterbi = as.integer(i), cohort = k, slice = unique(cohort.all.list[[k]][[j]]$slice)) # cohort number
comp1 <- list(viterbi = as.integer(i), cohort = 4, slice = unique(cohort.all.list[[k]][[j]]$slice)) # comp is always wild cohort 4
# run and store overlap output
t <- overlap.test(df = rbind(cohort.all.list[[k]][[j]], cohort.4), comp0 = comp0, comp1 = comp1)
name <- paste(as.integer(i), j,k,4, sep='-') #pattern = viterbi-slice-cohort-wild
q <- quantile(t$bootstrap$OVboot_dist, probs = c(0.05, 0.95))
# store OV stats, CIs, and comp info for sorting
comp.all[[name]] <- t$bootstrap$OVboot_stats
comp.all[[name]]$lwr <- as.numeric(q[1])
comp.all[[name]]$upr <- as.numeric(q[2])
comp.all[[name]]$viterbi <- comp0$viterbi
comp.all[[name]]$cohort <- comp0$cohort
comp.all[[name]]$slice <- comp0$slice
comp.all[[name]]$slice.start <- unique(cohort.all.list[[k]][[j]]$slice.start)
comp.all[[name]]$comp <- paste(k, 4, sep='-')
# store OV bootstrap results
OV_results[[j]] <- t$bootstrap$OVboot_dist
}
}
}
# join results into single dataframe
t <- do.call(rbind, comp.all)
#' Overall the overlap is lower than would be expected if they were behaving similarly.
#' In the ag tactic comparisons, similar tactics were averaging close to or above 90% overlap
t %>% group_by(state) %>% summarise(mean(estOV))
#' Plot the overlap estimates of activity between orphan cohorts and the wild group over time. Not any obvious trends.
#' x-axis corresponds to the 14-day time period. Plots are faceted by cohort comparison and behavioral state
#' Cohort 4 = Wild Group
t$state <- as.factor(t$viterbi)
levels(t$state) <- c('encamped', 'meandering', 'dirwalk')
t$comp <- as.factor(t$comp)
levels(t$comp) <- c('cohort1-wild', 'cohort2-wild', 'cohort3-wild')
ggplot(t, aes(slice, estOV)) + geom_pointrange(aes(ymin = lwr, ymax = upr, color = state), size = .3) + facet_wrap(comp~state) +
xlab('14-day periods') + ylab('Estimated overlap in 24-hour activity')
# # explore specific comps of interest
# k = 1
# i = 1
# j = 44
#
# #check comps
# comp0 <- list(viterbi = as.integer(i), cohort = k, slice = unique(cohort.all.list[[k]][[j]]$slice)) # cohort number
# comp1 <- list(viterbi = as.integer(i), cohort = 4, slice = unique(cohort.all.list[[k]][[j]]$slice)) # comp is always wild
# t <- overlap.test(df = rbind(cohort.all.list[[k]][[j]], cohort.4), comp0 = comp0, comp1 = comp1)
##### Activity Time Budgets #####
#' Plot activity time budgets (% time spent in each state) over 14-day periods
#' Plot absolute difference in time budgets between each cohort the wild group over 14-day periods
library(DescTools)
# ag budget with 95% CIs
ag.budget <- rbind(cohort.1, cohort.2, cohort.3, cohort.4) %>%
group_by(cohort, slice, viterbi) %>% tally() %>%
group_by(cohort, slice) %>%
mutate(prop = MultinomCI(n, conf.level = 0.95, sides = 'two.sided')[,1],
lwr.ci = MultinomCI(n, conf.level = 0.95, sides = 'two.sided')[,2],
upr.ci = MultinomCI(n, conf.level = 0.95, sides = 'two.sided')[,3],
) %>%
mutate(cohort = as.factor(cohort), viterbi = as.factor(viterbi))
# budgets over time
ag.budget$state <- ag.budget$viterbi
levels(ag.budget$state) <- c('encamped', 'meandering', 'dirwalk')
levels(ag.budget$cohort) <- c('cohort 1', 'cohort 2', 'cohort 3', 'wild')
ggplot(ag.budget, aes(slice, y = prop, group = viterbi)) + geom_point(aes(color = viterbi)) +
geom_line(aes(color = viterbi), linetype = 'dashed') +
facet_grid(rows = vars(cohort), cols = vars(viterbi)) +
xlab('14-day period') + ggtitle('activity time budgets for each group (14-day)')
# # plot by each cohort-wild comp
# c.1 <- subset(ag.budget, cohort %in% c('1', '4'))
# c.2 <- subset(ag.budget, cohort %in% c('2', '4'))
# c.3 <- subset(ag.budget, cohort %in% c('3', '4'))
#
# # plot w/ CIs
# ggplot(c.1, aes(x = as.factor(slice), y = prop, group = cohort)) +
# geom_pointrange(aes(ymin = lwr.ci, ymax = upr.ci, color = cohort)) +
# geom_line(aes(color = cohort), linetype = "dashed") +
# facet_wrap(.~viterbi) + ylab("% time spent in state") + xlab("14-day period")
# get differences between movement phases
split <- split(ag.budget, ag.budget$cohort)
wild <- split[[4]]
split[[4]] <- NULL
for(i in 1:3){
split[[i]] <- merge(split[[i]], wild, by = c("slice", 'viterbi'))
split[[i]]$diff <- (split[[i]]$prop.x - split[[i]]$prop.y) # use 'abs' to get absolute difference
split[[i]]$viterbi.y <- NULL
}
diff <- do.call(rbind, split)
diff$state <- diff$viterbi
levels(diff$state) <- c("encamped", "meandering", "dir-walk")
levels(diff$cohort.x) <- c("cohort 1", 'cohort 2', 'cohort 3', 'wild')
#' There is not much temporal pattern here.
#' Columns correspond to behavioral state, rows correspond to each orphan cohort
ggplot(diff, aes(x = slice, y = diff, group = state)) + geom_point(aes(color = state)) +
geom_line(aes(color = state), linetype = "dashed") + geom_hline(yintercept = 0) +
facet_grid(rows = vars(cohort.x), cols = vars(state)) +
xlab('14-day period') + ylab('difference in activity time budget') +
ggtitle('Difference in activity time budgets between cohorts and wild groups (14-day)')
#' Difference between time budgets for each orphan cohort compared to the wild group.
#' Overall, orphans appear to generally spend more time in exploratory and less time in encamped. Amounts of time in meandering are similar
diff %>% group_by(viterbi, cohort.x) %>% summarise(mean(diff))
#' A linear mixed model of percent time by cohort and behavioral state demonstrates that cohort 1 and 2 spent significantly more time
#' in an exploratory state, and significantly less time in encamped.
mod <- lm(prop ~ cohort*viterbi, data = ag.budget)
library(lsmeans)
# want to compare the levels of cohort, within the levels of viterbi state
ls <- lsmeans(mod, pairwise ~ cohort|viterbi, glhargs=list())
#ls
plot(ls[[1]])