##SDM tutorial, for more info go here: https://rspatial.org/raster/sdm/2_sdm_occdata.html
###Install and load packages required for the SDM
install.packages(c('raster', 'rgdal', 'dismo', 'rJava'))
install.packages("raster")
install.packages("rgeos")
install.packages("maptools")
install.packages("usdm")
install.packages("rlang")
install.packages("mapview")
install.packages("sdm")
sdm::installAll() #installs all the packages that the sdm package depends on (called via sdm:: since library(sdm) is loaded further down)
library('raster')
library("rgdal")
library("dismo")
library("raster")
library("rgeos")
library("maptools")
library("usdm")
library("mapview")
library("rlang")
library("sdm")
library("devtools")
##DATA PREPARATION
#Load the data
acaule <- gbif("solanum", "acaule*", geo=FALSE) #extracting data from gbif
#put the required genus and species names in the parentheses
#geo=FALSE gets all records; geo=TRUE returns only records that have geographical coordinates
lynx<-gbif("lynx","rufus*",geo=FALSE)# OR lynx example
data(acaule) # load the acaule data
colnames(acaule) #check column names
acgeo <- subset(acaule, !is.na(lon) & !is.na(lat)) #get data that has coordinates
dim(acgeo)#check how much data with coordinates you have
acgeo[1:4, c(1:5,7:10)] #show rows 1-4 and columns 1-5 and 7-10
##Other places you can get data from
#http://portal.vertnet.org/search
###Create a map
data(wrld_simpl) #load world map
plot(wrld_simpl, xlim=c(-80,70), ylim=c(-85,90), axes=TRUE, col="light yellow") #for world coordinates
plot(wrld_simpl, xlim=c(-63,-55), ylim=c(-28,-19), axes=TRUE, col="light yellow")#for Paraguay coordinates
plot(wrld_simpl, xlim=c(-100,0), ylim=c(-70,20), axes=TRUE, col="light yellow")#for south america coordinates
box() # restore the box around the map
points(acgeo$lon, acgeo$lat, col='orange', pch=20, cex=0.75) #add data coordinates
points(acgeo$lon, acgeo$lat, col='green', cex=0.75) #give colour to the border
?getData #you can download individual country boundaries at better quality
###CORRECT COORDINATES
lonzero = subset(acgeo, lon==0) #find all the data with longitude of zero
dups <- duplicated(lonzero[, 1:10]) #flag duplicates among the longitude-zero records
lonzero <- lonzero[dups, ] #keep only the duplicated records so you can inspect them
dups2 <- duplicated(acgeo[, c('lon', 'lat')]) #detect all the duplicates (i.e in same coordinates) in the whole data
sum(dups2) #count how many duplicated records there are
acg <- acgeo[!dups2, ] #keep the non duplicated records
# this data should only have coordinates in southern Peru, Bolivia and northern Argentina
# (longitude W-E roughly -80 to -57, latitude N-S roughly -22 to 0)
#Therefore, any positive coordinates should be changed to negative
i <- acg$lon > 0 & acg$lat > 0 #creates a data where both coordinates are more than 0
acg$lon[i] <- -1 * acg$lon[i] #changes the longitude of the coordinates that are more than 0 to negative numbers
acg$lat[i] <- -1 * acg$lat[i] #same for latitude (this removes the misplaced records in Pakistan)
#With these removed now we want to get rid of coordinates in Antarctica and Brazil
acg <- acg[acg$lon < -50 & acg$lat > -50, ]
Occurrences.PY<-acgeo[acgeo$country=="Paraguay",] #this keeps only the points recorded in Paraguay
#Note: some records may have no country name; find any records that are missing one
which(is.na(acgeo$country))
#Alternative way to remove duplicates, based only on the coordinate columns
dups.1<-duplicated(Occurrences[,c("lat","lon")])
sum(dups.1) #to see how many duplicates you've got
Occurrences<-Occurrences[!dups.1,] #update your data so it no longer contains duplicates
#Remove fossil and captive records (if there are any); check for other problem records too
Occurrences<-Occurrences[Occurrences$isfossil==0,]
#Double-check the coordinates against the countries they fall in, using a spatial query of the country polygons
library(sp)
coordinates(acg) <- ~lon+lat #set your columns as coordinates
crs(acg) <- crs(wrld_simpl) #crs - coordinate reference system
class(acg) #prints the vector of names of classes "acg" inherits from
class(wrld_simpl)
ovr <- over(acg, wrld_simpl) #this spatially overlays the acg points with the wrld_simpl polygons
head(ovr) #to see the result
cntr <- ovr$NAME #get the variable 'NAME' from the wrld_simpl data.frame
t <- which(is.na(cntr)) # find data with missing country names (the one's that have NA for the country column)
t #shows which records have no country name; if there are none this prints integer(0)
j <- which(cntr != acg$country) #find points where coordinates and country do not match, "!=" means are not equal to
cbind(cntr, acg$country)[j,] # for the mismatches, bind the country names of the polygons and points
#Note: a common cause of mismatches is points that sit on country borders, because wrld_simpl is not very precise
plot(acg) #plot the coordinates with all the cleaning up you've done
plot(wrld_simpl, add=T, border='blue', lwd=2) #"add=T" - this adds the map to the existing plot (i.e the points), "lwd" - line width
points(acg[j, ], col='red', pch=20, cex=2) #add the j coordinates to the map
#"cex"-size of the point, "pch" - type of character http://coleoguy.blogspot.com/2016/06/symbols-and-colors-in-r-pch-argument.html
##To create coordinates for records that have a locality description but no coordinates, use georeferencing
georef <- subset(acaule, (is.na(lon) | is.na(lat)) & ! is.na(locality) ) #"subset" keeps only the records you need: here those with no coordinates but with a locality
dim(georef) #returns the dimensions of georef, i.e. the number of records and columns
# to do the georeferencing itself you need to get a Google API key
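#A minimal, hedged sketch of georeferencing locality strings with dismo::geocode(); the call itself
#needs an internet connection and a Google API key (how the key is supplied can vary by dismo version),
#so it is left commented out; 'cloc' is just an illustrative helper column
georef$cloc <- paste(georef$locality, georef$country, sep=", ") #build one query string per record
#gc <- geocode(georef$cloc[1:3]) #returns candidate lon/lat matches for each locality string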
#To subsample your samples (to not be biased towards certain samples)
r <- raster(acg) #create a RasterLayer with the extent of acgeo (creates squares on the map)
res(r) <- 1 #set the resolution of the cells to (for example) 1 degree
r <- extend(r, extent(r)+1) ## expand (extend) the extent of the RasterLayer a little
acsel <- gridSample(acg, r, n=1) #sample at most one point per grid cell
p <- rasterToPolygons(r) # to see the results and illustrate the method
plot(p, border='gray')
points(acg)
points(acsel, cex=1, col='red', pch='x')
#Alternatively, thin the data spatially with the spThin package; a commented template
#(loc.data is your occurrence data.frame, thin.par the thinning distance in km, reps the number of repetitions):
#library(spThin)
#thin(loc.data = my.occurrences, lat.col = "LAT", long.col = "LONG", spec.col = "SPEC",
#     thin.par = 10, reps = 100, locs.thinned.list.return = FALSE, write.files = TRUE,
#     max.files = 5, out.dir = "thinned/", out.base = "thinned_data",
#     write.log.file = TRUE, log.file = "spatial_thin_log.txt", verbose = TRUE)
##Make sure that all records use the same coordinate reference system
Occur.NAD27a<-Occurrences[Occurrences$geodeticdatum=="NAD27",]
Occur.NAD27b<-Occurrences[Occurrences$geodeticdatum=="North American Datum",]
Occur.NAD27<-rbind(Occur.NAD27a,Occur.NAD27b) #bind the observations together
coordinates(Occur.NAD27)<-~decimallongitude+decimallatitude #declare which columns are the coordinates
projection(Occur.NAD27)<-crs('+proj=longlat +datum=NAD27') #give those coordinates a projection
NAD27.converted<-spTransform(Occur.NAD27,crs('+proj=longlat +datum=WGS84')) #convert the coordinates to WGS84
#Once all the data is cleaned up, reduce it to a data.frame with only the columns you need
acgeo$species1<-1 #add a new column (make sure the name is not already taken) and set every value in it to 1
acgeo<-acgeo[,c('lon','lat','species1')] #choose the columns you want to keep
coordinates(acgeo)<-~lon+lat #to convert this data to spatial dataframe, introduce the columns that has the geodata
class(acgeo) #double-check that it worked, it should say now "SpatialPointsDataFrame", also acgeo should have only one column now
### ABSENCE-PRESENCE
##this allows us to choose random background points, which helps avoid oversampling or undersampling error
files <- list.files(path=paste(system.file(package="dismo"), '/ex',
sep=''), pattern='grd', full.names=TRUE ) #get the file names; pattern='grd' keeps only the files ending in .grd
# sep='' tells paste() to join the path pieces without a separator
mask <- raster(files[1]) # create raster layer
set.seed(1963) #set the random seed so the random sampling below is reproducible (1963 is just the seed, not a sample size)
bg <- randomPoints(mask, 500 ) #select 500 random points
###Plot a map (to inspect the results)
par(mfrow=c(1,1)) #set the plotting layout to a single panel (1 row, 1 column)
plot(!is.na(mask), legend=FALSE) #plot which cells of "mask" have data (TRUE) versus NA (FALSE); legend=FALSE hides the legend
points(bg, cex=0.5) #puts the points on the map
#Note that when your map space is too small the map and points will appear displaced
e <- extent(-80, -53, -39, -22) #this creates limits of the points, creating a red square of results(xmin=-80, xmax=-53, ymin=-39, ymax=-22)
bg2 <- randomPoints(mask, 50, ext=e) # selects 50 random points, using the limits we've set
plot(!is.na(mask), legend=FALSE) #plot the mask again, distinguishing data cells from NA cells
plot(e, add=T, col='red') #add new limits (i.e the red box), "add=T" - adds map to the existing one
points(bg2, cex=0.5) #add the 50 random points we selected; type par() to see the current graphics parameters; "cex" sets point/text size
##Example
file <- paste(system.file(package="dismo"), '/ex/acaule.csv', sep='') #Load acaule example
ac <- read.csv(file)
coordinates(ac) <- ~lon+lat #create coordinates
projection(ac) <- CRS('+proj=longlat +datum=WGS84') #assign a coordinate reference system: longitude/latitude on the WGS84 datum
#together with coordinates() above, this turns the data.frame into a SpatialPointsDataFrame with interpretable coordinates
#Create circles around the occurrence points
x <- circles(ac, d=50000, lonlat=TRUE) #"d" is the circle radius in metres, here 50 km
pol <- polygons(x) #instead of points you now have polygons: the area within 50 km of each point
samp1 <- spsample(pol, 250, type='random', iter=25) #randomly sample the circles
cells <- cellFromXY(mask, samp1) #get the raster cell number of each sampled point
length(cells) #check the number of cells
cells <- unique(cells) #keep only the unique cells
xy <- xyFromCell(mask, cells) #"xyFromCell"-get coordinates of the center of raster cells for a row, column
plot(pol, axes=TRUE)
points(xy, cex=0.75, pch=20, col='blue')
#Select only those cells that have their centers within the circles, using the overlay function
spxy <- SpatialPoints(xy, proj4string=CRS('+proj=longlat +datum=WGS84'))
o <- over(spxy, geometry(x))
xyInside <- xy[!is.na(o), ]
v <- extract(mask, x@polygons, cellnumbers=T)
v <- do.call(rbind, v) # use rbind to combine the elements in list v
v <- unique(v[,1]) # get unique cell numbers from which you could sample
head(v)
m <- mask
m[] <- NA #first set every cell of the copy to NA
m[v] <- 1 #then set the cells that fall inside the circles to 1
plot(m, ext=extent(x@polygons)+1)
plot(x@polygons, add=T)
##ENVIRONMENTAL DATA - pull up maps
##Note: make sure that the spatial extent, resolution, origin, and projection of your layers are the same (a quick check is sketched below, after the stack is built)
#For more go to dismo package: https://cran.r-project.org/web/packages/dismo/dismo.pdf
# OR how it is calculated: https://pubs.usgs.gov/ds/691/ds691.pdf
path <- file.path(system.file(package="dismo"), 'ex') #get the file "ex"
files <- list.files(path, pattern="grd$", full.names=TRUE ) #"choose files that end with grd ($ means ends with)", "full.names" - returns the full path names
predictors <- stack(files) #Stack merges all the files
#This loads the predictor (environmental) variables
predictors
names(predictors)
plot(predictors)
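#A quick hedged check, following the note above, that layers share the same extent, resolution
#and projection; compareRaster() returns FALSE (instead of stopping with an error) when stopiffalse=FALSE
compareRaster(predictors[[1]], predictors[[2]], extent=TRUE, rowcol=TRUE, crs=TRUE, stopiffalse=FALSE)
#if your own layers do not line up, projectRaster(from, to) or resample(from, to) can align them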
###PLOT species environmental data
data(wrld_simpl)
file <- paste(system.file(package="dismo"), "/ex/bradypus.csv", sep="") #load the bradypus occurrence data; the dismo package also ships example bioclimatic layers
bradypus <- read.table(file, header=TRUE, sep=',')
bradypus <- bradypus[,-1] #get rid of the first column with bradypus species names
plot(predictors,1) #1 is the index of the layer you want to plot, in this case annual mean temperature (bio1)
plot(wrld_simpl, add=TRUE)
points(bradypus, col='red') #add the bradypus occurrence points
#For more datasets or higher resolution go to WWF:https://www.worldwildlife.org/pages/conservation-science-data-and-tools
#Or here for climatic data: https://www.worldclim.org/data/index.html
#Creating predictors from bioclimatic variables
climate<-getData('worldclim',download=T,var='bio',res=2.5) #get climate data; var = variables, res = resolution (try 10 for a coarser, faster download)
plot(climate)
obsclim<-extract(climate,Occur) #extract the climate values at each observation point ('Occur' stands for your cleaned occurrence data)
#Create species distribution model using bioclim
bioclim.model<-bioclim(obsclim)
pairs(bioclim.model,pa='p')
#Prediction - creates a map of suitable areas from the bioclim variables you choose to include
predictors<-stack(climate$bio1,climate$bio2,climate$bio3) #add as many layers as you want
predictions<-predict(predictors,bioclim.model)
plot(predictions,xlim=c(-63,-55), ylim=c(-28,-19),axes=T)
##Extract environmental values from the rasters
#(note: 'predictors' here should be the stack of dismo example layers created earlier, which includes 'biome';
#re-run the stack(files) line above if it was overwritten by the climate stack)
presvals <- extract(predictors, bradypus) #predictor (environmental) values at the bradypus presence points
set.seed(0) #set the random seed for reproducibility
backgr <- randomPoints(predictors, 500) #select 500 random points
absvals <- extract(predictors, backgr) #extract numbers for the background points
pb <- c(rep(1, nrow(presvals)), rep(0, nrow(absvals)))
sdmdata <- data.frame(cbind(pb, rbind(presvals, absvals))) #the first column, "pb", indicates whether a row is a presence (1) or a background point (0)
sdmdata[,'biome'] = as.factor(sdmdata[,'biome'])
head(sdmdata) #see the data
tail(sdmdata) #see the data
pairs(sdmdata[,2:5], cex=0.1) #to check for collinearity among the environmental variables
#Save data for the next steps
saveRDS(sdmdata, "sdm.Rds")
saveRDS(presvals, "pvals.Rds")
##CREATE A MAP (+IMPROVE IT)
mapview(acgeo)
#if the projection doesn't work, i.e. the points are not shown where they are supposed to be
proj4string(acgeo) #to check what the projection is
proj4string(acgeo)<-projection(raster())
#raster() without any arguments creates an empty raster with a geographic coordinate system
#projection(x) reads the coordinate reference system of the object x
#so projection(raster()) returns the geographic (lon/lat) coordinate reference system,
#which is the one we need for our sp object
mapview(acgeo)
## MODEL FITTING, PREDICTION, AND EVALUATION
#Calculate the variance inflation factor (VIF) for a set of variables and exclude the highly correlated ones
#When bioclim variables are correlated (e.g. high humidity and rain) they can have an excessively strong influence on the model
#while in reality they carry almost the same information
bio<-raster::getData('worldclim',var='bio',res=10) #download the bioclim layers (an alternative to the earlier getData call)
#Reduce collinearity
v1<-vifstep(bio) #you can choose either this or the vifcor one below. Both are good.
v2<-vifcor(bio,th=0.9) #tests correlation, if greater than 0.9 - need to remove
v1
v2 #check the results; it tells you how many of the input variables have a collinearity problem
biom<-exclude(bio,v2) #then exclude those variables; the nlayers value in the printed dimensions shows how many variables remain
#Plot to check the distribution of the species data
plot(biom[[1]]) #plot only the first layer of the biom stack
points(acgeo,cex=0.5,pch=16)
#Load saved data
sdmdata <- readRDS("sdm.Rds")
presvals <- readRDS("pvals.Rds")
#model fitting
m1 <- glm(pb ~ bio1 + bio5 + bio12, data=sdmdata) #how does the presence/background data relate to bio1, bio5 and bio12
#bio1 = annual mean temperature, bio5 = max temperature of the warmest month, bio12 = annual precipitation (see the WorldClim bioclim definitions)
class(m1) #prints the vector of class names the object inherits from, in this case generalised linear model ("glm") and linear model ("lm")
summary(m1)
m2 = glm(pb ~ ., data=sdmdata) #glm of presence/background against all the other columns ('.' means all remaining variables)
m2 #see data
#bioclim only takes presence data, so presvals is used instead of "sdmdata"
bc <- bioclim(presvals[,c('bio1', 'bio5', 'bio12')])
#extracted presvals (bradypus data on where it is present and environmental data), but only columns bio1, bio5, bio12
bc #show data
pairs(bc) #show the plot matrix
#Model prediction
bio1 = c(40, 150, 200)
bio5 = c(60, 115, 290)
bio12 = c(600, 1600, 1700)
pd = data.frame(cbind(bio1, bio5, bio12))
pd
predict(m1, pd) #m1 - glm model object
predict(bc, pd) #bc - bioclimatic model
response(bc) #creates response plots for each variable
#Create a map of predictors
predictors <- stack(list.files(file.path(system.file(package="dismo"), 'ex'), pattern='grd$', full.names=TRUE ))
#this re-creates the stack of predictor layers so the model can be projected over them
p <- predict(predictors, m1) #predict the glm (m1) across the predictor layers
plot(p) #map of suitability scores; darker cells indicate a higher predicted probability of bradypus presence
#Model evaluation
###Test quality of the prediction
p <- rnorm(50, mean=0.7, sd=0.3) #rnorm - random numbers from normal distribution
#presence has higher values, shows prediction for 50 known cases/locations where the species is present
a <- rnorm(50, mean=0.4, sd=0.4)
#same as presence, but for when the species is absent
par(mfrow=c(1, 2)) #set parameters, 1 row, 2 columns (for the plot)
plot(sort(p), col='red', pch=21) #sort() puts a vector in ascending order
points(sort(a), col='blue', pch=24)
legend(1, 0.95 * max(a,p), c('presence', 'absence'),
pch=c(21,24), col=c('red', 'blue')) #creates a legend
comb <- c(p,a) #combine both data
group <- c(rep('presence', length(p)), rep('absence', length(a))) #rep() repeats a label; length() gives the number of values
boxplot(comb~group, col=c('blue', 'red')) #boxplot of the combined values split by group
##Compute the correlation coefficient and the AUC
group = c(rep(1, length(p)), rep(0, length(a)))
cor.test(comb, group)$estimate
mv <- wilcox.test(p,a)
auc <- as.numeric(mv$statistic) / (length(p) * length(a))
auc
#Another example of testing prediction
e <- evaluate(p=p, a=a)
class(e)
e
par(mfrow=c(1, 2))
density(e)
boxplot(e, col=c('blue', 'red'))
###Bioclimatic example
samp <- sample(nrow(sdmdata), round(0.75 * nrow(sdmdata))) #randomly pick 75% of the rows for training
traindata <- sdmdata[samp,]
traindata <- traindata[traindata[,1] == 1, 2:9]
testdata <- sdmdata[-samp,]
bc <- bioclim(traindata)
e2 <- evaluate(testdata[testdata==1,], testdata[testdata==0,], bc)
e2
plot(e2, 'ROC') #plot the ROC curve for the bioclim evaluation
pres <- sdmdata[sdmdata[,1] == 1, 2:9]
back <- sdmdata[sdmdata[,1] == 0, 2:9]
k <- 5
group <- kfold(pres, k) #partition the data into 5 groups
group[1:10]
unique(group)
##fit and test our model five times:
##in each run, the records in one of the five groups are used only to evaluate the model,
##while the other four groups are used only to fit the model
e <- list() #results will be stored in a list called ‘e’.
for (i in 1:k) {
train <- pres[group != i,]
test <- pres[group == i,]
bc <- bioclim(train)
e[[i]] <- evaluate(p=test, a=back, bc)
}
## Extract the AUC values and the "maximum of the sum of the sensitivity (true positive rate) and specificity (true negative rate)" threshold "spec_sens"
##(this is sometimes used as a threshold for classifying cells as presence or absence).
auc <- sapply(e, function(x){x@auc})
auc
mean(auc)
sapply( e, function(x){ threshold(x)['spec_sens'] } )
##Spatial sorting bias (SSB) example
file <- file.path(system.file(package="dismo"), "ex/bradypus.csv")
bradypus <- read.table(file, header=TRUE, sep=',') #read the data
bradypus <- bradypus[,-1] #get rid of unnecessary columns
presvals <- extract(predictors, bradypus) #extract predictor values at the presence points
set.seed(0)
backgr <- randomPoints(predictors, 500)
nr <- nrow(bradypus)
s <- sample(nr, 0.25 * nr)
pres_train <- bradypus[-s, ]
pres_test <- bradypus[s, ]
nr <- nrow(backgr)
set.seed(9)
s <- sample(nr, 0.25 * nr)
back_train <- backgr[-s, ]
back_test <- backgr[s, ]
sb <- ssb(pres_test, back_test, pres_train)
sb[,1] / sb[,2] #sb[,1] / sb[,2] is an indicator of spatial sorting bias (SSB).
#If there is no SSB this value should be 1, in these data it is close to zero, indicating that SSB is very strong.
#Example of when SSB is removed
i <- pwdSample(pres_test, back_test, pres_train, n=1, tr=0.1)
pres_test_pwd <- pres_test[!is.na(i[,1]), ]
back_test_pwd <- back_test[na.omit(as.vector(i)), ]
sb2 <- ssb(pres_test_pwd, back_test_pwd, pres_train)
sb2[1]/ sb2[2]
bc <- bioclim(predictors, pres_train)
evaluate(bc, p=pres_test, a=back_test, x=predictors)
evaluate(bc, p=pres_test_pwd, a=back_test_pwd, x=predictors) #notice how AUC dropped
##MODELING METHODS
##Method 1 of modelling for PRESENCE ONLY DATA
head(acgeo) #double check that your data only has species column
d<-sdmData(species1~.,acgeo, predictors=biom) #specify the species column (species1) against everything else
#the predictors are the biom layers
d #this will show you that this is presence only data
#Method 2 of modelling for Presence-Absence data
head(acgeo) #double check that your data only has species column
d<-sdmData(species1~., acgeo, predictors=biom,bg=list(n=1000)) #specify the species column (species1) against everything else
#the predictors are the biom layers; "bg=" generates background (pseudo-absence) points
#where 1000 is the number of pseudo-absences; a common rule of thumb is twice the number of presences (see the sketch below)
d #now you can see that it's presence-background data
#Note that you might lose some records here because sdmData removes duplicates and records with missing values
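#Hedged alternative to the hard-coded n=1000 above: the rule of thumb of roughly twice the number
#of presences can be computed from the data itself (left commented out so it does not overwrite 'd')
#n.bg <- 2 * nrow(acgeo)
#d <- sdmData(species1~., acgeo, predictors=biom, bg=list(n=n.bg))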
m<-sdm(species1~., d, methods = c('glm','svm','rf','brt','mars'),
replication=c('boot'),n=2)
#choose the column/dataset name - species1
#we use five different methods to fit the models (glm, svm, rf, brt, mars)
#for bootstrapping, the minimum number of samples is roughly 400 for a p-value of 0.05, and about 15000 for 0.01
m #to see the results
#Look at the AUC values: the higher they are, the higher the probability that the fitted model scores a randomly drawn presence above a randomly drawn background point
#Note that using more pseudo-absences does not make the result more reliable, since these are not real absences
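#Hedged aside: the AUC values mentioned above can also be pulled out of the fitted object directly
getEvaluation(m, stat='AUC')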
#Prediction building
#First option
p<-predict(m,biom,'predictions.img', mean =T)
#predictions - file name in which your data is saved in your working directory
#"mean=T", if you have several rounds for each replication
#(e.g for example we have 1000 for bg - you will get an average over all rounds for each technique)
#Second option
en<-ensemble(m,biom,'ens.img',setting=list(method='weights',stat="AUC"))
#for this function you need to specify some settings:
#the method used; choices are 1. unweighted - take the average over all the methods,
#2. weighted - give higher weights to the methods with higher accuracy; for this you need to specify a statistic, e.g. AUC
#AUC is threshold independent
#OR
en<-ensemble(m,biom,'ens.img',setting=list(method='weights',stat="TSS",opt=2,overwrite=TRUE))
#use TSS
gui(m) #look at the output of the model
#To download variables for the future time
biof<-raster::getData('CMIP5',var='bio',res=10,rcp=85,year=70,model='AC')
#to get future data from worldclim you use 'CMIP5' as the name of the dataset
#res = resolution, rcp = emissions scenario (here RCP 8.5), year = 70 (i.e. 2070), model = the global climate model
names(biof)<-names(bio) #because the default names of biof are more complicated and would not match the names used to fit the model
?getData #to get more info on variations
#Plot that model of the potential current distribution of the species
plot(p[[1]]) #the number stands for the model that you are trying
#in this case 1 = glm, ..., 5 = mars (the order follows the 'methods' argument above)
plot(p[[c(1:5)]])
#Plot the map for the future
pf<-predict(m,biof,'predictionsf.img')
enf<-ensemble(m,biof,'ensf.img',setting=list(method='weights',stat="TSS",opt=2))
plot(stack(en,enf)) #stack current and future to compare
plot (enf) #to see the future
mapview(stack(en,enf)) #you stack the probability for the current time and the future time
#the map is interactive; layer 1 = current, layer 2 = future
#Calculate the changes/difference in probability
changes<-enf-en
plot(changes)
#values above zero are areas that gain suitability,
#values below zero are areas that lose suitability, 0 means no change
##FOR MAP OF PRESENCE-ABSENCE (e.g only 1 and 0 numbers)
#this one is for the current
ev<-getEvaluation(m,stat=c('AUC','TSS','threshold'),opt=2) #get the statistics you want
#opt specifies which threshold-optimisation criterion to use, e.g. 2
mean(ev$threshold) #gives result of the mean of the threshold e.g 0.5
pa<-raster(en) #create an empty raster
pa[]<-ifelse(en[] >=0.5,1,0) #0.5 is the mean threshold calculated above; change it to match your own value
#if the value of en is greater than or equal to that number the cell becomes 1, otherwise 0
plot (pa)
#For future time
paf<-raster(enf) #create an empty raster
paf[]<-ifelse(enf[] >=0.5,1,0) #again use the mean threshold (0.5), as above
#if the value of enf is greater than or equal to that number the cell becomes 1, otherwise 0
plot (paf)
#Compare those two
pac<-paf-pa
cl<-colorRampPalette(c('red','gray','darkgreen')) #set the palette: red = extinction, gray = no change, darkgreen = colonisation
plot(pac,col=cl(3)) #the result is 1, 0 or -1, meaning colonisation, no change, or loss respectively
##Method 3 of modelling
##Types of organisms
##Create data
data(wrld_simpl)
predictors <- stack(list.files(file.path(system.file(package="dismo"), 'ex'), pattern='grd$', full.names=TRUE ))
file <- file.path(system.file(package="dismo"), "ex/bradypus.csv")
bradypus <- read.table(file, header=TRUE, sep=',')
bradypus <- bradypus[,-1]
presvals <- extract(predictors, bradypus)
set.seed(0)
backgr <- randomPoints(predictors, 500)
absvals <- extract(predictors, backgr)
pb <- c(rep(1, nrow(presvals)), rep(0, nrow(absvals)))
sdmdata <- data.frame(cbind(pb, rbind(presvals, absvals)))
sdmdata[,'biome'] <- as.factor(sdmdata[,'biome'])
#Response of the species to each variable (the bio layers)
rcurve(m,id=9) #plots the response curve - shows over which range of each variable the species would increase/decrease, i.e. its range of survival
#id is the number of the model used. Note that the curve does NOT show the importance of the variables themselves
plot(sdm::getVarImp(m,9)) #to see how important different variables(i.e bios) are
#Pre-model data clean up
pred_nf <- dropLayer(predictors, 'biome') #drop categorical values
set.seed(0)
group <- kfold(bradypus, 5) #kfold - partitioning of a data set for model testing purposes, 5 - number of groups
pres_train <- bradypus[group != 1, ] #training set: the presence records not in fold 1
pres_test <- bradypus[group == 1, ] #testing set: the presence records in fold 1
ext <- extent(-90, -32, -33, 23) #restrict the area coordinates
#The first layer in the RasterStack is used as a ‘mask’.
#That ensures that random points only occur within the spatial extent of the rasters,
# and within cells that are not NA, and that there is only a single absence point per cell.
set.seed(10)
backg <- randomPoints(pred_nf, n=1000, ext=ext, extf = 1.25)
#restrict the background points to be within 12.5% of our specified extent ‘ext’
colnames(backg) = c('lon', 'lat')
group <- kfold(backg, 5)
backg_train <- backg[group != 1, ]
backg_test <- backg[group == 1, ]
r <- raster(pred_nf, 1)
plot(!is.na(r), col=c('white', 'light grey'), legend=FALSE) #create a map with white background, gray continents
plot(ext, add=TRUE, col='red', lwd=2) #create limits for the restricted coordinates
points(backg_train, pch='-', cex=0.5, col='yellow') #background training points
points(backg_test, pch='-', cex=0.5, col='black') #background testing points
points(pres_train, pch= '+', col='green') #bradypus presence training points
points(pres_test, pch='+', col='blue') #bradypus presence testing points
##Profile methods:
#Bioclim method - used extensively in SDM; the easiest to use and a good model, but not as good for climate-change projections
bc <- bioclim(pred_nf, pres_train) #we fit a bioclim model simply using the predictors and the occurrence points
plot(bc, a=1, b=2, p=0.85)
e <- evaluate(pres_test, backg_test, bc, pred_nf) #evaluate the model
e
tr <- threshold(e, 'spec_sens') #find a threshold
tr
pb <- predict(pred_nf, bc, ext=ext, progress='') #RasterStack with predictor variables to make a prediction to a RasterLayer
pb
#Plot
par(mfrow=c(1,2)) #2 graphs
plot(pb, main='Bioclim, raw values') #bioclimate data
plot(wrld_simpl, add=TRUE, border='dark grey')
plot(pb > tr, main='presence/absence') #map w/presence and absence
plot(wrld_simpl, add=TRUE, border='dark grey') #create country borders
points(pres_train, pch='+') #present data bradypus coordinates
#Domain method - commonly used, but performs poorly for climate-change projections and in model comparisons
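#A hedged sketch of the Domain method in dismo, mirroring the bioclim workflow above
dm <- domain(pred_nf, pres_train) #fit a Domain model on the continuous predictors and the training presences
e.dm <- evaluate(pres_test, backg_test, dm, pred_nf) #evaluate it with the same test presence/background points
pd <- predict(pred_nf, dm, ext=ext, progress='') #predict suitability over the restricted extent
plot(pd, main='Domain, raw values')
plot(wrld_simpl, add=TRUE, border='dark grey')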