R for Spatial Statistics

 

Presence-Only Example with GAMs

Below is a complete, documented example of using R to build a model with GAMs from presence-only data. The example uses the following steps; the same steps could easily be adapted to create other types of models from presence-only data.

  1. Install the required packages (this step only needs to be executed once)
  2. Load the required libraries
  3. Let the user select a file with the sample data and then load it into R
  4. Let the user select a folder with the predictor layers and then load them into a list in R
  5. Plot one of the predictors with the sample data overlaid
  6. Randomly sample the area of the predictors for "background" points and then add them as "absences" to the sample data
  7. Divide the data into "training" and "test" datasets
  8. Run the model on the "training" dataset
  9. Find the "Area-Under-The-Curve" metric for the results
  10. Plot the final predicted surface

The Code

#################################################################################################################
#   Purpose: GEO 599 Big data modeling class - GAMs example with presence-only data
#	Date of most recent modification: 4.24.2013
#	Date: 4.14.2013
#	Authors:  Jose Montero & Jim Graham
#	Language: R
#	Data from ?
#################################################################################################################

############################## load the data and required libraries

install.packages("MapGAM")
install.packages("mgcv")
install.packages("tcltk2")
install.packages("rgdal")
install.packages("dismo")
install.packages("raster")
install.packages("maptools")
install.packages("sp")
install.packages( "ggplot2")
install.packages("rgdal")

require(ROCR)                              # ROCR library, used to calculate AUC
require(ggplot2)                           # ggplot2 library, used to make the final map
require(tcltk2)                            # library used to choose the directory holding the predictor rasters
require(rgdal)                             # rgdal reads and writes spatial data formats and projections
require(dismo)                             # the dismo package has lots of tools for species distribution modeling
require(raster)                            # raster library; it has many tools similar to ArcMap
require(maptools)                          # another useful library for spatial data
require(sp)                                # the core spatial classes library
require(mgcv)                              # GAMs library

FilePath=file.choose()                     # file.choose() opens a dialog to select the CSV file with the sample (presence) data

trees=read.csv(FilePath, header=TRUE)      # load the sample data into a data frame
names(trees)                               # check the names of each attribute in the data frame you loaded
trees$pb=1                                 # add a pb column set to 1 to represent presences
trees$pb                                   # print the new column to confirm it was added
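
# Optional quick checks (a small sketch; this assumes the CSV has Lon and Lat columns,
# which are used later by coordinates(trees) = ~Lon+Lat).
head(trees)                                # look at the first few rows
nrow(trees)                                # number of presence records
summary(trees$Lon); summary(trees$Lat)     # coordinate ranges, to spot obviously bad values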

# Add the rasters
setwd(tk_choose.dir(default="", caption="Select directory")) # select the directory where the .img predictor rasters are
file.list=list.files(pattern="\\.img$")    # list the .img raster files in the directory
rasters=list()                             # create an empty list to hold the raster objects
for(file in file.list)                     # loop over the raster files
{
  print(file)                              # print the file name to check the loop is working
  rasters[[file]]=raster(file)             # load each file as a raster object and add it to the list
}                                          # end of the loop

all.rasters=stack(rasters)                 # create a raster stack (all layers must share the same extent, resolution, and mask)
plot(all.rasters)                          # plot the stack to check that every layer loaded correctly
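
# Check the layer names in the stack; these names must match the predictor names
# used in the gam() formula below (e.g. AnnualMeanTemp, AnnualPrecip, DEM).
names(all.rasters)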

plot(rasters[[1]],                         # plot the first raster in the list
     xlim=c(-125,-114), ylim=c(32,42))     # limit the plot to the study area
points(trees$Lon, trees$Lat,               # overlay the tree points (using the Lon/Lat columns) on the raster
       pch=19, cex=.2, col='black')

############################# Background sampling when absence data is not available

set.seed(100000)                           # set the random seed so the background sample is reproducible
mask=rasters[[1]]                          # use one raster as a mask to limit sampling to the region where we will predict
backG = randomPoints(mask, 10000)          # sample 10,000 background points within the mask

predictors=stack(rasters)                  # make a new stack called predictors (you could also reuse all.rasters from above)
coordinates(trees) = ~Lon+Lat              # convert trees to a spatial object using its Lon/Lat columns
backgrp = extract(predictors, backG)       # extract predictor values at the background points
presv   = extract(predictors, trees)       # extract predictor values at the presence points
backgrp=data.frame(backgrp)                # make a data frame of the extracted background values

pb = c(rep(1, nrow(presv)), rep(0, nrow(backgrp)))     # put pres/abs values together
sdmdata = data.frame(cbind(pb, rbind(presv, backgrp))) # make final data frame for modeling
sdmdata=na.omit(sdmdata)                               # omit NA values
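
# Optional sanity checks on the modeling data frame (a small sketch).
table(sdmdata$pb)                          # counts of presence (1) and background (0) rows
head(sdmdata)                              # predictor values joined to the pb column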

############################## dividing the data into training and test sets
                
index=1:nrow(sdmdata)                                  # row indices of the modeling data frame, used for sampling
test.index=sample(index, trunc(length(index)/3))       # randomly sample one third of the rows as the test set
test.set=sdmdata[test.index,]                          # make the test data frame
train.set=sdmdata[-test.index,]                        # make the training data frame (used by gam() below)
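
# Note: if you re-run only this block (without the set.seed() call above), the split
# will differ each time; for a reproducible split you can set the seed again here, e.g.:
# set.seed(42)
# test.index=sample(index, trunc(length(index)/3))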

############################## GAM model

m1 =gam(pb ~  s(AnnualMeanTemp) +              # GAM model with all predictor variables on it
               s(AnnualPrecip) + 
               s(DEM)  + 
               s(MaxTempWarmestMonth) + 
               s(MinTempColdestMonth),
               data=train.set, 
               family=binomial(link = "logit"), # binomial distribution with logit link function
               gamma=1.4)
        
summary(m1)                                     # summary of the GAM results
plot(m1, pages=1, scale=F, shade=T)             # make partial dependence plots
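
# Optional mgcv diagnostic (a sketch): gam.check() prints basis-dimension checks and
# draws residual plots for the fitted model.
gam.check(m1)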

############################## model validation using AUC

pred.test=predict(m1,test.set,type='response')                  # make predictions for the test set
preds.obs=data.frame(pred.test=pred.test, pb=test.set$pb)       # data frame of predictions vs. observed presence/background
gam1.eval=prediction(preds.obs$pred.test, preds.obs$pb)         # ROCR prediction object (predicted values, observed labels)
performance(gam1.eval, 'auc')@y.values[[1]]                     # extract the AUC value
# gbm.roc.area(preds.obs$pb, preds.obs$pred.test)               # alternative AUC calculation (requires the gbm package)
write.table(preds.obs, file.choose())                           # optional: export the predictions-vs-observations table to inspect it outside R (e.g., in Excel)
# pp=read.delim(file.choose(), header=TRUE)                     # optional: reload the table after cleaning it outside R
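
# Optional (a small sketch): plot the ROC curve itself with ROCR, reusing the
# prediction object created above.
gam1.roc=performance(gam1.eval, "tpr", "fpr")                   # true positive rate vs. false positive rate
plot(gam1.roc)                                                  # the AUC computed above is the area under this curve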

############################### plot predicted raster with ggplot2 library 

preds=predict(predictors,m1,type='response')   # make predictions for the entire grid

Preds = rasterToPoints(preds)                  # convert the prediction raster to points
Preds = data.frame(Preds)                      # make a data frame of the points to work with ggplot2
colnames(Preds) = c("X","Y","probability")     # rename the columns

# make the plot using ggplot2 (ggplot2 has its own grammar of graphics, so we won't go into the details here; see the ggplot2 documentation)
ggplot(NULL, aes(X, Y)) + 
  geom_raster(data = Preds, aes(fill = probability)) + 
  scale_fill_gradientn(name="Probability", colours = topo.colors(100)) +
  guides(fill = guide_colorbar()) +
  scale_x_continuous(name="", limits=c(-125,-114), expand=c(0,0)) +
  scale_y_continuous(name="", limits=c(32,42), expand=c(0,0)) +
  coord_equal()
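
# Optional (a sketch; the file name is just an example): save the ggplot2 map to an
# image file. ggsave() writes the most recently displayed plot by default.
ggsave("gam_prediction_map.png", width=8, height=8, dpi=300)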

writeRaster(preds, file.choose())               # export predicted raster outside R
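
# If you prefer an explicit file name and format over file.choose(), an alternative
# call (the file name here is just an example) is:
# writeRaster(preds, filename="gam_prediction.tif", format="GTiff", overwrite=TRUE)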