Section 4 Processing eBird data

In this script, we will process the community science data across the Eastern and Western Himalayas.

4.1 Load necessary libraries

Code

library(tidyverse)
library(sf)
library(raster)
library(parallel)

4.2 Loading custom functions to process eBird data

Code

#' Read and clean one raw eBird text file
#'
#' Reads only the columns needed for the analysis from a tab-separated eBird
#' download, applies the checklist-level filters used in this study, and
#' derives date-based columns.
#'
#' @param rawpath Path to a tab-separated eBird data file
#'   (e.g. "ebd_BT_relAug-2022.txt").
#' @return An ungrouped data frame with one row per species per checklist
#'   (`group.id`), holding the columns listed in `imp` plus `month`, `year`,
#'   `day`, `week` and `fort`.
readcleanrawdata <- function(rawpath)
{
  # lubridate and the tidyverse are needed below, so fail immediately if absent
  library(lubridate)
  library(tidyverse)
  # cowplot is not used in this function; it is still attached, as before, so
  # the set of attached packages after a call is unchanged
  require(cowplot)

  ## columns to read from the raw file; every other column is skipped
  preimp <- c("COMMON.NAME","OBSERVATION.COUNT",
              "LOCALITY.ID","LOCALITY.TYPE", "STATE", "COUNTRY",
              "LATITUDE","LONGITUDE","OBSERVATION.DATE",
              "TIME.OBSERVATIONS.STARTED","OBSERVER.ID",
              "PROTOCOL.TYPE","DURATION.MINUTES","EFFORT.DISTANCE.KM",
              "REVIEWED","NUMBER.OBSERVERS","ALL.SPECIES.REPORTED",
              "GROUP.IDENTIFIER","SAMPLING.EVENT.IDENTIFIER","APPROVED",
              "CATEGORY")

  ## read the header only, then build a colClasses vector:
  ## "NULL" drops a column, NA lets read.delim guess its type
  header <- read.delim(rawpath, nrows = 1, sep = "\t", header = TRUE,
                       quote = "", stringsAsFactors = FALSE,
                       na.strings = c(""," ",NA))
  col_names <- names(header)
  col_classes <- ifelse(col_names %in% preimp, NA, "NULL")

  raw <- read.delim(rawpath, colClasses = col_classes, sep = "\t",
                    header = TRUE, quote = "", stringsAsFactors = FALSE,
                    na.strings = c(""," ",NA))

  ## columns kept in the output (group.id and no.sp are created below)
  imp <- c("COMMON.NAME","OBSERVATION.COUNT",
           "LOCALITY.ID","LOCALITY.TYPE", "STATE", "COUNTRY",
           "LATITUDE","LONGITUDE","OBSERVATION.DATE",
           "TIME.OBSERVATIONS.STARTED","OBSERVER.ID",
           "PROTOCOL.TYPE","DURATION.MINUTES","EFFORT.DISTANCE.KM",
           "SAMPLING.EVENT.IDENTIFIER",
           "NUMBER.OBSERVERS","ALL.SPECIES.REPORTED","group.id",
           "CATEGORY","no.sp")

  ## cumulative days before the start of each month (non-leap year),
  ## used to turn day-of-month into day-of-year
  cdays <- c(0,31,59,90,120,151,181,212,243,273,304,334)

  cleaned <- raw %>%
    ## keep unreviewed or approved observations
    filter(REVIEWED == 0 | APPROVED == 1) %>%
    ## shared checklists carry a group identifier; otherwise use the
    ## sampling event identifier
    mutate(group.id = ifelse(is.na(GROUP.IDENTIFIER),
                             SAMPLING.EVENT.IDENTIFIER,
                             GROUP.IDENTIFIER)) %>%
    ## complete checklists from stationary or traveling protocols only
    filter(ALL.SPECIES.REPORTED == 1) %>%
    filter(PROTOCOL.TYPE %in% c("Stationary", "Traveling")) %>%
    ## effort limits: distance (missing distance allowed), duration, observers
    filter(EFFORT.DISTANCE.KM <= 2.5 | is.na(EFFORT.DISTANCE.KM)) %>%
    filter(DURATION.MINUTES <= 120) %>%
    filter(NUMBER.OBSERVERS <= 10) %>%
    ## checklists started strictly between 04:00 and 19:00
    mutate(Time = hms(TIME.OBSERVATIONS.STARTED)) %>%
    filter(Time > hms("4:00:00") & Time < hms("19:00:00")) %>%
    ## one row per species per checklist (removes repeats)
    group_by(group.id, COMMON.NAME) %>%
    slice(1) %>%
    ungroup() %>%
    ## number of species reported on each checklist
    group_by(group.id) %>%
    mutate(no.sp = n_distinct(COMMON.NAME)) %>%
    ungroup() %>%
    dplyr::select(dplyr::all_of(imp)) %>%
    ## date parts: day is day-of-year, fort is the 14-day period of the year
    mutate(OBSERVATION.DATE = as.Date(OBSERVATION.DATE),
           month = month(OBSERVATION.DATE), year = year(OBSERVATION.DATE),
           day = day(OBSERVATION.DATE) + cdays[month],
           week = week(OBSERVATION.DATE),
           fort = ceiling(day/14)) %>%
    filter(year > 2010)

  cleaned
}

4.3 Use the function written above to extract eBird data

Code

# Please download the latest version of the eBird data from https://ebird.org/data/download and set the file paths accordingly. Because these datasets are extremely large, we have not uploaded them to GitHub.

# In this study, the latest version of the data corresponds to August 31st 2022

# extract data for the following list of countries

Bhutan <- readcleanrawdata("ebd_BT_relAug-2022.txt")

# State-level eBird files for India; each is read and cleaned separately,
# then stacked row-wise in the order listed
india_files <- c(
  "ebd_IN-JK_relAug-2022.txt",
  "ebd_IN-LA_relAug-2022.txt",
  "ebd_IN-HP_relAug-2022.txt",
  "ebd_IN-AR_relAug-2022.txt",
  "ebd_IN-WB_relAug-2022.txt",
  "ebd_IN-UL_relAug-2022.txt",
  "ebd_IN-SK_relAug-2022.txt"
)
India <- do.call(rbind, lapply(india_files, readcleanrawdata))

## Removing non himalayan regions
India <- India %>%
  dplyr::filter(LATITUDE > 26, LONGITUDE < 100)

# Drop any grouping carried over from the cleaning step so that the verbs
# below are evaluated over all rows rather than per checklist
dat <- dplyr::ungroup(rbind(India, Bhutan))

# Keep only unique locations used

# dplyr verbs are namespace-qualified because raster is attached after the
# tidyverse and masks select()
datll <- dat %>%
  dplyr::filter(month %in% c(1, 2, 5, 6, 7, 8, 12)) %>%
  dplyr::distinct(LATITUDE, LONGITUDE, .keep_all = TRUE) %>%
  dplyr::select(LOCALITY.ID, LATITUDE, LONGITUDE)
write.csv(datll, "results/unique_loc.csv", row.names = FALSE)

4.4 Extract elevation at unique locations

Code

# Convert the observations to an sf point layer (WGS84, EPSG:4326).
# remove = FALSE keeps the LONGITUDE and LATITUDE columns alongside the geometry.
dat <- st_as_sf(dat, coords = c("LONGITUDE", "LATITUDE"), crs = 4326,
                remove = FALSE)

# Loading the elevation data
elev <- raster("data/elevation/alt")

# extract the elevation raster value at every point in dat
elevDat <- raster::extract(elev, dat)

# cbind elevation back to dataframe (the new column is named elevDat)
dat <- cbind(dat, elevDat)

# save Rdata file (uploaded to GitHub)
save(dat, file = "results/eBird_elev.RData")