Section 2 Data comparability

In this script, we process the point-count and acoustic data and create a combined dataset for downstream analysis.

2.1 Load necessary libraries

library(tidyverse)
library(dplyr)
library(stringr)
library(vegan)
library(scico)
library(data.table)
library(extrafont)
library(sf)
library(raster)
library(scales)
library(ggplot2)
library(ggspatial)
library(colorspace)
library(lubridate)
library(naniar)

2.2 Loading point count and acoustic data

# read the raw point-count observations and the acoustic annotations
point_count <- read.csv(file = "data/point-count-data.csv")
acoustic_data <- read.csv(file = "data/acoustic-data.csv")

2.3 Cleaning up point count data

# replace the numeric distance band codes with the distance ranges they
# stand for, stored in a new character column `distance`
point_count <- point_count %>%
  mutate(distance = case_when(
    distance_band == 1 ~ "0-10",
    distance_band == 2 ~ "10-25",
    distance_band == 3 ~ "25-50",
    distance_band == 4 ~ ">50",
    # `x == NA` evaluates to NA (never TRUE), so missing bands have to be
    # detected with is.na(); the typed NA keeps every branch character
    is.na(distance_band) ~ NA_character_
  ))

# parse the month-day-year dates into Date values (printed as YYYY-MM-DD) and
# store the survey start time and the observation time as hms values
point_count <- point_count %>%
  mutate(
    date = mdy(date),
    start_time = hms::as_hms(start_time),
    observation_time = hms::as_hms(observation_time)
  )

# keep only non-incidental observations, and drop non-bird species as well as
# unidentified/NA common names
point_count <- point_count %>%
  filter(
    observation_method != "I",
    !common_name %in% c("Unidentified Woodpecker",
                        "Eastern Chipmunk",
                        "Eastern Gray Squirrel",
                        "Red Squirrel",
                        NA)
  )

# summarise the missing values per column (naniar) to check whether the
# cleaning steps above left any NAs behind
miss_var_summary(point_count)

# above procedure revealed NAs in the observation_time column which are associated with data on Canada Geese from MABI1002 and MABI1001 on 5/22/2023. We will not delete this data
# missing data in the other columns can be ignored, as distance was not recorded for all observations due to variation in survey_observers

2.4 Cleaning up acoustic data

# the begin and end clock times are meant to be in HHMMSS format
# note: alternatively, the begin and end clock times could potentially be
# extracted from the selection tables (if present)
# prefix a zero to both clock time columns
acoustic_data <- acoustic_data %>%
  mutate(
    begin_clock_time = paste0("0", begin_clock_time),
    end_clock_time = paste0("0", end_clock_time)
  )

# convert clock times written as HHMMSS digits into "HH:MM:SS" strings
#
# x: character or numeric vector of clock times such as "063015" or 63015;
#    leading zeros are optional because the value is parsed as a number
# returns: a character vector the same length as x; elements that are missing
#    or cannot be parsed as a number are NA (not the text "NA:NA:NA")
convert_to_time <- function(x) {
  # convert string to numeric
  x <- as.numeric(x)
  hours <- floor(x / 10000)
  minutes <- floor((x - hours * 10000) / 100)
  seconds <- x - hours * 10000 - minutes * 100
  formatted <- sprintf("%02d:%02d:%02d", hours, minutes, seconds)
  # sprintf() renders a missing value as the text "NA"; return a real NA so
  # downstream parsing sees a missing time instead of a malformed string
  formatted[is.na(x)] <- NA_character_
  formatted
}

# reformat both clock time columns as "HH:MM:SS" and store them as hms values
# to keep them consistent with the point_count data
# convert_to_time() works on whole vectors, so it is applied to each column
# directly; going through sapply() would attach the input strings as names
# and return a list() instead of a character vector for empty input
acoustic_data$begin_clock_time <- hms::as_hms(
  convert_to_time(acoustic_data$begin_clock_time)
)
acoustic_data$end_clock_time <- hms::as_hms(
  convert_to_time(acoustic_data$end_clock_time)
)

# parse the year-month-day dates into Date values (YYYY-MM-DD), matching the
# point_count data, and derive the survey year from the parsed date
acoustic_data <- acoustic_data %>%
  mutate(
    date = ymd(date),
    year = lubridate::year(date)
  )

# drop annotations without a common name and those of non-bird species
acoustic_data <- acoustic_data %>%
  filter(
    !is.na(common_name),
    !common_name %in% c("Eastern Chipmunk", "Red Squirrel")
  )

# summarise the missing values per column (naniar) to check whether the
# cleaning steps above left any NAs behind
miss_var_summary(acoustic_data)

# the column 'background' has a large number of NAs, which can be ignored since we will not be using this column in future analysis.

2.5 Creating a combined dataset

# label every row with the dataset it comes from, so the source can still be
# told apart once the two datasets are combined
acoustic_data <- mutate(acoustic_data, data_type = "acoustic_data")
point_count <- mutate(point_count, data_type = "point_count")

# before we combine the two datasets, we want to ensure that both dates and site_id match across the two datasets. In other words, we do not want visits in which either acoustic data or point count data was not collected, since this is a simultaneous comparison

# find dates to remove by checking for mismatches between the two datasets using both site_id and date
dates_to_remove <- anti_join(point_count, acoustic_data,
                             by = c("site_id", "date")) %>%
  dplyr::select(site_id, date) %>%
  distinct()

# remove data from sites and visits from the point count data which do not have corresponding acoustic annotations
point_count <- point_count %>%
  anti_join(dates_to_remove, by = c("site_id", "date"))

# as a sanity check, repeat the above in the other direction to find acoustic
# site/visit annotations that have no corresponding point count data
acoustic_data_to_remove <- anti_join(acoustic_data, point_count,
                                     by = c("site_id", "date")) %>%
  dplyr::select(site_id, date) %>%
  distinct()

# the original author's note records that this check found nothing to remove;
# warn if that ever stops being true, because acoustic_data is not filtered
# here and unmatched visits would otherwise enter the combined dataset silently
if (nrow(acoustic_data_to_remove) > 0) {
  warning(
    nrow(acoustic_data_to_remove),
    " site/date visit(s) in acoustic_data have no matching point_count data",
    call. = FALSE
  )
}

# stack the selected columns of both datasets (columns are picked by position,
# bind_rows() then aligns them by name) and label each row with the name of
# the site that its site_id prefix refers to
datSubset <- bind_rows(
  point_count[, c(1:3, 8, 9, 11, 12, 15, 18:21)],
  acoustic_data[, c(4:9, 15, 16, 18:21)]
) %>%
  mutate(
    site_name = case_when(
      grepl("^ACAD", site_id) ~ "Acadia National Park",
      grepl("^HBEF", site_id) ~ "Hubbard Brook Experimental Forest",
      grepl("^KAWW", site_id) ~ "Katahdin Woods and Waters",
      grepl("^MABI", site_id) ~ "Marsh-Billings-Rockefeller NHP"
    )
  )

# visit_numbers need to be reassigned and sorted based on the earliest date for each site_id
# this is being done as a secondary sanity check to ensure that the same visit_numbers are being assigned to every single site_id and date combination for each data_type. In other words, for MABI1110 for say 2022-05-26 with visit_number 1 for point_count data, we ensure that the visit_number is 1 for the same date and site_id for acoustic data
# note: the numbers agree across data types only when both data types cover
# the same set of dates at a site, since ranking is done per site_id/data_type
datSubset <- datSubset %>%
  group_by(site_id, data_type) %>%
  # arrange() ignores the grouping, so the whole table is ordered by date
  arrange(date) %>%
  # rank the distinct dates within each site_id/data_type group (1 = earliest)
  mutate(visit_number = dense_rank(date)) %>%
  # drop the grouping so later operations on datSubset are not silently
  # carried out per group
  ungroup()

# write the combined dataset to file without row names
# (FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word)
write.csv(datSubset, "results/pooled_pointCount_acoustic_data.csv",
          row.names = FALSE)