Section 3 Subsetting data and running exploratory data analyses

In this script, we will compute species richness, analyze data by time of data and subset data by including only species that occur across x% of sites for further analyses.

3.1 Load necessary libraries

library(tidyverse)
library(dplyr)
library(stringr)
library(vegan)
library(scico)
library(data.table)
library(extrafont)
library(sf)
library(raster)
library(scales)
library(ggplot2)
library(ggspatial)
library(colorspace)
library(lubridate)
library(naniar)
library(ggstatsplot)
library(MetBrewer)
library(gridExtra)
library(grid)

3.2 Load dataframe containing point count and acoustic data

datSubset <- read.csv("results/pooled_pointCount_acoustic_data.csv")

3.3 Estimate total abundance and richness for point count and acoustic data

## point-count data
# estimate total abundance across all species for each site
# note: this analysis ignores visit_level information and pools overall abundance across visits per year to a site
abundance <- datSubset %>%
  filter(data_type == "point_count") %>%
  group_by(site_id, site_name, year, scientific_name,
           common_name) %>% 
  summarise(total_abundance = sum(abundance)) %>%
  ungroup()

# Red Crossbill in Acadia National Park (ACAD3403) in year 2023 and Black-throated Green Warbler in year 2022 in Acadia National Park (ACAD3504) reported highest values of abundance (at n= 19 and n = 16, respectively)

# estimate richness for point_count data at each site
pc_richness <- abundance %>%
  mutate(forRichness = case_when(total_abundance > 0 ~ 1)) %>%
  group_by(site_id, site_name, year) %>%
  summarise(richness = sum(forRichness)) %>%
  mutate(data_type = "point_count") %>%
  ungroup()

# while this analysis is not particularly informative, it does tell us that KAWW5401 in year 2023 reported highest species richness (value=30)

# please note that both the calculations above discount the fact that there is an uneven number of visits to each site across years

# estimate total number of detections using the manual annotations of the acoustic data
# we ignore visit_level information and pool total detections at each site per year for this analysis
detections <- datSubset %>%
  filter(data_type == "acoustic_data") %>%
  group_by(site_id, year, site_name, scientific_name, common_name) %>% summarise(total_detections = n()) %>%
  ungroup()

# while this information is pooled across visits, the analysis reported highest detections for the Chestnut-sided Warbler at MABI1103 in Marsh-Billings in 2023 (n= 256 detections)

# estimate richness for acoustic data
aru_richness <- detections %>%
  mutate(forRichness = case_when(total_detections > 0 ~ 1)) %>%
  group_by(site_id, site_name, year) %>%
  summarise(richness = sum(forRichness)) %>%
  mutate(data_type = "acoustic_data") %>%
  ungroup()

# while this analysis is not particularly informative, it does tell us that MABI1102 in year 2022 reported highest species richness (value= 23)

# how many species were detected in a point count but not acoustic data?
pc_not_aru <- unique(anti_join(abundance[,5], detections[,5]))

# Nine species were detected only in point count data but not in the acoustic data and can be ignored for downstream analysis: Herring Gull, Northern Cardinal, American Goshawk, Swamp Sparrow, Red-bellied Woodpecker, Pine Siskin, Savannah Sparrow, Eastern Kingbird, Brown-headed Cowbird

# how many species were detected in acoustic data but not point counts?
aru_not_pc <- unique(anti_join(detections[,5], abundance[,5]))

# Six species were detected only in acoustic data and not in the point count data and can also be ignored for downstream analysis: Cape May Warbler, Gray Catbird, Mallard, Prairie Warbler, House Wren, Bobolink

3.4 Creating a single dataframe prior to running statistical analysis

## filter out species that were only detected in one approach and not in the other
point_count <- datSubset %>%
  filter(data_type == "point_count") %>%
  filter(!common_name %in% c("Herring Gull", 
                             "Northern Cardinal", "American Goshawk", 
                             "Swamp Sparrow", "Red-bellied Woodpecker", 
                             "Pine Siskin",
                             "Savannah Sparrow", "Eastern Kingbird", 
                             "Brown-headed Cowbird"
                             ))

acoustic_data <- datSubset %>%
  filter(data_type == "acoustic_data") %>%
  filter(!common_name %in% c("Cape May Warbler", "Gray Catbird", 
                             "Mallard", "Prairie Warbler", 
                             "House Wren", "Bobolink"
                             ))

# estimate abundance for each species for each site/visit combination
# ignore distance for now as we can model it in future analysis
abundance <- point_count %>%
  group_by(site_id, site_name, year, start_time, visit_number,
           scientific_name,
           common_name) %>% 
  summarise(abundance = sum(abundance)) %>%
  ungroup()

# estimate detections for each species for each site and year combination using the manual annotations of the acoustic data
detections <- acoustic_data %>%
  group_by(site_id, site_name, year, visit_number,
           scientific_name,
           common_name) %>% 
  summarise(detections = n()) %>%
  ungroup()

## sanity check to ensure that we are not including any species in either data type by mistake
pc_not_aru <- unique(anti_join(abundance[,7], 
                               detections[,6]))
aru_not_pc <- unique(anti_join(detections[,6], 
                               abundance[,7]))

3.5 Filtering criteria for species inclusion

How do we decide on the number of species to include or what species to keep? One criteria that was employed in Ramesh et al. 2022 (Ecography) was: “we kept only those species that occurred in at least 5% of all checklists across at least 27 unique grid cells (50% of the study area).” In a similar vein, we keep species occurring in atleast 15% of sites across both years for both acoustic data and point count data. While the percentage is arbitrary, this number reflects an objective approach/criteria to filter/keep species.

# add a total number of unique sites
# 104 unique sites
total_sites <- length(unique(abundance$site_id)) 

# how many sites was each species observed in when carrying out point counts and when carrying out manual acoustic annotations/detections?
species_year_pc <-  abundance %>%
  group_by(common_name, year) %>%
  summarise(n_sites = n_distinct(site_id), .groups = 'drop') %>%
  arrange(desc(n_sites)) %>%
   mutate(total_sites,
         prop_sites = n_sites/total_sites) # proportion of sites

species_year_aru <- detections %>%
  group_by(common_name, year) %>%
  summarise(n_sites = n_distinct(site_id), .groups = 'drop') %>%
  arrange(desc(n_sites)) %>%
  mutate(total_sites,
         prop_sites = n_sites/total_sites) # proportion of sites

# we choose to keep species that occurred in atleast 15% of sites in each method and within a year
species_pc <- species_year_pc %>%
  filter(prop_sites >= 0.15)

species_aru <- species_year_aru %>%
  filter(prop_sites >= 0.15)

# subset data from the abundance and detections dataframe to keep only species that occur in 15% of all sites during each year of survey
aru_subset <- detections %>%
  semi_join(species_aru, by = c("common_name", "year"))

pc_subset <- abundance %>%
  semi_join(species_pc, by = c("common_name", "year"))

# we only keep species that satisfied the criteria for both point counts and acoustic data. In some cases, a few species satisfied the criteria for only point counts or only acoustic data and such species were filtered out prior to other analyses

# how many species were detected in acoustic data but not point counts?
# no new species were matching the criteria in acoustic data
aru_not_pc <- unique(anti_join(aru_subset[,6], pc_subset[,7]))

# five species matched criteria in point counts that were not found in the acoustic data
pc_not_aru <- unique(anti_join(pc_subset[,7], aru_subset[,6]))

# removing these five species from the pc_subset
pc_subset <- pc_subset %>%
  filter(!common_name %in% c("Dark-eyed Junco","Red Crossbill",
                             "American Goldfinch","Wood Thrush",
                             "American Redstart"))

# create a single dataframe
data <- full_join(pc_subset, aru_subset) %>%
  replace_na(list(abundance = 0, detections = 0))

# criteria of 15% occurrence across sites per year/season for both acoustic data and point count data resulted in 22 species.

3.6 Explore whether time of day shows any variation in abundance/detections?

Prior to running statistical models comparing abundance and acoustic detections, we explore the two aforementioned variables as a function of time of day of survey/detection.

# examine summary of the data and check for NAs
miss_var_summary(data)

# start_time column has NAs which is mostly showing up for the acoustic data suggesting that there are species reported in the audio data during that visit that was not detected in that point count for the very same corresponding visit
data <- data %>%
  group_by(site_id, year, visit_number) %>%
  fill(start_time, .direction = "downup") %>%
  ungroup()

3.7 Examining abundance and acoustic detections as a function of time of day by species

# looking at time as a continuous value, and the counts as the estimate of abundance
data |> 
  mutate(start_time = hms::as_hms(start_time)) |> 
  ggplot(aes(x = start_time, y = abundance, color = site_name)) + 
  geom_segment(aes(y = 0, yend = abundance), alpha = 0.3) + 
  geom_point(alpha = 0.6) + 
  facet_wrap(~common_name, ncol = 3) + 
  labs(x = "Time",
       y = "Number of individuals counted in a 10-minute point count",
       color = "Site")+
  scale_color_manual(values=met.brewer("Homer2", 4))+
  theme_minimal()+
  theme(legend.position = "bottom");
ggsave(filename = "figs/abundance-by-time-continuous.jpg", width = 10, height = 10, units = "in", dpi = 300)

data |> 
  mutate(start_time = hms::as_hms(start_time)) |> 
  ggplot(aes(x = start_time, y = detections, color = site_name)) + 
  geom_segment(aes(y = 0, yend = detections), alpha = 0.3) + 
  geom_point(alpha = 0.6) + 
  facet_wrap(~common_name, ncol = 3, scale = "free_y") + 
  scale_color_manual(values=met.brewer("Homer2", 4))+
  labs(x = "Time",
       y = "Number of acoustic detections in a 10-minute recording",
       color = "Site")+
  theme_minimal()+
  theme(legend.position = "bottom")
ggsave(filename = "figs/acu_detections-by-time-continuous.jpg", width = 10, height = 10, units = "in", dpi = 300)

# I (Orlando) will play with this `data` object
saveRDS(data, "data/data_time_continuous.RDS")

Interestingly, we are not observing much variation between times of day for both acoustic detections and species abundances across species, potentially suggesting that time of day may/may not be impacting our results or might contribute to differences we are observing.

3.8 Save the subsetted data to file

# write to file
write.csv(data, "results/datSubset.csv", row.names = F)