Section 5 Abundance vs. acoustic detections

In this script, we will associations between abundance as estimated from point count data and acoustic detections, which were manually annotated from simultaneously paired audio recorders.

5.1 Load necessary libraries

library(tidyverse)
library(dplyr)
library(stringr)
library(vegan)
library(scico)
library(data.table)
library(extrafont)
library(sf)
library(raster)
library(scales)
library(ggplot2)
library(ggspatial)
library(colorspace)
library(lubridate)
library(naniar)
library(patchwork)
library(glmmTMB)
library(DHARMa)
library(performance)
library(effects)
library(parallel)
library(foreach)
library(doParallel)
library(gridExtra)
library(broom.mixed)
library(scales)

5.2 Load dataframe containing data that is already filtered for a subset of species

data <- read.csv("results/datSubset.csv")

5.3 Correlations between abundance and detections

# explore the distribution of data
hist(data$abundance, main = "Abundance Distribution", xlab = "Abundance")
# most species had an abundance of 1, followed by 2 and fewer species had higher abundances and so on

hist(data$detections, main = "Acoustic Detections Distribution", xlab = "Detections")
# right-tailed skewed distribution that is zero-inflated

plot(data$detections, data$abundance, 
     xlab = "Acoustic Detections", ylab = "Abundance")

# Check correlation
cor(data$abundance, data$detections, use = "complete.obs")

# 0.324 is the value of correlation between abundance and detections across species, sites and years.

5.4 Scaling of predictors and determining distribution family

Since it’s abundance, we will most likely choose Poisson, but we can do a quick assessment of the distribution to ensure we are choosing the right family prior to running mixed models.

## we scale acoustic detections since the range is vastly different when compared to abundance (0-10 vs. 1-500 for example)
data$detections_scaled <- scale(data$detections)

# analyze abundance distribution to choose appropriate family
abundance_analysis <- function(y) {
  
  # summary statistics
  stats <- list(
    mean = mean(y, na.rm = TRUE),
    var = var(y, na.rm = TRUE),
    min = min(y, na.rm = TRUE),
    max = max(y, na.rm = TRUE),
    zeros = sum(y == 0, na.rm = TRUE),
    total_obs = length(y[!is.na(y)])
  )
  
  # dispersion ratio (variance/mean)
  stats$dispersion_ratio <- stats$var / stats$mean
  
  # proportion of zeros
  stats$zero_prop <- stats$zeros / stats$total_obs
  
  return(stats)
}

abundance_stats <- abundance_analysis(data$abundance)
print(abundance_stats)

# visualize to confirm
hist(data$abundance, main = "Abundance Distribution")
plot(table(data$abundance), main = "Abundance Frequency")
qqnorm(data$abundance)
qqline(data$abundance)

# the data suggests that the response is underdispersed (variance is less than the mean and while the data is counts, we rely on a modified version of the Poisson distribution: the Conway-Maxwell-Poisson family that handles both under and overdispersion.

5.5 Generalized linear mixed model between abundance and acoustic detections

# ensure factors are properly coded
data$common_name <- as.factor(data$common_name)
data$year <- as.factor(data$year)
data$site_id <- as.factor(data$site_id)

# recode visit_number as year_visit_number to indicate repeat visits are different between years
data <- data %>%
  mutate(year_visit_number = paste(year, visit_number, sep = "_"))

# set up parallel processing
n_cores <- detectCores() - 1  # Leave one core free
cl <- makeCluster(n_cores)
registerDoParallel(cl)

# export necessary objects to clusters
clusterEvalQ(cl, {
  library(glmmTMB)
})

clusterExport(cl, c("data"))

# we will run models separately for each species
model_results <- list()

# include error handling for species for which the models failed
failed_species <- character()

# get unique species
species_list <- unique(data$common_name)

for(species in species_list) {
  
  cat("Fitting model for:", species, "\n")
  
  species_data <- data[data$common_name == species, ]
  
  # skip if not enough data
  if(nrow(species_data) < 10) {
    cat("Skipping", species, "- insufficient data\n")
    next
  }
  
  tryCatch({
    model_results[[species]] <- glmmTMB(
      abundance ~ detections_scaled + (1|site_id) + (1|year_visit_number),       data = species_data, 
      family = compois,
      control = glmmTMBControl(parallel = n_cores,
                             optCtrl = list(iter.max = 1000, 
                                            eval.max = 1000))
    )
  }, error = function(e) {
    cat("Model failed for", species, ":", e$message, "\n")
    failed_species <<- c(failed_species, species)
  })
}

# check which species failed
print(failed_species) # no models failed but some model/models did not converge

# are there models that did not converge?
converged_status <- sapply(names(model_results), function(species) {
  model <- model_results[[species]]
  if(inherits(model, "glmmTMB")) {
    return(model$fit$convergence == 0)
  }
  return(TRUE)  
})

# show which species didn't converge
non_converged <- names(converged_status)[!converged_status]
cat("Species that didn't converge:", non_converged, "\n")

# we can get rid of Scarlet Tanager from the next set of analyses

5.6 Model diagnostics

# remove non-converged models from model_results
# we got rid of Scarlet Tanager
if(length(non_converged) > 0) {
  model_results <- model_results[!names(model_results) %in% non_converged]
  cat("Removed", length(non_converged), "non-converged models\n")
}

# model diagnostics using DHARMa R package
diagnostics_results <- list()

cat("Running model diagnostics...\n")

for(species in names(model_results)) {
  cat("Diagnostics for:", species, "\n")
  
  tryCatch({
    
    # create DHARMa residuals
    simulationOutput <- simulateResiduals(fittedModel = model_results[[species]], plot = FALSE)
    
    # store diagnostics
    diagnostics_results[[species]] <- list(
      simulation = simulationOutput,
      uniformity_test = testUniformity(simulationOutput, plot = FALSE),
      dispersion_test = testDispersion(simulationOutput, plot = FALSE),
      outliers_test = testOutliers(simulationOutput, plot = FALSE)
    )
    
    # print test results
    cat("  Uniformity test p-value:", diagnostics_results[[species]]$uniformity_test$p.value, "\n")
    cat("  Dispersion test p-value:", diagnostics_results[[species]]$dispersion_test$p.value, "\n")
    cat("  Outliers test p-value:", diagnostics_results[[species]]$outliers_test$p.value, "\n")
    
  }, error = function(e) {
    cat("Diagnostics failed for", species, ":", e$message, "\n")
  })
}

# create diagnostic plots for all species
pdf("figs/model-diagnostics.pdf", width = 12, height = 8)
for(species in names(diagnostics_results)) {
  if(!is.null(diagnostics_results[[species]]$simulation)) {
    par(mfrow = c(2, 2))
    plot(diagnostics_results[[species]]$simulation, main = paste("Diagnostics:", species))
  }
}
dev.off()

# no cases of overdispersion or mild issues of uniformity

5.7 Predict abundance given detections

Following model diagnostic tests, we now plot abundance given acoustic detections for each species and save model summaries.

# extract model fit statistics and effect sizes
model_summary <- list()
effect_sizes <- list()

for(species in names(model_results)) {
  
  # extract model summary
  model_sum <- summary(model_results[[species]])
  
  # get fixed effects
  fixed_effects <- tidy(model_results[[species]], effects = "fixed")
  detection_effect <- fixed_effects[fixed_effects$term == "detections_scaled", ]
  
  # calculate R-squared (pseudo R-squared for GLMMs)
  # using correlation between fitted and observed values
  species_data <- data[data$common_name == species, ]
  fitted_vals <- fitted(model_results[[species]])
  pseudo_r2 <- cor(fitted_vals, species_data$abundance)^2
  
  # store effect size and significance
  effect_sizes[[species]] <- list(
    estimate = detection_effect$estimate,
    std_error = detection_effect$std.error,
    p_value = detection_effect$p.value,
    pseudo_r2 = pseudo_r2,
    significance = case_when(
      detection_effect$p.value < 0.001 ~ "***",
      detection_effect$p.value < 0.01 ~ "**",
      detection_effect$p.value < 0.05 ~ "*",
      detection_effect$p.value < 0.1 ~ ".",
      TRUE ~ "ns"
    ),
    direction = ifelse(detection_effect$estimate > 0, "Positive", "Negative"),
    effect_magnitude = case_when(
      abs(detection_effect$estimate) > 1 ~ "Large",
      abs(detection_effect$estimate) > 0.5 ~ "Medium",
      abs(detection_effect$estimate) > 0.2 ~ "Small",
      TRUE ~ "Very Small"
    )
  )
}

# get scaling parameters
scaling_center <- attr(data$detections_scaled, "scaled:center")
scaling_scale <- attr(data$detections_scaled, "scaled:scale")

# create prediction data
prediction_data_list <- list()
species_ecology_summary <- list()

for(species in names(model_results)) {
  
  species_data <- data[data$common_name == species, ]
  
  # calculate ecological summary statistics
  species_ecology_summary[[species]] <- list(
    n_observations = nrow(species_data),
    n_sites = length(unique(species_data$site_id)),
    mean_abundance = round(mean(species_data$abundance, na.rm = TRUE), 2),
    max_abundance = max(species_data$abundance, na.rm = TRUE),
    detection_range_original = range((species_data$detections_scaled * scaling_scale) + scaling_center),
    abundance_cv = round(sd(species_data$abundance, na.rm = TRUE) / mean(species_data$abundance, na.rm = TRUE), 2)
  )
  
  # create prediction range based on observed data
  detections_range <- seq(min(species_data$detections_scaled, 
                              na.rm = TRUE),
                         max(species_data$detections_scaled, 
                             na.rm = TRUE),
                         length.out = 100)
  
  typical_site <- names(sort(table(species_data$site_id), decreasing = TRUE))[1]
  typical_visit <- names(sort(table(species_data$year_visit_number), decreasing = TRUE))[1]
  
  pred_data <- data.frame(
    detections_scaled = detections_range,
    site_id = typical_site,
    year_visit_number = as.factor(typical_visit),
    common_name = species
  )
  
  # back-transform detections
  pred_data$detections_original <- (pred_data$detections_scaled * scaling_scale) + scaling_center
  
  # generate predictions
  tryCatch({
    predictions <- predict(model_results[[species]], 
                          newdata = pred_data, 
                          type = "response",
                          se.fit = TRUE,
                          re.form = NA)
    
    pred_data$predicted_abundance <- predictions$fit
    pred_data$se <- predictions$se.fit
    pred_data$lower_ci <- pmax(0, predictions$fit - 1.96 * predictions$se.fit)  
    pred_data$upper_ci <- predictions$fit + 1.96 * predictions$se.fit
    
    # Add effect size information
    pred_data$effect_direction <- effect_sizes[[species]]$direction
    pred_data$effect_magnitude <- effect_sizes[[species]]$effect_magnitude
    pred_data$significance <- effect_sizes[[species]]$significance
    pred_data$pseudo_r2 <- effect_sizes[[species]]$pseudo_r2
    pred_data$p_value <- effect_sizes[[species]]$p_value
    
    # calculate fold change from min to max detections
    min_pred <- pred_data$predicted_abundance[1]
    max_pred <- pred_data$predicted_abundance[nrow(pred_data)]
    pred_data$fold_change <- max_pred / min_pred
    
    prediction_data_list[[species]] <- pred_data
    
  }, error = function(e) {
    cat("Prediction failed for", species, ":", e$message, "\n")
  })
}

# combine all prediction data
all_predictions <- do.call(rbind, prediction_data_list)

# create observed data with original detection values
observed_data_list <- list()

for(species in names(model_results)) {
  species_data <- data[data$common_name == species, ]
  species_data$detections_original <- (species_data$detections_scaled * scaling_scale) + scaling_center
  observed_data_list[[species]] <- species_data
}
all_observed <- do.call(rbind, observed_data_list)

# create facet plot with statistical information
fig_predict_det_abundance <- ggplot() +
  
  # add observed data points with transparency
  geom_point(data = all_observed, 
             aes(x = detections_original, y = abundance), 
             alpha = 0.3, color = "gray50", size = 1) +
  
  # add confidence interval
  geom_ribbon(data = all_predictions, 
              aes(x = detections_original, 
                  ymin = lower_ci, ymax = upper_ci), 
              alpha = 0.2, fill = "steelblue") +
  
  # add prediction line colored by effect direction
  geom_line(data = all_predictions, 
            aes(x = detections_original, y = predicted_abundance, 
                color = effect_direction), 
            size = 1.2) +
  
  # add statistical information as text
  geom_text(data = all_predictions %>% 
              group_by(common_name) %>% 
              summarise(
                detections_original = max(detections_original) * 0.05,
                predicted_abundance = max(predicted_abundance) * 0.95,
                significance = first(significance),
                pseudo_r2 = first(pseudo_r2),
                effect_magnitude = first(effect_magnitude),
                p_value = first(p_value),
                fold_change = first(fold_change),
                .groups = 'drop'
              ),
            aes(x = detections_original, y = predicted_abundance,
                label = paste0("R² = ", round(pseudo_r2, 3), significance 
                          )),
            hjust = 0, vjust = 1, size = 2.8, 
            fontface = "bold", color = "black") +
  
  facet_wrap(~ common_name, scales = "free", ncol = 3) +
  
  scale_color_manual(values = c("Positive" = "#2E8B57", 
                                "Negative" = "#CD5C5C"),
                     name = "Association") +
  
  labs(
    x = "Number of acoustic detections",
    y = "Predicted avian abundance (counts of individuals)",
    caption = "R² = pseudo R-squared; *** p<0.001, ** p<0.01, * p<0.05, . p<0.1, ns = not significant\nGray points = observed data, colored lines = model predictions ± 95% CI"
  ) +
  theme_bw() +
  theme(
    strip.text = element_text(size = 10, face = "bold",
                              family = "Century Gothic"),
    axis.text = element_text(size = 9,
                             family = "Century Gothic"),
    axis.title = element_text(size = 12, face = "bold",
                              family = "Century Gothic"),
    plot.title = element_text(size = 16, hjust = 0.5, face = "bold",
                              family = "Century Gothic"),
    plot.subtitle = element_text(size = 12, hjust = 0.5, face = "italic",
                                 family = "Century Gothic"),
    plot.caption = element_text(size = 9, hjust = 0, family = "Century Gothic"),
    legend.position = "bottom",
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "lightgray")
  )

ggsave("figs/fig_predicted_avian_abundance_detections.png", fig_predict_det_abundance, width = 16, height = 12, dpi = 300,)
dev.off()

# create a summary table containing the model results
summary_table <- data.frame(
  Species = names(effect_sizes),
  N_Observations = sapply(species_ecology_summary, function(x) x$n_observations),
  N_Sites = sapply(species_ecology_summary, function(x) x$n_sites),
  Mean_Abundance = sapply(species_ecology_summary, function(x) x$mean_abundance),
  Max_Abundance = sapply(species_ecology_summary, function(x) x$max_abundance),
  Detection_Range = sapply(species_ecology_summary, function(x) 
    paste(round(x$detection_range_original, 1), collapse = " - ")),
  Effect_Direction = sapply(effect_sizes, function(x) x$direction),
  Effect_Size = round(sapply(effect_sizes, function(x) x$estimate), 3),
  Effect_Magnitude = sapply(effect_sizes, function(x) x$effect_magnitude),
  P_Value = round(sapply(effect_sizes, function(x) x$p_value), 4),
  Significance = sapply(effect_sizes, function(x) x$significance),
  Pseudo_R2 = round(sapply(effect_sizes, function(x) x$pseudo_r2), 3),
  Fold_Change = round(sapply(prediction_data_list, function(x) x$fold_change[1]), 2)
)

# write table to file
write.csv(summary_table, "results/glmm_model_abundance_detections.csv", row.names = FALSE)

# write model results and filtered data to file for loading in future scripts
save(model_results, file = "results/glmm_model_results.RData")