Section 5 Abundance vs. acoustic detections
In this script, we will associations between abundance as estimated from point count data and acoustic detections, which were manually annotated from simultaneously paired audio recorders.
5.1 Load necessary libraries
library(tidyverse)
library(dplyr)
library(stringr)
library(vegan)
library(scico)
library(data.table)
library(extrafont)
library(sf)
library(raster)
library(scales)
library(ggplot2)
library(ggspatial)
library(colorspace)
library(lubridate)
library(naniar)
library(patchwork)
library(glmmTMB)
library(DHARMa)
library(performance)
library(effects)
library(parallel)
library(foreach)
library(doParallel)
library(gridExtra)
library(broom.mixed)
library(scales)5.3 Correlations between abundance and detections
# explore the distribution of data
hist(data$abundance, main = "Abundance Distribution", xlab = "Abundance")
# most species had an abundance of 1, followed by 2 and fewer species had higher abundances and so on
hist(data$detections, main = "Acoustic Detections Distribution", xlab = "Detections")
# right-tailed skewed distribution that is zero-inflated
plot(data$detections, data$abundance,
xlab = "Acoustic Detections", ylab = "Abundance")
# Check correlation
cor(data$abundance, data$detections, use = "complete.obs")
# 0.324 is the value of correlation between abundance and detections across species, sites and years.5.4 Scaling of predictors and determining distribution family
Since it’s abundance, we will most likely choose Poisson, but we can do a quick assessment of the distribution to ensure we are choosing the right family prior to running mixed models.
## we scale acoustic detections since the range is vastly different when compared to abundance (0-10 vs. 1-500 for example)
data$detections_scaled <- scale(data$detections)
# analyze abundance distribution to choose appropriate family
abundance_analysis <- function(y) {
# summary statistics
stats <- list(
mean = mean(y, na.rm = TRUE),
var = var(y, na.rm = TRUE),
min = min(y, na.rm = TRUE),
max = max(y, na.rm = TRUE),
zeros = sum(y == 0, na.rm = TRUE),
total_obs = length(y[!is.na(y)])
)
# dispersion ratio (variance/mean)
stats$dispersion_ratio <- stats$var / stats$mean
# proportion of zeros
stats$zero_prop <- stats$zeros / stats$total_obs
return(stats)
}
abundance_stats <- abundance_analysis(data$abundance)
print(abundance_stats)
# visualize to confirm
hist(data$abundance, main = "Abundance Distribution")
plot(table(data$abundance), main = "Abundance Frequency")
qqnorm(data$abundance)
qqline(data$abundance)
# the data suggests that the response is underdispersed (variance is less than the mean and while the data is counts, we rely on a modified version of the Poisson distribution: the Conway-Maxwell-Poisson family that handles both under and overdispersion. 5.5 Generalized linear mixed model between abundance and acoustic detections
# ensure factors are properly coded
data$common_name <- as.factor(data$common_name)
data$year <- as.factor(data$year)
data$site_id <- as.factor(data$site_id)
# recode visit_number as year_visit_number to indicate repeat visits are different between years
data <- data %>%
mutate(year_visit_number = paste(year, visit_number, sep = "_"))
# set up parallel processing
n_cores <- detectCores() - 1 # Leave one core free
cl <- makeCluster(n_cores)
registerDoParallel(cl)
# export necessary objects to clusters
clusterEvalQ(cl, {
library(glmmTMB)
})
clusterExport(cl, c("data"))
# we will run models separately for each species
model_results <- list()
# include error handling for species for which the models failed
failed_species <- character()
# get unique species
species_list <- unique(data$common_name)
for(species in species_list) {
cat("Fitting model for:", species, "\n")
species_data <- data[data$common_name == species, ]
# skip if not enough data
if(nrow(species_data) < 10) {
cat("Skipping", species, "- insufficient data\n")
next
}
tryCatch({
model_results[[species]] <- glmmTMB(
abundance ~ detections_scaled + (1|site_id) + (1|year_visit_number), data = species_data,
family = compois,
control = glmmTMBControl(parallel = n_cores,
optCtrl = list(iter.max = 1000,
eval.max = 1000))
)
}, error = function(e) {
cat("Model failed for", species, ":", e$message, "\n")
failed_species <<- c(failed_species, species)
})
}
# check which species failed
print(failed_species) # no models failed but some model/models did not converge
# are there models that did not converge?
converged_status <- sapply(names(model_results), function(species) {
model <- model_results[[species]]
if(inherits(model, "glmmTMB")) {
return(model$fit$convergence == 0)
}
return(TRUE)
})
# show which species didn't converge
non_converged <- names(converged_status)[!converged_status]
cat("Species that didn't converge:", non_converged, "\n")
# we can get rid of Scarlet Tanager from the next set of analyses5.6 Model diagnostics
# remove non-converged models from model_results
# we got rid of Scarlet Tanager
if(length(non_converged) > 0) {
model_results <- model_results[!names(model_results) %in% non_converged]
cat("Removed", length(non_converged), "non-converged models\n")
}
# model diagnostics using DHARMa R package
diagnostics_results <- list()
cat("Running model diagnostics...\n")
for(species in names(model_results)) {
cat("Diagnostics for:", species, "\n")
tryCatch({
# create DHARMa residuals
simulationOutput <- simulateResiduals(fittedModel = model_results[[species]], plot = FALSE)
# store diagnostics
diagnostics_results[[species]] <- list(
simulation = simulationOutput,
uniformity_test = testUniformity(simulationOutput, plot = FALSE),
dispersion_test = testDispersion(simulationOutput, plot = FALSE),
outliers_test = testOutliers(simulationOutput, plot = FALSE)
)
# print test results
cat(" Uniformity test p-value:", diagnostics_results[[species]]$uniformity_test$p.value, "\n")
cat(" Dispersion test p-value:", diagnostics_results[[species]]$dispersion_test$p.value, "\n")
cat(" Outliers test p-value:", diagnostics_results[[species]]$outliers_test$p.value, "\n")
}, error = function(e) {
cat("Diagnostics failed for", species, ":", e$message, "\n")
})
}
# create diagnostic plots for all species
pdf("figs/model-diagnostics.pdf", width = 12, height = 8)
for(species in names(diagnostics_results)) {
if(!is.null(diagnostics_results[[species]]$simulation)) {
par(mfrow = c(2, 2))
plot(diagnostics_results[[species]]$simulation, main = paste("Diagnostics:", species))
}
}
dev.off()
# no cases of overdispersion or mild issues of uniformity5.7 Predict abundance given detections
Following model diagnostic tests, we now plot abundance given acoustic detections for each species and save model summaries.
# extract model fit statistics and effect sizes
model_summary <- list()
effect_sizes <- list()
for(species in names(model_results)) {
# extract model summary
model_sum <- summary(model_results[[species]])
# get fixed effects
fixed_effects <- tidy(model_results[[species]], effects = "fixed")
detection_effect <- fixed_effects[fixed_effects$term == "detections_scaled", ]
# calculate R-squared (pseudo R-squared for GLMMs)
# using correlation between fitted and observed values
species_data <- data[data$common_name == species, ]
fitted_vals <- fitted(model_results[[species]])
pseudo_r2 <- cor(fitted_vals, species_data$abundance)^2
# store effect size and significance
effect_sizes[[species]] <- list(
estimate = detection_effect$estimate,
std_error = detection_effect$std.error,
p_value = detection_effect$p.value,
pseudo_r2 = pseudo_r2,
significance = case_when(
detection_effect$p.value < 0.001 ~ "***",
detection_effect$p.value < 0.01 ~ "**",
detection_effect$p.value < 0.05 ~ "*",
detection_effect$p.value < 0.1 ~ ".",
TRUE ~ "ns"
),
direction = ifelse(detection_effect$estimate > 0, "Positive", "Negative"),
effect_magnitude = case_when(
abs(detection_effect$estimate) > 1 ~ "Large",
abs(detection_effect$estimate) > 0.5 ~ "Medium",
abs(detection_effect$estimate) > 0.2 ~ "Small",
TRUE ~ "Very Small"
)
)
}
# get scaling parameters
scaling_center <- attr(data$detections_scaled, "scaled:center")
scaling_scale <- attr(data$detections_scaled, "scaled:scale")
# create prediction data
prediction_data_list <- list()
species_ecology_summary <- list()
for(species in names(model_results)) {
species_data <- data[data$common_name == species, ]
# calculate ecological summary statistics
species_ecology_summary[[species]] <- list(
n_observations = nrow(species_data),
n_sites = length(unique(species_data$site_id)),
mean_abundance = round(mean(species_data$abundance, na.rm = TRUE), 2),
max_abundance = max(species_data$abundance, na.rm = TRUE),
detection_range_original = range((species_data$detections_scaled * scaling_scale) + scaling_center),
abundance_cv = round(sd(species_data$abundance, na.rm = TRUE) / mean(species_data$abundance, na.rm = TRUE), 2)
)
# create prediction range based on observed data
detections_range <- seq(min(species_data$detections_scaled,
na.rm = TRUE),
max(species_data$detections_scaled,
na.rm = TRUE),
length.out = 100)
typical_site <- names(sort(table(species_data$site_id), decreasing = TRUE))[1]
typical_visit <- names(sort(table(species_data$year_visit_number), decreasing = TRUE))[1]
pred_data <- data.frame(
detections_scaled = detections_range,
site_id = typical_site,
year_visit_number = as.factor(typical_visit),
common_name = species
)
# back-transform detections
pred_data$detections_original <- (pred_data$detections_scaled * scaling_scale) + scaling_center
# generate predictions
tryCatch({
predictions <- predict(model_results[[species]],
newdata = pred_data,
type = "response",
se.fit = TRUE,
re.form = NA)
pred_data$predicted_abundance <- predictions$fit
pred_data$se <- predictions$se.fit
pred_data$lower_ci <- pmax(0, predictions$fit - 1.96 * predictions$se.fit)
pred_data$upper_ci <- predictions$fit + 1.96 * predictions$se.fit
# Add effect size information
pred_data$effect_direction <- effect_sizes[[species]]$direction
pred_data$effect_magnitude <- effect_sizes[[species]]$effect_magnitude
pred_data$significance <- effect_sizes[[species]]$significance
pred_data$pseudo_r2 <- effect_sizes[[species]]$pseudo_r2
pred_data$p_value <- effect_sizes[[species]]$p_value
# calculate fold change from min to max detections
min_pred <- pred_data$predicted_abundance[1]
max_pred <- pred_data$predicted_abundance[nrow(pred_data)]
pred_data$fold_change <- max_pred / min_pred
prediction_data_list[[species]] <- pred_data
}, error = function(e) {
cat("Prediction failed for", species, ":", e$message, "\n")
})
}
# combine all prediction data
all_predictions <- do.call(rbind, prediction_data_list)
# create observed data with original detection values
observed_data_list <- list()
for(species in names(model_results)) {
species_data <- data[data$common_name == species, ]
species_data$detections_original <- (species_data$detections_scaled * scaling_scale) + scaling_center
observed_data_list[[species]] <- species_data
}
all_observed <- do.call(rbind, observed_data_list)
# create facet plot with statistical information
fig_predict_det_abundance <- ggplot() +
# add observed data points with transparency
geom_point(data = all_observed,
aes(x = detections_original, y = abundance),
alpha = 0.3, color = "gray50", size = 1) +
# add confidence interval
geom_ribbon(data = all_predictions,
aes(x = detections_original,
ymin = lower_ci, ymax = upper_ci),
alpha = 0.2, fill = "steelblue") +
# add prediction line colored by effect direction
geom_line(data = all_predictions,
aes(x = detections_original, y = predicted_abundance,
color = effect_direction),
size = 1.2) +
# add statistical information as text
geom_text(data = all_predictions %>%
group_by(common_name) %>%
summarise(
detections_original = max(detections_original) * 0.05,
predicted_abundance = max(predicted_abundance) * 0.95,
significance = first(significance),
pseudo_r2 = first(pseudo_r2),
effect_magnitude = first(effect_magnitude),
p_value = first(p_value),
fold_change = first(fold_change),
.groups = 'drop'
),
aes(x = detections_original, y = predicted_abundance,
label = paste0("R² = ", round(pseudo_r2, 3), significance
)),
hjust = 0, vjust = 1, size = 2.8,
fontface = "bold", color = "black") +
facet_wrap(~ common_name, scales = "free", ncol = 3) +
scale_color_manual(values = c("Positive" = "#2E8B57",
"Negative" = "#CD5C5C"),
name = "Association") +
labs(
x = "Number of acoustic detections",
y = "Predicted avian abundance (counts of individuals)",
caption = "R² = pseudo R-squared; *** p<0.001, ** p<0.01, * p<0.05, . p<0.1, ns = not significant\nGray points = observed data, colored lines = model predictions ± 95% CI"
) +
theme_bw() +
theme(
strip.text = element_text(size = 10, face = "bold",
family = "Century Gothic"),
axis.text = element_text(size = 9,
family = "Century Gothic"),
axis.title = element_text(size = 12, face = "bold",
family = "Century Gothic"),
plot.title = element_text(size = 16, hjust = 0.5, face = "bold",
family = "Century Gothic"),
plot.subtitle = element_text(size = 12, hjust = 0.5, face = "italic",
family = "Century Gothic"),
plot.caption = element_text(size = 9, hjust = 0, family = "Century Gothic"),
legend.position = "bottom",
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = "lightgray")
)
ggsave("figs/fig_predicted_avian_abundance_detections.png", fig_predict_det_abundance, width = 16, height = 12, dpi = 300,)
dev.off()
# create a summary table containing the model results
summary_table <- data.frame(
Species = names(effect_sizes),
N_Observations = sapply(species_ecology_summary, function(x) x$n_observations),
N_Sites = sapply(species_ecology_summary, function(x) x$n_sites),
Mean_Abundance = sapply(species_ecology_summary, function(x) x$mean_abundance),
Max_Abundance = sapply(species_ecology_summary, function(x) x$max_abundance),
Detection_Range = sapply(species_ecology_summary, function(x)
paste(round(x$detection_range_original, 1), collapse = " - ")),
Effect_Direction = sapply(effect_sizes, function(x) x$direction),
Effect_Size = round(sapply(effect_sizes, function(x) x$estimate), 3),
Effect_Magnitude = sapply(effect_sizes, function(x) x$effect_magnitude),
P_Value = round(sapply(effect_sizes, function(x) x$p_value), 4),
Significance = sapply(effect_sizes, function(x) x$significance),
Pseudo_R2 = round(sapply(effect_sizes, function(x) x$pseudo_r2), 3),
Fold_Change = round(sapply(prediction_data_list, function(x) x$fold_change[1]), 2)
)
# write table to file
write.csv(summary_table, "results/glmm_model_abundance_detections.csv", row.names = FALSE)
# write model results and filtered data to file for loading in future scripts
save(model_results, file = "results/glmm_model_results.RData")