# INSTALL CRAN PACKAGES
# Install missing CRAN packages
install.packages(setdiff(
c(
"stats", "dplyr", "ggplot2", "flextable", "ggpubr",
"randomForest", "ggridges", "ggalluvial", "tibble",
"matrixStats", "RColorBrewer", "ape", "rlang",
"scales", "magrittr", "phangorn", "igraph", "tidyr",
"xml2", "data.table", "reshape2", "vegan", "patchwork", "officer"
),
installed.packages()[, "Package"]
))
# Load CRAN packages
lapply(c(
"stats", "dplyr", "ggplot2", "flextable", "ggpubr", "randomForest",
"ggridges", "ggalluvial", "tibble", "matrixStats", "RColorBrewer",
"ape", "rlang", "scales", "magrittr", "phangorn", "igraph", "tidyr",
"xml2", "data.table", "reshape2", "vegan", "patchwork", "officer"
), library, character.only = TRUE)
# INSTALL BIOCONDUCTOR PACKAGES
# Install BiocManager if not installed
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# Install missing Bioconductor packages
BiocManager::install(setdiff(
c(
"phyloseq", "msa", "DESeq2", "ggtree", "edgeR",
"Biostrings", "DECIPHER", "microbiome", "limma",
"S4Vectors", "SummarizedExperiment", "TreeSummarizedExperiment"
),
installed.packages()[, "Package"]
))
# Load Bioconductor packages
lapply(
c(
"phyloseq", "msa", "DESeq2", "edgeR", "Biostrings", "ggtree", "DECIPHER",
"microbiome", "limma", "S4Vectors", "SummarizedExperiment", "TreeSummarizedExperiment"
),
library,
character.only = TRUE
)
# INSTALL DspikeIn FROM GITHUB
# To access the DspikeIn vignette for a detailed tutorial, use vignette("DspikeIn"), or browse all available vignettes with browseVignettes("DspikeIn").
devtools::install_github("mghotbi/DspikeIn", build_vignettes = TRUE, dependencies = TRUE)
library(DspikeIn)
browseVignettes("DspikeIn")
vignette("DspikeIn")
## or
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
devtools::install_github("mghotbi/DspikeIn")
# Load DspikeIn only if installed
if ("DspikeIn" %in% installed.packages()[, "Package"]) {
library(DspikeIn)
} else {
stop("DspikeIn installation failed. Check errors above.")
}
The development of the DspikeIn package was made possible through the generous and pioneering efforts of the R and Bioconductor communities. We gratefully acknowledge the developers and maintainers of the following open-source packages, whose tools and infrastructure underpin our work: Core infrastructure & data manipulation: methods, stats, utils, graphics, grDevices, data.table, dplyr, tibble, tidyr, reshape2, matrixStats, rlang, S4Vectors, grid, officer, xml2 Statistical analysis & modeling: DESeq2, edgeR, limma, randomForest, microbiome Phylogenetics & microbiome structure: phyloseq, TreeSummarizedExperiment, SummarizedExperiment, phangorn, ape, DECIPHER, msa, Biostrings Network and graph analysis: igraph, ggraph Visualization & layout design: ggplot2, ggrepel, ggpubr, ggnewscale, ggalluvial, ggtree, ggtreeExtra, ggstar, ggridges, patchwork, scales, RColorBrewer, flextable
These tools collectively empowered us to build a reproducible, modular, and extensible platform for robust absolute abundance quantification in microbial community analysis. We further acknowledge the broader scientific community working on absolute microbial quantification, spike-in calibration, and compositional data analysis, whose foundational insights directly informed the design and conceptual framework of DspikeIn.
The DspikeIn package supports both phyloseq and TreeSummarizedExperiment formats to streamline microbial quantification across diverse experimental setups. It accommodates either a single spike-in taxon or synthetic community taxa with variable or equal spike-in volumes and copy numbers. The package offers a comprehensive suite of tools for AA quantification, addressing challenges through ten core functions: 1) validation of spiked species, 2) data preprocessing, 3) system-specific spiked species retrieval, 4) scaling factor calculation, 5) conversion to absolute abundance, 6) bias correction and normalization, 7) performance assessment, and 8) taxa exploration and filtering 9) network topology assessment 10) further analyses and visualization.
DspikeIn works with 7 taxonomic ranks
# To remove strain from the taxonomic ranks
# DspikeIn works with 7 taxonomic ranks
# colnames(taxmat) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
# Function to remove strain or bracketed info
remove_strain_info <- function(df) {
pattern <- "Strain.*|strain.*|\\s*\\[.*?\\]|\\s*\\(.*?\\)"
for (col in colnames(df)) {
df[[col]] <- gsub(pattern, "", df[[col]])
df[[col]] <- trimws(df[[col]])}
return(df)}
# Apply to rowData of TSE
taxonomy <- as.data.frame(rowData(tse_16SOTU))
# Clean strain
cleaned_taxonomy <- remove_strain_info(taxonomy)
# drop "Strain" if present
if ("Strain" %in% colnames(cleaned_taxonomy)) {
cleaned_taxonomy <- cleaned_taxonomy[, colnames(cleaned_taxonomy) != "Strain"]}
# Re-assign to TSE
rowData(tse_16SOTU) <- cleaned_taxonomy
# To add species rank to the taxonomic ranks
# Make sure 'Genus' exists
if (!"Genus" %in% colnames(rowData(tse_16SOTU))) {
stop("The 'Genus' column is missing in rowData.")}
# prepare taxonomy
taxonomy <- as.data.frame(rowData(tse_16SOTU))
# Handle missing genus labels
taxonomy$Genus[is.na(taxonomy$Genus) | taxonomy$Genus == ""] <- "Unknown"
taxonomy$Species <- paste0(taxonomy$Genus, "_OTU", seq_len(nrow(taxonomy)))
# Assign back to TSE
rowData(tse_16SOTU) <- taxonomy
for more information please refer to https://github.com/markrobinsonuzh/TreeSummarizedExperiment
# Build TreeSummarizedExperiment (TSE)
otu <- read.csv("otu.csv", header = TRUE, sep = ",", row.names = 1)
otu_mat <- as.matrix(otu) # Convert to matrix
tax <- read.csv("tax.csv", header = TRUE, sep = ",", row.names = 1)
colnames(tax) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
tax_mat <- as.matrix(tax) # Convert to matrix
meta <- read.csv("metadata.csv", header = TRUE, sep = ",", row.names = 1)
reference_seqs <- readDNAStringSet("dna-sequences.fasta", format = "fasta")
tse <- TreeSummarizedExperiment(
assays = list(counts = otu_mat), # OTU table
rowData = tax_mat, # Taxonomy information
colData = meta, # Sample metadata
rowTree = MyTree, # Phylogenetic tree
rowSeqs = reference_seqs # Reference sequences)
Do all detected sample spike-in sequences cluster with the reference, and are their branch lengths statistically similar, supporting a common ancestor?
All sample-derived sequences are forming a clade with the reference. We look for a monophyletic grouping of spike-in OTUs The clade is strongly supported (bootstrap around 100 percentage). The branch lengths and distances are in a biologically plausible range.
# Use the Neighbor-Joining method based on a Jukes-Cantor distance matrix and plot the tree with bootstrap values.
# we compare the Sanger read of Tetragenococcus halophilus with the FASTA sequence of Tetragenococcus halophilus from our phyloseq object.
library(Biostrings)
library(TreeSummarizedExperiment)
library(SummarizedExperiment)
library(DspikeIn)
# Get path to external data folder
extdata_path <- system.file("extdata", package = "DspikeIn")
list.files(extdata_path)
## [1] "Ref.fasta" "Sample.fasta"
data("physeq_16SOTU", package = "DspikeIn")
tse_16SOTU<-convert_phyloseq_to_tse(physeq_16SOTU)
tse_16SOTU <- tidy_phyloseq_tse(tse_16SOTU)
# Filter TSE object to keep only Bacteria and Archaea
tse_16SOTU <- tse_16SOTU[
rowData(tse_16SOTU)$Kingdom %in% c("Bacteria", "Archaea"),]
library(Biostrings)
#
# # Subset the TSE object to include only Tetragenococcus
# tetragenococcus_tse <- tse_16SOTU[
# rowData(tse_16SOTU)$Genus == "Tetragenococcus" &
# !is.na(rownames(tse_16SOTU)) &
# rownames(tse_16SOTU) != "", ]
#
# ref_sequences <- referenceSeq(tetragenococcus_tse)
#
# # Convert to DNAStringSet if needed
# ref_sequences <- Biostrings::DNAStringSet(ref_sequences)
# Biostrings::writeXStringSet(ref_sequences, filepath = "Sample.fasta")
ref_fasta <- system.file("extdata", "Ref.fasta", package = "DspikeIn")
sample_fasta <- system.file("extdata", "Sample.fasta", package = "DspikeIn")
result <- validate_spikein_clade(
reference_fasta = ref_fasta,
sample_fasta = sample_fasta,
bootstrap = 200,
output_prefix = NULL)
## use default substitution matrix
Tip labels= OTU/ASV names Branch length numbers= Actual evolutionary distances (small = very similar) Prevalence stars How frequently the OTU occurs across samples Blue bar ring= Log10 mean abundance Outer colored tiles= The metadata variable you choose (e.g., Animal.type)
## class: TreeSummarizedExperiment
## dim: 9242 312
## metadata(0):
## assays(1): counts
## rownames(9242): 020e00d90ba97c5898944ab6f7b1b7c9
## b00466354053c9065c8aa3d6fbb33eaa ... 17f00f9912d4e139ebcfce6354d3fa97
## 5a28da69cc65fb06e686e6260ecbfe0e
## rowData names(7): Kingdom Phylum ... Genus Species
## colnames(312): blank.20818_S96 NTCswab.20626_S96 ... UHM998.20618_S95
## UHM999.20617_S83
## colData names(34): sample.id X16S.biosample ... swab.presence MK.spike
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
## rowLinks: a LinkDataFrame (9242 rows)
## rowTree: 1 phylo tree(s) (9255 leaves)
## colLinks: NULL
## colTree: NULL
## referenceSeq: a DNAStringSet (9242 sequences)
library(ggstar)
library(ggplot2)
# filter your object to only include spike-in taxa beforehand:
# change the OTU IDs for easy detection
# Big stars = detected in many samples
# Small stars = rarely detected
# log10(Mean Abundance) Bars= Color intensity reflects mean abundance.
# The log-transformed average abundance of each OTU across all samples where it appears.
# Extreme blue may signal unintended over-representation.
# Metadata Ring = factor of your interest e.g. Animal.type
# Each OTU is colored by where it was observed.
# Branch length numbers= Actual evolutionary distances (small = very similar)
library(DspikeIn)
library(TreeSummarizedExperiment)
# ---- 1. Subset taxa where Genus is Tetragenococcus ----
spikein_tse <- tse_16SOTU[
rowData(tse_16SOTU)$Genus == "Tetragenococcus", ]
# ---- 2. Diagnostic plot (tree-based) ----
ps <- plot_spikein_tree_diagnostic(
obj = spikein_tse,
metadata_var = "Animal.type",
save_plot = FALSE )
merges monophyletic ASVs/OTUs
# CALCULATE SCALING FACTORS
result <- calculate_spikeIn_factors(
Spiked_16S_sum_scaled,
spiked_cells = spiked_cells,
merged_spiked_species = species_name)
# View extracted outputs
result$spiked_species_reads # Merged spiked species name
## # A tibble: 264 × 2
## Sample Spiked_Reads
## <chr> <dbl>
## 1 spiked.blank.20433_S84 8
## 2 spiked.blank.20817_S84 47066
## 3 Std2uL.20625_S84 62433
## 4 StdSwab1uL.20624_S72 17639
## 5 STP1719.20422_S47 14554
## 6 STP213.20423_S59 83
## 7 STP268.20424_S71 17
## 8 STP544.20419_S11 2259
## 9 STP570.20420_S23 822
## 10 STP579.20421_S35 1759
## # ℹ 254 more rows
## # A tibble: 264 × 2
## Sample Total_Reads
## <chr> <dbl>
## 1 spiked.blank.20433_S84 8
## 2 spiked.blank.20817_S84 47103
## 3 Std2uL.20625_S84 62444
## 4 StdSwab1uL.20624_S72 24897
## 5 STP1719.20422_S47 19142
## 6 STP213.20423_S59 8462
## 7 STP268.20424_S71 6968
## 8 STP544.20419_S11 2340
## 9 STP570.20420_S23 2647
## 10 STP579.20421_S35 5193
## # ℹ 254 more rows
scaling_factors <- result$scaling_factors
head(scaling_factors) # Vector of scaling factors per sample
## spiked.blank.20433_S84 spiked.blank.20817_S84 Std2uL.20625_S84
## 230.87500000 0.03924277 0.02958371
## StdSwab1uL.20624_S72 STP1719.20422_S47 STP213.20423_S59
## 0.05235558 0.12690669 22.25301205
# Convert relative counts to absolute counts
# absolute counts=relative counts×sample-specific scaling factor
# Convert to absolute counts
library(DspikeIn)
absolute <- convert_to_absolute_counts(Spiked_16S_sum_scaled, scaling_factors)
# Extract processed data
absolute_counts <- absolute$absolute_counts
tse_absolute <- absolute$obj_adj
tse_absolute <- tidy_phyloseq_tse(tse_absolute)
# View absolute count data
head(tse_absolute)
## class: TreeSummarizedExperiment
## dim: 6 264
## metadata(0):
## assays(1): counts
## rownames(6): 020e00d90ba97c5898944ab6f7b1b7c9
## b00466354053c9065c8aa3d6fbb33eaa ... ed285eb1aac505a1f062b482300b69f7
## 63f5509575600a9e7afb6847d6296976
## rowData names(7): Kingdom Phylum ... Genus Species
## colnames(264): spiked.blank.20433_S84 spiked.blank.20817_S84 ...
## UHM998.20618_S95 UHM999.20617_S83
## colData names(34): sample.id X16S.biosample ... swab.presence MK.spike
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
## rowLinks: a LinkDataFrame (6 rows)
## rowTree: 1 phylo tree(s) (9227 leaves)
## colLinks: NULL
## colTree: NULL
# CALCULATE SPIKE PERCENTAGE & summary stat
# ** Calculate spike percentage & Generate summary statistics for absolute counts**
# Generate summary statistics for absolute counts
post_eval_summary <- calculate_summary_stats_table(absolute_counts)
# You may want to Back normal to check calculation accuracy
# the scaling factor was computed based on spiked species reads and fixed cell count.
# Multiplying the spiked species read count by this scaling factor restores the exact spiked cell count.
# lets check it
# BackNormal <- calculate_spike_percentage(
# tse_absolute,
# merged_spiked_species,
# passed_range = c(0.1, 20)
# )
The goal is to identify the range where, for example, the evenness of your community remains independent of spiked species retrieval—meaning the p-value should not be significant, and the R² value should be low, indicating minimal influence. Hill number is interpretable as “effective diversity” (number of abundant species).
# The acceptable range of spiked species retrieval is system-dependent
# Spiked species become centroid of the community (Distance to Centroid)
# Spiked species become dominant and imbalance the community (Evenness)
# What range of spiked species retrieval is appropriate for your system?
# Calculate Pielou's Evenness using Shannon index and species richness (Observed)
# Hill number q = 1 = exp(Shannon index), representing the effective number of equally abundant species. Hill is weighted by relative abundance, so dominant species have more influence.
# Unlike Pielou's evenness, this metric is not normalized by richness and it shows Effective number of common species
library(TreeSummarizedExperiment)
library(SummarizedExperiment)
library(dplyr)
library(tibble)
library(microbiome)
library(mia)
library(vegan)
library(S4Vectors)
library(ggplot2)
# --- 1. Extract current metadata from TSE ---
metadata <- colData(tse_absolute) %>%
as.data.frame() %>%
rownames_to_column("Sample")
# --- 2. Add spike-in reads (Perc) ---
# Ensure Perc has 'Sample' column and matching format
metadata <- dplyr::left_join(metadata, Perc, by = "Sample")
# --- 3. Estimate alpha diversity indices and extract ---
tse_absolute <- mia::addAlpha(
tse_absolute,
index = c("observed", "shannon", "pielou", "hill")
)
alpha_df <- colData(tse_absolute) %>%
as.data.frame() %>%
rownames_to_column("Sample") %>%
select(Sample, observed, shannon, pielou, hill)
metadata <- dplyr::left_join(metadata, alpha_df, by = "Sample")
# --- 4. Compute Bray-Curtis distance to centroid ---
otu_mat <- assay(tse_absolute, "counts")
otu_mat <- t(otu_mat) # samples as rows
otu_mat_rel <- vegan::decostand(otu_mat, method = "total")
centroid_profile <- colMeans(otu_mat_rel)
dist_to_centroid <- apply(otu_mat_rel, 1, function(x) {
vegan::vegdist(rbind(x, centroid_profile), method = "bray")[1]
})
# Match and assign distances
metadata$Dist_to_Centroid <- dist_to_centroid[metadata$Sample]
# --- 5. Assign updated metadata to TSE ---
metadata <- column_to_rownames(metadata, var = "Sample")
metadata <- metadata[colnames(tse_absolute), , drop = FALSE] # ensure correct order and size
colData(tse_absolute) <- S4Vectors::DataFrame(metadata)
# 4. Regression Plots: Diversity vs. Spike-in Reads
# ============================================================================
# Pielou’s Evenness
plot_object_pielou <- regression_plot(
data = metadata,
x_var = "pielou",
y_var = "Spiked_Reads",
custom_range = c(0.1, 20, 30, 40, 50, 60, 100),
plot_title = "Pielou’s Evenness vs. Spike-in Reads"
)
# Hill Number (q = 1)
plot_object_hill <- regression_plot(
data = metadata,
x_var = "hill",
y_var = "Spiked_Reads",
custom_range = c(0.1, 10, 20, 30, 100),
plot_title = "Hill Number (q = 1) vs. Spike-in Reads"
)
# Distance to Centroid
plot_object_centroid <- regression_plot(
data = metadata,
x_var = "Dist_to_Centroid",
y_var = "Spiked_Reads",
custom_range = c(0.1, 20, 30, 40, 50, 60, 100),
plot_title = "Distance to Global Centroid vs. Spike-in Reads"
)
# Interpretation
# ============================================================================
# - Pielou's evenness is normalized by richness; useful for detecting imbalance.
# - Hill number q = 1 gives effective number of common species; sensitive to dominance.
# - Distance to centroid in full Bray-Curtis space shows deviation from the average community.
# * Calculate the percentage of spiked species retrieval per sample*
library(mia)
library(dplyr)
library(SummarizedExperiment)
# Subset TSE to remove blanks
tse_absolute_filtered <- tse_absolute[, colData(tse_absolute)$sample.or.blank != "blank"]
# Calculate spike-in retrieval percentage
result_perc <- calculate_spike_percentage(
tse_absolute_filtered,
merged_spiked_species,
passed_range = c(0.1, 20)
)
# Generate conclusion report
conc <- conclusion(
tse_absolute_filtered,
merged_spiked_species,
max_passed_range = 20,
output_path = output_path)
conc$full_report
## # A tibble: 260 × 5
## Sample Total_Reads Spiked_Reads Percentage Result
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 STP1719.20422_S47 2427 1847 76.1 failed
## 2 STP213.20423_S59 188314 1847 0.981 passed
## 3 STP268.20424_S71 757053 1847 0.244 passed
## 4 STP544.20419_S11 1913 1847 96.5 failed
## 5 STP570.20420_S23 5948 1847 31.1 failed
## 6 STP579.20421_S35 5452 1847 33.9 failed
## 7 STP614.20418_S94 5 0 0 failed
## 8 UHM1000.20604_S22 43347 924 2.13 passed
## 9 UHM1001.20609_S82 39309 924 2.35 passed
## 10 UHM1007.20622_S48 86418 924 1.07 passed
## # ℹ 250 more rows
# Filter to keep only the samples that passed
passed_samples <- result_perc$Sample[result_perc$Result == "passed"]
# Subset the TSE object to include only passed samples
tse_passed <- tse_absolute_filtered[, colnames(tse_absolute_filtered) %in% passed_samples]
dim(tse_passed)
## [1] 8742 177
you may select to transform your data befor moving forward with Differential Abundance
# you may need to normalize/transform your data to reduce biases
ps <- tse_16SOTU
# TC Normalization
result_TC <- normalization_set(ps, method = "TC", groups = "Host.species")
normalized_ps_TC <- result_TC$dat.normed
scaling_factors_TC <- result_TC$scaling.factor
# UQ Normalization
result_UQ <- normalization_set(ps, method = "UQ", groups = "Host.species")
normalized_ps_UQ <- result_UQ$dat.normed
scaling_factors_UQ <- result_UQ$scaling.factor
# Median Normalization
result_med <- normalization_set(ps, method = "med", groups = "Host.species")
normalized_ps_med <- result_med$dat.normed
scaling_factors_med <- result_med$scaling.factor
# DESeq Normalization
ps_n <- remove_zero_negative_count_samples(ps)
result_DESeq <- normalization_set(ps_n, method = "DESeq", groups = "Animal.type")
normalized_ps_DESeq <- result_DESeq$dat.normed
scaling_factors_DESeq <- result_DESeq$scaling.factor
# Poisson Normalization
result_Poisson <- normalization_set(ps, method = "Poisson", groups = "Host.genus")
normalized_ps_Poisson <- result_Poisson$dat.normed
scaling_factors_Poisson <- result_Poisson$scaling.factor
# Quantile Normalization
result_QN <- normalization_set(ps, method = "QN")
normalized_ps_QN <- result_QN$dat.normed
scaling_factors_QN <- result_QN$scaling.factor
# TMM Normalization
result_TMM <- normalization_set(ps, method = "TMM", groups = "Animal.type")
normalized_ps_TMM <- result_TMM$dat.normed
scaling_factors_TMM <- result_TMM$scaling.factor
# CLR Normalization
result_clr <- normalization_set(ps, method = "clr")
normalized_ps_clr <- result_clr$dat.normed
scaling_factors_clr <- result_clr$scaling.factor
# Rarefying
result_rar <- normalization_set(ps, method = "rar")
normalized_ps_rar <- result_rar$dat.normed
scaling_factors_rar <- result_rar$scaling.factor
# CSS Normalization
result_css <- normalization_set(ps, method = "css")
normalized_ps_css <- result_css$dat.normed
scaling_factors_css <- result_css$scaling.factor
# TSS Normalization
result_tss <- normalization_set(ps, method = "tss")
normalized_ps_tss <- result_tss$dat.normed
scaling_factors_tss <- result_tss$scaling.factor
# RLE Normalization
result_rle <- normalization_set(ps, method = "rle")
normalized_ps_rle <- result_rle$dat.normed
scaling_factors_rle <- result_rle$scaling.factor
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.22-bioc/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] mia_1.17.0 MultiAssayExperiment_1.35.3
## [3] vegan_2.7-1 permute_0.9-7
## [5] microbiome_1.31.0 tibble_3.3.0
## [7] dplyr_1.1.4 flextable_0.9.9
## [9] TreeSummarizedExperiment_2.17.0 SingleCellExperiment_1.31.0
## [11] SummarizedExperiment_1.39.0 Biobase_2.69.0
## [13] GenomicRanges_1.61.0 MatrixGenerics_1.21.0
## [15] matrixStats_1.5.0 ggplot2_3.5.2
## [17] ggstar_1.0.4 DspikeIn_0.99.13
## [19] phyloseq_1.53.0 Biostrings_2.77.1
## [21] GenomeInfoDb_1.45.4 XVector_0.49.0
## [23] IRanges_2.43.0 S4Vectors_0.47.0
## [25] BiocGenerics_0.55.0 generics_0.1.4
## [27] BiocStyle_2.37.0
##
## loaded via a namespace (and not attached):
## [1] splines_4.5.1 ggplotify_0.1.2
## [3] cellranger_1.1.0 DirichletMultinomial_1.51.0
## [5] lifecycle_1.0.4 rstatix_0.7.2
## [7] edgeR_4.7.2 lattice_0.22-7
## [9] MASS_7.3-65 backports_1.5.0
## [11] magrittr_2.0.3 limma_3.65.1
## [13] sass_0.4.10 rmarkdown_2.29
## [15] jquerylib_0.1.4 yaml_2.3.10
## [17] zip_2.3.3 askpass_1.2.1
## [19] DBI_1.2.3 RColorBrewer_1.1-3
## [21] ade4_1.7-23 multcomp_1.4-28
## [23] abind_1.4-8 Rtsne_0.17
## [25] quadprog_1.5-8 fillpattern_1.0.2
## [27] purrr_1.0.4 TH.data_1.1-3
## [29] yulab.utils_0.2.0 sandwich_3.1-1
## [31] gdtools_0.4.2 ggrepel_0.9.6
## [33] irlba_2.3.5.1 tidytree_0.4.6
## [35] rbiom_2.2.0 parallelly_1.45.0
## [37] DelayedMatrixStats_1.31.0 codetools_0.2-20
## [39] DelayedArray_0.35.2 ggtext_0.1.2
## [41] scuttle_1.19.0 xml2_1.3.8
## [43] tidyselect_1.2.1 aplot_0.2.6
## [45] UCSC.utils_1.5.0 farver_2.1.2
## [47] viridis_0.6.5 ScaledMatrix_1.17.0
## [49] jsonlite_2.0.0 ggtreeExtra_1.19.0
## [51] BiocNeighbors_2.3.1 multtest_2.65.0
## [53] msa_1.41.0 decontam_1.29.0
## [55] Formula_1.2-5 emmeans_1.11.1
## [57] ggalluvial_0.12.5 survival_3.8-3
## [59] scater_1.37.0 iterators_1.0.14
## [61] systemfonts_1.2.3 foreach_1.5.2
## [63] tools_4.5.1 ggnewscale_0.5.1
## [65] treeio_1.33.0 ragg_1.4.0
## [67] Rcpp_1.0.14 glue_1.8.0
## [69] gridExtra_2.3 SparseArray_1.9.0
## [71] BiocBaseUtils_1.11.0 xfun_0.52
## [73] mgcv_1.9-3 DESeq2_1.49.2
## [75] withr_3.0.2 BiocManager_1.30.26
## [77] fastmap_1.2.0 bluster_1.19.0
## [79] rhdf5filters_1.21.0 openssl_2.3.3
## [81] rsvd_1.0.5 digest_0.6.37
## [83] estimability_1.5.1 R6_2.6.1
## [85] gridGraphics_0.5-1 textshaping_1.0.1
## [87] dichromat_2.0-0.1 utf8_1.2.6
## [89] tidyr_1.3.1 fontLiberation_0.1.0
## [91] data.table_1.17.6 DECIPHER_3.5.0
## [93] httr_1.4.7 S4Arrays_1.9.1
## [95] pkgconfig_2.0.3 gtable_0.3.6
## [97] htmltools_0.5.8.1 fontBitstreamVera_0.1.1
## [99] carData_3.0-5 bookdown_0.43
## [101] biomformat_1.37.0 scales_1.4.0
## [103] ggfun_0.1.8 knitr_1.50
## [105] tzdb_0.5.0 reshape2_1.4.4
## [107] uuid_1.2-1 coda_0.19-4.1
## [109] nlme_3.1-168 zoo_1.8-14
## [111] cachem_1.1.0 rhdf5_2.53.1
## [113] stringr_1.5.1 vipor_0.4.7
## [115] parallel_4.5.1 pillar_1.10.2
## [117] grid_4.5.1 vctrs_0.6.5
## [119] slam_0.1-55 randomForest_4.7-1.2
## [121] ggpubr_0.6.0 car_3.1-3
## [123] BiocSingular_1.25.0 beachmat_2.25.1
## [125] xtable_1.8-4 cluster_2.1.8.1
## [127] beeswarm_0.4.0 evaluate_1.0.4
## [129] readr_2.1.5 mvtnorm_1.3-3
## [131] cli_3.6.5 locfit_1.5-9.12
## [133] compiler_4.5.1 rlang_1.1.6
## [135] crayon_1.5.3 ggsignif_0.6.4
## [137] labeling_0.4.3 ggbeeswarm_0.7.2
## [139] plyr_1.8.9 fs_1.6.6
## [141] stringi_1.8.7 viridisLite_0.4.2
## [143] BiocParallel_1.43.4 lazyeval_0.2.2
## [145] fontquiver_0.2.1 Matrix_1.7-3
## [147] hms_1.1.3 patchwork_1.3.0
## [149] sparseMatrixStats_1.21.0 Rhdf5lib_1.31.0
## [151] statmod_1.5.0 gridtext_0.1.5
## [153] igraph_2.1.4 broom_1.0.8
## [155] bslib_0.9.0 phangorn_2.12.1
## [157] ggtree_3.17.0 fastmatch_1.1-6
## [159] readxl_1.4.5 officer_0.6.10
## [161] ape_5.8-1
Spike-in volume Protocol;
The species Tetragenococcus halophilus (bacterial spike; ATCC33315) and Dekkera bruxellensis (fungal spike; WLP4642-White Labs) were selected as taxa to spike into gut microbiome samples as they were not found in an extensive collection of wildlife skin (GenBank BioProjects: PRJNA1114724, PRJNA 1114659) or gut microbiome samples. Stock cell suspensions of both microbes were grown in either static tryptic soy broth (T. halophilus) or potato dextrose broth (D. bruxellensis) for 72 hours then serially diluted and optical density (OD600) determined on a ClarioStar plate reader. Cell suspensions with an optical density of 1.0, 0.1, 0.01, 0.001 were DNA extracted using the Qiagen DNeasy Powersoil Pro Kit. These DNA isolations were used as standards to determine the proper spike in volume of cells to represent 0.1-10% of a sample (Rao et al., 2021b) Fecal pellets (3.1 ± 1.6 mg; range = 1 – 5.1 mg) from an ongoing live animal study using wood frogs (Lithobates sylvaticus) were used to standardize the input material for the development of this protocol. A total of (n=9) samples were used to validate the spike in protocol. Each fecal sample was homogenized in 1mL of sterile molecular grade water then 250uL of fecal slurry was DNA extracted as above with and without spiked cells. Two approaches were used to evaluate the target spike-in of 0.1-10%, the range of effective spike-in percentage described in (Rao et al., 2021b), including 1) an expected increase of qPCR cycle threshold (Ct) value that is proportional to the amount of spiked cells and 2) the expected increase in copy number of T. halophilus and D. bruxellensis in spiked vs. unspiked samples. A standard curve was generated using a synthetic fragment of DNA for the 16S-V4 rRNA and ITS1 rDNA regions of T. halophilus and D. bruxellensis, respectively. The standard curve was used to convert Ct values into log copy number for statistical analyses (detailed approach in[2, 3]) using the formula y = -0.2426x + 10.584 for T. halophilus and y = -0.3071x + 10.349 for D. bruxellensis, where x is the average Ct for each unknown sample. Quantitative PCR (qPCR) was used to compare known copy numbers from synthetic DNA sequences of T. halophilus and D. bruxellensis to DNA extractions of T. halophilus and D. bruxellensis independently, and wood frog fecal samples with and without spiked cells. SYBR qPCR assays were run at 20ul total volume including 10ul 2X Quantabio PerfeCTa SYBR Green Fastmix, 1ul of 10uM forward and reverse primers, 1ul of ArcticEnzymes dsDNAse master mix clean up kit, and either 1ul of DNA for D. bruxellensis or 3ul for T. halophilus. Different volumes of DNA were chosen for amplification of bacteria and fungi due to previous optimization of library preparation and sequencing steps [3]. The 515F [4] and 806R [5] primers were chosen to amplify bacteria and ITS1FI2 [6] and ITS2 for fungi, as these are the same primers used during amplicon library preparation and sequencing. Cycling conditions on an Agilent AriaMX consisted of 95 C for 3 mins followed by 40 cycles of 95 C for 15 sec, 60 C for 30 sec and 72 C for 30 sec. Following amplification, a melt curve was generated under the following conditions including 95 C for 30 sec, and a melt from 60 C to 90 C increasing in resolution of 0.5 C in increments of a 5 sec soak time. To validate the spike in protocol we selected two sets of fecal samples including 360 samples from a diverse species pool of frogs, lizards, salamanders and snakes and a more targeted approach of 122 fecal samples from three genera of salamanders from the Plethodontidae. (Supplemental Table #). FFecal samples were not weighed in the field, rather, a complete fecal pellet was diluted in an equal volume of sterile water and standardized volume of fecal slurry (250µL) extracted for independent samples.. A volume of 1ul T. halophilus (1847 copies) and 1ul D. bruxellensis (733 copies) were spiked into each fecal sample then DNA was extracted as above, libraries constructed, and amplicon sequenced on an Illumina MiSeq as in [7] .