Published

2025-08-01

0.1 Overview

Goal: Get the full taxonomic hierarchy for all observed taxa in the larval and egg counts from the NOAA CalCOFI database from the World Register of Marine Species (WoRMS; marinespecies.org).

Code
librarian::shelf(
  DBI, dplyr, DT, duckdb, fs, glue, here, stringr,
  quiet = T)

url_cc <- "https://file.calcofi.io/calcofi.duckdb"
cc_db  <- here("data/tmp.duckdb")
spp_db <- glue("~/My Drive/projects/msens/data/derived/spp.duckdb")

is_ben_laptop <- Sys.info()[["nodename"]] == "Bens-MacBook-Air.local"

con_cc <- dbConnect(duckdb(), dbdir = cc_db)
res <- dbExecute(con_cc, glue("ATTACH IF NOT EXISTS '{url_cc}' AS calcofi; USE calcofi"))

# helper functions ----

id2url <- function(id, type = "worms") {
  # add urls for WoRMS and ITIS, show datatable with links
  if (type == "worms") {
    glue("<a href='https://www.marinespecies.org/aphia.php?p=taxdetails&id={id}'
        _target='_blank'>{id}</a>")
  } else if (type == "itis") {
    glue("<a href='https://itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value={id}'
        _target='_blank'>{id}</a>")
  } else {
    stop("Unknown type. Use 'worms' or 'itis'.")
  }
}

0.2 Fetch CalCOFI species

Code
# dbListTables(con_cc)
# [1] "cruise"   "egg"      "grid"     "larva"    "net"      "ship"    
# [7] "site"     "site_seg" "species"  "tow"      "tow_type"

d_species <- tbl(con_cc, "calcofi.species") |> 
  select(
    sp_id,
    scientific_name,
    common_name,
    itis_id_old  = itis_tsn,
    worms_id_old = aphia_id) |> 
  collect()

# https://www.marinespecies.org/aphia.php?p=taxdetails&id=293496
# https://itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=168736

d_species |> 
  mutate(
    worms_old_url = id2url(worms_id_old, "worms"),
    itis_old_url  = id2url(itis_id_old,  "itis")) |> 
  select(sp_id, common_name, scientific_name, worms_id_old = worms_old_url, itis_id_old = itis_old_url) |> 
  datatable(escape = F, rownames = F)

0.3 Use pre-processed WoRMS database

The entire WoRMS database has already been downloaded from World Register of Marine Species at ChecklistBank.org, which seems much more extensive (2.7 GB) than the one downloadable directly from marinespecies.org/download (547.8 MB).

This ChecklistBank.org WoRMS database was ingested into a duckdb database in the standard DarwinCore format as with other major taxonomic authorities (GBIF, ITIS, IUCN) with the notebook ingest_taxon.qmd. This is only locally available on Ben’s laptop for the moment, but can be recreated by running the ingest_taxon.qmd workflow.

Code
con_spp <- dbConnect(duckdb(dbdir = spp_db, read_only = F))
Code
dbListTables(con_spp)
# [1] "gbif"             "gbif_vernacular"  "iucn_redlist"
# [4] "iucn_vernacular"  "worms"            "worms_vernacular"

dbListFields(con_spp, "worms")
#  [1] "taxonID"                  "scientificNameID"
#  [3] "acceptedNameUsageID"      "parentNameUsageID"
#  [5] "namePublishedInID"        "scientificName"
#  [7] "acceptedNameUsage"        "parentNameUsage"
#  [9] "namePublishedIn"          "namePublishedInYear"
# [11] "kingdom"                  "phylum"
# [13] "class"                    "order"
# [15] "family"                   "genus"
# [17] "subgenus"                 "specificEpithet"
# [19] "infraspecificEpithet"     "taxonRank"
# [21] "scientificNameAuthorship" "nomenclaturalCode"
# [23] "taxonomicStatus"          "nomenclaturalStatus"
# [25] "modified"                 "bibliographicCitation"
# [27] "references"               "license"
# [29] "rightsHolder"             "datasetName"
# [31] "institutionCode"          "datasetID"

dbListFields(con_spp, "worms_vernacular")
# "taxonID"  "vernacularName"  "source"  "language"  "isPreferredName"

0.4 Update WoRMS ID

Code
d_species <- d_species |> # 1,148 × 5
  left_join(
    tbl(con_spp, "worms") |>
      filter(taxonID %in% d_species$worms_id_old) |> 
      select(
        taxonID,
        worms_id               = acceptedNameUsageID,
        worms_scientific_name  = acceptedNameUsage) |>
      collect(),
    by = join_by(worms_id_old == taxonID))

d_worms_chgs <- d_species |> 
  filter(
    (worms_id_old != worms_id) | 
      (scientific_name != worms_scientific_name)) |> 
  select(
    sp_id, common_name, 
    scientific_name, worms_scientific_name, 
    worms_id_old, worms_id)

if (nrow(d_worms_chgs) == 0){
  message("No changes to WoRMS IDs or scientific_name found.")
} else {
  message("Found ", nrow(d_worms_chgs), " changes to WoRMS IDs.")
  d_worms_chgs |> 
    mutate(
      worms_old_url = id2url(worms_id_old, "worms"),
      worms_url     = id2url(worms_id,     "worms")) |> 
  select(
    sp_id, common_name, 
    scientific_name, worms_scientific_name,
    worms_id_old = worms_old_url, worms_id = worms_url) |> 
    datatable(escape = F, rownames = F)
}
Code
# https://www.marinespecies.org/aphia.php?p=taxdetails&id=293496

0.5 Update ITIS ID

Code
d_species <- d_species |> # 1,148 × 5
  left_join(
    tbl(con_spp, "itis") |>
      filter(taxonID %in% d_species$itis_id_old) |> 
      select(
        taxonID,
        itis_id               = acceptedNameUsageID) |>
      collect(),
    by = join_by(itis_id_old == taxonID))

d_itis_chgs <- d_species |> 
  filter(
    (itis_id_old != itis_id)) |> 
  select(
    sp_id, common_name, scientific_name,
    itis_id_old, itis_id)

if (nrow(d_itis_chgs) == 0){
  message("No changes to ITIS IDs found.")
} else {
  message("Found ", nrow(d_itis_chgs), " changes to ITIS IDs.")
  d_itis_chgs |> 
    mutate(
      itis_old_url = id2url(itis_id_old, "itis"),
      itis_url     = id2url(itis_id,     "itis")) |> 
    select(
      sp_id, common_name, 
      scientific_name,
      itis_id_old = itis_old_url, itis_id = itis_url) |> 
    datatable(escape = F, rownames = F)
}

1 Get WoRMS common name

Code
d_species <- d_species |> # 1,148 × 8
  left_join(
    tbl(con_spp, "worms_vernacular") |>
      filter(
        taxonID %in% d_species$worms_id,
        language == "ENG") |> 
      select(
        taxonID,
        worms_common_name = vernacularName,
        isPreferredName) |>
      collect(),
    by           = join_by(worms_id == taxonID),
    relationship = "many-to-many") |> 
  slice_max(
    by        = scientific_name,
    order_by  = isPreferredName, 
    with_ties = F) |> 
  select(-isPreferredName)

table(is.na(d_species$worms_common_name))

FALSE  TRUE 
  742   406 
Code
d_species <- d_species |> # 1,148 × 8
  left_join(
    tbl(con_spp, "itis_vernacular") |>
      filter(
        taxonID %in% d_species$itis_id,
        language == "English") |> 
      group_by(taxonID) |> 
      summarize(
        # get one common name per taxonID
        itis_common_name = first(vernacularName), 
        .groups = "drop") |>
      collect(),
    by = join_by(itis_id == taxonID))
table(is.na(d_species$itis_common_name))

FALSE  TRUE 
    1  1147 
Code
knitr::opts_chunk$set(eval = F)

1.1 Get parent taxa

Code
# Function to get taxon and all parent taxa using recursive CTE
get_taxon_parentage <- function(con_spp, scientific_name) {
  
  # Method 1: Using dbGetQuery with raw SQL (most efficient)
  query_sql <- "
    WITH RECURSIVE taxon_hierarchy AS (
      -- Base case: find the initial taxon
      SELECT 
        taxonID,
        scientificNameID,
        acceptedNameUsageID,
        parentNameUsageID,
        scientificName,
        acceptedNameUsage,
        parentNameUsage,
        taxonRank,
        kingdom,
        phylum,
        class,
        \"order\",
        family,
        genus,
        taxonomicStatus,
        0 as level
      FROM worms 
      WHERE scientificName = ?
      
      UNION ALL
      
      -- Recursive case: find parent taxa
      SELECT 
        w.taxonID,
        w.scientificNameID,
        w.acceptedNameUsageID,
        w.parentNameUsageID,
        w.scientificName,
        w.acceptedNameUsage,
        w.parentNameUsage,
        w.taxonRank,
        w.kingdom,
        w.phylum,
        w.class,
        w.\"order\",
        w.family,
        w.genus,
        w.taxonomicStatus,
        th.level + 1 as level
      FROM worms w
      INNER JOIN taxon_hierarchy th ON w.taxonID = th.parentNameUsageID
      WHERE w.taxonID IS NOT NULL
    )
    SELECT * FROM taxon_hierarchy
    ORDER BY level, taxonRank
  "
  
  result <- dbGetQuery(con_spp, query_sql, params = list(scientific_name))
  return(as_tibble(result))
}

# Example usage:
d_parentage <- get_taxon_parentage(con_spp, "Sebastes atrovirens")
d_parentage

# To see just the taxonomic path:
d_parentage |>
  select(taxonID, parentNameUsageID, scientificName, taxonRank, level) |> 
  left_join(
    tbl(con_spp, "worms_vernacular") |> 
      filter(
        taxonID %in% d_parentage$taxonID,
        language == "ENG") |> 
      select(taxonID, vernacularName, isPreferredName) |>
      collect(),
      by = "taxonID") |> 
  arrange(level, taxonID, desc(isPreferredName))

1.2 Get taxonomic children

Code
# Function to get all children (descendants) of a given taxon
get_taxon_children <- function(con_spp, scientific_name) {
  
  query_sql <- "
    WITH RECURSIVE taxon_children AS (
      -- Base case: find the parent taxon
      SELECT 
        taxonID,
        scientificNameID,
        acceptedNameUsageID,
        parentNameUsageID,
        scientificName,
        acceptedNameUsage,
        parentNameUsage,
        taxonRank,
        kingdom,
        phylum,
        class,
        \"order\",
        family,
        genus,
        taxonomicStatus,
        0 as depth_level
      FROM worms 
      WHERE scientificName = ?
      
      UNION ALL
      
      -- Recursive case: find children taxa
      SELECT 
        w.taxonID,
        w.scientificNameID,
        w.acceptedNameUsageID,
        w.parentNameUsageID,
        w.scientificName,
        w.acceptedNameUsage,
        w.parentNameUsage,
        w.taxonRank,
        w.kingdom,
        w.phylum,
        w.class,
        w.\"order\",
        w.family,
        w.genus,
        w.taxonomicStatus,
        tc.depth_level + 1 as depth_level
      FROM worms w
      INNER JOIN taxon_children tc ON w.parentNameUsageID = tc.taxonID
      WHERE w.parentNameUsageID IS NOT NULL
    )
    SELECT * FROM taxon_children
    ORDER BY depth_level, 
             CASE taxonRank 
               WHEN 'kingdom' THEN 1
               WHEN 'phylum' THEN 2  
               WHEN 'class' THEN 3
               WHEN 'order' THEN 4
               WHEN 'family' THEN 5
               WHEN 'genus' THEN 6
               WHEN 'species' THEN 7
               WHEN 'subspecies' THEN 8
               WHEN 'variety' THEN 9
               WHEN 'form' THEN 10
               ELSE 11
             END,
             scientificName
  "
  
  result <- dbGetQuery(con_spp, query_sql, params = list(scientific_name))
  return(as_tibble(result))
}

# Get all descendants of a family
d_children <- get_taxon_children(con_spp, "Sebastidae")
d_children

tbl(con_spp, "worms") |> 
  pull(taxonRank) |> 
  table()

d_children |> 
  select(depth_level, taxonRank) |> 
  table()

d_children |>
  select(taxonID, parentNameUsageID, scientificName, taxonRank, depth_level) |> 
  arrange(depth_level, taxonID)
Code
# Create taxonomic rank lookup table
create_taxa_ranks_table <- function(con_spp) {
  
  taxa_ranks_chr <- c(
      "Kingdom", "Subkingdom", "Infrakingdom",
      "Superphylum", "Phylum", "Phylum (Division)", "Subphylum", 
      "Subphylum (Subdivision)", "Infraphylum", "Parvphylum",
      "Gigaclass", "Megaclass", "Superclass", "Class", "Subterclass", 
      "Subclass", "Infraclass",
      "Superorder", "Order", "Suborder", "Infraorder", "Parvorder",
      "Section", "Subsection",
      "Superfamily", "Epifamily", "Family", "Subfamily",
      "Supertribe", "Tribe", "Subtribe",
      "Genus", "Subgenus",
      "Series", "Subseries",
      "Species", "Subspecies",
      "Natio", "Mutatio",
      "Forma", "Subforma",
      "Variety", "Subvariety",
      "Coll. sp.", "Aggr.")
  
  # Define the taxonomic hierarchy with proper ordering
  taxa_ranks <- data.frame(
    taxonRank = ,
    rank_order = 1:length(taxa_ranks_chr),
    stringsAsFactors = FALSE
  )
  
  # Create the table in DuckDB
  dbWriteTable(con_spp, "taxa_ranks", taxa_ranks, overwrite = TRUE)
  
  return(taxa_ranks)
}

# Function to get all children (descendants) of a given taxon
get_taxon_children <- function(con_spp, scientific_name) {
  
  query_sql <- "
    WITH RECURSIVE taxon_children AS (
      -- Base case: find the parent taxon
      SELECT 
        taxonID,
        scientificNameID,
        acceptedNameUsageID,
        parentNameUsageID,
        scientificName,
        acceptedNameUsage,
        parentNameUsage,
        taxonRank,
        kingdom,
        phylum,
        class,
        \"order\",
        family,
        genus,
        taxonomicStatus,
        0 as depth_level
      FROM worms 
      WHERE scientificName = ?
      
      UNION ALL
      
      -- Recursive case: find children taxa
      SELECT 
        w.taxonID,
        w.scientificNameID,
        w.acceptedNameUsageID,
        w.parentNameUsageID,
        w.scientificName,
        w.acceptedNameUsage,
        w.parentNameUsage,
        w.taxonRank,
        w.kingdom,
        w.phylum,
        w.class,
        w.\"order\",
        w.family,
        w.genus,
        w.taxonomicStatus,
        tc.depth_level + 1 as depth_level
      FROM worms w
      INNER JOIN taxon_children tc ON w.parentNameUsageID = tc.taxonID
      WHERE w.parentNameUsageID IS NOT NULL
    )
    SELECT tc.*, COALESCE(tr.rank_order, 99) as rank_order
    FROM taxon_children tc
    LEFT JOIN taxa_ranks tr ON tc.taxonRank = tr.taxonRank
    ORDER BY tc.depth_level, COALESCE(tr.rank_order, 99), tc.scientificName
  "
  
  result <- dbGetQuery(con_spp, query_sql, params = list(scientific_name))
  return(as_tibble(result))
}

# Function to get only direct children (one level down)
get_direct_children <- function(con_spp, scientific_name) {
  
  query_sql <- "
    WITH parent_taxon AS (
      SELECT taxonID 
      FROM worms 
      WHERE scientificName = ?
    )
    SELECT 
      w.taxonID,
      w.scientificNameID,
      w.acceptedNameUsageID,
      w.parentNameUsageID,
      w.scientificName,
      w.acceptedNameUsage,
      w.parentNameUsage,
      w.taxonRank,
      w.kingdom,
      w.phylum,
      w.class,
      w.\"order\",
      w.family,
      w.genus,
      w.taxonomicStatus,
      COALESCE(tr.rank_order, 99) as rank_order
    FROM worms w
    INNER JOIN parent_taxon pt ON w.parentNameUsageID = pt.taxonID
    LEFT JOIN taxa_ranks tr ON w.taxonRank = tr.taxonRank
    ORDER BY COALESCE(tr.rank_order, 99), w.scientificName
  "
  
  result <- dbGetQuery(con_spp, query_sql, params = list(scientific_name))
  return(as_tibble(result))
}

# Function to get children up to a specific depth level
get_taxon_children_depth <- function(con_spp, scientific_name, max_depth = NULL) {
  
  # Build the depth condition
  depth_condition <- if (!is.null(max_depth)) {
    paste0("WHERE depth_level <= ", max_depth)
  } else {
    ""
  }
  
  query_sql <- paste0("
    WITH RECURSIVE taxon_children AS (
      -- Base case: find the parent taxon
      SELECT 
        taxonID,
        scientificNameID,
        acceptedNameUsageID,
        parentNameUsageID,
        scientificName,
        acceptedNameUsage,
        parentNameUsage,
        taxonRank,
        kingdom,
        phylum,
        class,
        \"order\",
        family,
        genus,
        taxonomicStatus,
        0 as depth_level
      FROM worms 
      WHERE scientificName = ?
      
      UNION ALL
      
      -- Recursive case: find children taxa
      SELECT 
        w.taxonID,
        w.scientificNameID,
        w.acceptedNameUsageID,
        w.parentNameUsageID,
        w.scientificName,
        w.acceptedNameUsage,
        w.parentNameUsage,
        w.taxonRank,
        w.kingdom,
        w.phylum,
        w.class,
        w.\"order\",
        w.family,
        w.genus,
        w.taxonomicStatus,
        tc.depth_level + 1 as depth_level
      FROM worms w
      INNER JOIN taxon_children tc ON w.parentNameUsageID = tc.taxonID
      WHERE w.parentNameUsageID IS NOT NULL", 
      if (!is.null(max_depth)) paste0(" AND tc.depth_level < ", max_depth) else "", "
    )
    SELECT tc.*, COALESCE(tr.rank_order, 99) as rank_order
    FROM taxon_children tc
    LEFT JOIN taxa_ranks tr ON tc.taxonRank = tr.taxonRank
    ", depth_condition, "
    ORDER BY tc.depth_level, COALESCE(tr.rank_order, 99), tc.scientificName
  ")
  
  result <- dbGetQuery(con_spp, query_sql, params = list(scientific_name))
  return(as_tibble(result))
}

# Function to get children of a specific taxonomic rank
get_children_by_rank <- function(con_spp, scientific_name, target_rank) {
  
  query_sql <- "
    WITH RECURSIVE taxon_children AS (
      -- Base case: find the parent taxon
      SELECT 
        taxonID,
        scientificNameID,
        acceptedNameUsageID,
        parentNameUsageID,
        scientificName,
        acceptedNameUsage,
        parentNameUsage,
        taxonRank,
        kingdom,
        phylum,
        class,
        \"order\",
        family,
        genus,
        taxonomicStatus,
        0 as depth_level
      FROM worms 
      WHERE scientificName = ?
      
      UNION ALL
      
      -- Recursive case: find children taxa
      SELECT 
        w.taxonID,
        w.scientificNameID,
        w.acceptedNameUsageID,
        w.parentNameUsageID,
        w.scientificName,
        w.acceptedNameUsage,
        w.parentNameUsage,
        w.taxonRank,
        w.kingdom,
        w.phylum,
        w.class,
        w.\"order\",
        w.family,
        w.genus,
        w.taxonomicStatus,
        tc.depth_level + 1 as depth_level
      FROM worms w
      INNER JOIN taxon_children tc ON w.parentNameUsageID = tc.taxonID
      WHERE w.parentNameUsageID IS NOT NULL
    )
    SELECT tc.*, COALESCE(tr.rank_order, 99) as rank_order
    FROM taxon_children tc
    LEFT JOIN taxa_ranks tr ON tc.taxonRank = tr.taxonRank
    WHERE tc.taxonRank = ?
    ORDER BY tc.scientificName
  "
  
  result <- dbGetQuery(con_spp, query_sql, params = list(scientific_name, target_rank))
  return(as_tibble(result))
}

# Helper function to summarize children by taxonomic rank
summarize_children_by_rank <- function(children_df) {
  children_df |>
    filter(depth_level > 0) |>  # Exclude the parent taxon itself
    count(taxonRank, name = "count") |>
    left_join(
      data.frame(
        taxonRank = c(
          "Kingdom", "Subkingdom", "Infrakingdom",
          "Superphylum", "Phylum", "Phylum (Division)", "Subphylum", 
          "Subphylum (Subdivision)", "Infraphylum", "Parvphylum",
          "Gigaclass", "Megaclass", "Superclass", "Class", "Subterclass", 
          "Subclass", "Infraclass",
          "Superorder", "Order", "Suborder", "Infraorder", "Parvorder",
          "Section", "Subsection",
          "Superfamily", "Epifamily", "Family", "Subfamily",
          "Supertribe", "Tribe", "Subtribe",
          "Genus", "Subgenus",
          "Series", "Subseries",
          "Species", "Subspecies",
          "Natio", "Mutatio",
          "Forma", "Subforma",
          "Variety", "Subvariety",
          "Coll. sp.", "Aggr."
        ),
        rank_order = 1:45,
        stringsAsFactors = FALSE
      ),
      by = "taxonRank"
    ) |>
    arrange(rank_order)
}

# Example usage:

# First, create the taxa_ranks lookup table (run once)
# create_taxa_ranks_table(con_spp)

# Get all descendants of a family
# family_children <- get_taxon_children(con_spp, "Sebastidae")

# Get only direct children (genera within the family)
# direct_children <- get_direct_children(con_spp, "Sebastidae")

# Get children up to 2 levels deep
# shallow_children <- get_taxon_children_depth(con_spp, "Sebastidae", max_depth = 2)

# Get all species within a genus
# species_in_genus <- get_children_by_rank(con_spp, "Sebastes", "Species")

# Summarize the taxonomic diversity
# children <- get_taxon_children(con_spp, "Sebastidae")
# summary <- summarize_children_by_rank(children)
# print(summary)

1.3 Close db connection(s)

Code
dbDisconnect(con_cc, shutdown = T); duckdb_shutdown(duckdb()); rm(con_cc)

if (is_ben_laptop)
  dbDisconnect(con_spp, shutdown = T); duckdb_shutdown(duckdb()); rm(con_spp)

Note that you might need to sometimes restart your R session (under Session menu in RStudio) to connect to the local duckdb.