Goal: Get the full taxonomic hierarchy for all observed taxa in the larval and egg counts from the NOAA CalCOFI database from the World Register of Marine Species (WoRMS; marinespecies.org).
Code
librarian::shelf( DBI, dplyr, DT, duckdb, fs, glue, here, stringr,quiet = T)url_cc <-"https://file.calcofi.io/calcofi.duckdb"cc_db <-here("data/tmp.duckdb")spp_db <-glue("~/My Drive/projects/msens/data/derived/spp.duckdb")is_ben_laptop <-Sys.info()[["nodename"]] =="Bens-MacBook-Air.local"con_cc <-dbConnect(duckdb(), dbdir = cc_db)res <-dbExecute(con_cc, glue("ATTACH IF NOT EXISTS '{url_cc}' AS calcofi; USE calcofi"))# helper functions ----id2url <-function(id, type ="worms") {# add urls for WoRMS and ITIS, show datatable with linksif (type =="worms") {glue("<a href='https://www.marinespecies.org/aphia.php?p=taxdetails&id={id}' _target='_blank'>{id}</a>") } elseif (type =="itis") {glue("<a href='https://itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value={id}' _target='_blank'>{id}</a>") } else {stop("Unknown type. Use 'worms' or 'itis'.") }}
This ChecklistBank.org WoRMS database was ingested into a duckdb database in the standard DarwinCore format as with other major taxonomic authorities (GBIF, ITIS, IUCN) with the notebook ingest_taxon.qmd. This is only locally available on Ben’s laptop for the moment, but can be recreated by running the ingest_taxon.qmd workflow.
d_species <- d_species |># 1,148 × 8left_join(tbl(con_spp, "itis_vernacular") |>filter( taxonID %in% d_species$itis_id, language =="English") |>group_by(taxonID) |>summarize(# get one common name per taxonIDitis_common_name =first(vernacularName), .groups ="drop") |>collect(),by =join_by(itis_id == taxonID))table(is.na(d_species$itis_common_name))
# Function to get taxon and all parent taxa using recursive CTEget_taxon_parentage <-function(con_spp, scientific_name) {# Method 1: Using dbGetQuery with raw SQL (most efficient) query_sql <-" WITH RECURSIVE taxon_hierarchy AS ( -- Base case: find the initial taxon SELECT taxonID, scientificNameID, acceptedNameUsageID, parentNameUsageID, scientificName, acceptedNameUsage, parentNameUsage, taxonRank, kingdom, phylum, class,\"order\", family, genus, taxonomicStatus, 0 as level FROM worms WHERE scientificName = ? UNION ALL -- Recursive case: find parent taxa SELECT w.taxonID, w.scientificNameID, w.acceptedNameUsageID, w.parentNameUsageID, w.scientificName, w.acceptedNameUsage, w.parentNameUsage, w.taxonRank, w.kingdom, w.phylum, w.class, w.\"order\", w.family, w.genus, w.taxonomicStatus, th.level + 1 as level FROM worms w INNER JOIN taxon_hierarchy th ON w.taxonID = th.parentNameUsageID WHERE w.taxonID IS NOT NULL ) SELECT * FROM taxon_hierarchy ORDER BY level, taxonRank " result <-dbGetQuery(con_spp, query_sql, params =list(scientific_name))return(as_tibble(result))}# Example usage:d_parentage <-get_taxon_parentage(con_spp, "Sebastes atrovirens")d_parentage# To see just the taxonomic path:d_parentage |>select(taxonID, parentNameUsageID, scientificName, taxonRank, level) |>left_join(tbl(con_spp, "worms_vernacular") |>filter( taxonID %in% d_parentage$taxonID, language =="ENG") |>select(taxonID, vernacularName, isPreferredName) |>collect(),by ="taxonID") |>arrange(level, taxonID, desc(isPreferredName))
1.2 Get taxonomic children
Code
# Function to get all children (descendants) of a given taxonget_taxon_children <-function(con_spp, scientific_name) { query_sql <-" WITH RECURSIVE taxon_children AS ( -- Base case: find the parent taxon SELECT taxonID, scientificNameID, acceptedNameUsageID, parentNameUsageID, scientificName, acceptedNameUsage, parentNameUsage, taxonRank, kingdom, phylum, class,\"order\", family, genus, taxonomicStatus, 0 as depth_level FROM worms WHERE scientificName = ? UNION ALL -- Recursive case: find children taxa SELECT w.taxonID, w.scientificNameID, w.acceptedNameUsageID, w.parentNameUsageID, w.scientificName, w.acceptedNameUsage, w.parentNameUsage, w.taxonRank, w.kingdom, w.phylum, w.class, w.\"order\", w.family, w.genus, w.taxonomicStatus, tc.depth_level + 1 as depth_level FROM worms w INNER JOIN taxon_children tc ON w.parentNameUsageID = tc.taxonID WHERE w.parentNameUsageID IS NOT NULL ) SELECT * FROM taxon_children ORDER BY depth_level, CASE taxonRank WHEN 'kingdom' THEN 1 WHEN 'phylum' THEN 2 WHEN 'class' THEN 3 WHEN 'order' THEN 4 WHEN 'family' THEN 5 WHEN 'genus' THEN 6 WHEN 'species' THEN 7 WHEN 'subspecies' THEN 8 WHEN 'variety' THEN 9 WHEN 'form' THEN 10 ELSE 11 END, scientificName " result <-dbGetQuery(con_spp, query_sql, params =list(scientific_name))return(as_tibble(result))}# Get all descendants of a familyd_children <-get_taxon_children(con_spp, "Sebastidae")d_childrentbl(con_spp, "worms") |>pull(taxonRank) |>table()d_children |>select(depth_level, taxonRank) |>table()d_children |>select(taxonID, parentNameUsageID, scientificName, taxonRank, depth_level) |>arrange(depth_level, taxonID)
Code
# Create taxonomic rank lookup tablecreate_taxa_ranks_table <-function(con_spp) { taxa_ranks_chr <-c("Kingdom", "Subkingdom", "Infrakingdom","Superphylum", "Phylum", "Phylum (Division)", "Subphylum", "Subphylum (Subdivision)", "Infraphylum", "Parvphylum","Gigaclass", "Megaclass", "Superclass", "Class", "Subterclass", "Subclass", "Infraclass","Superorder", "Order", "Suborder", "Infraorder", "Parvorder","Section", "Subsection","Superfamily", "Epifamily", "Family", "Subfamily","Supertribe", "Tribe", "Subtribe","Genus", "Subgenus","Series", "Subseries","Species", "Subspecies","Natio", "Mutatio","Forma", "Subforma","Variety", "Subvariety","Coll. sp.", "Aggr.")# Define the taxonomic hierarchy with proper ordering taxa_ranks <-data.frame(taxonRank = ,rank_order =1:length(taxa_ranks_chr),stringsAsFactors =FALSE )# Create the table in DuckDBdbWriteTable(con_spp, "taxa_ranks", taxa_ranks, overwrite =TRUE)return(taxa_ranks)}# Function to get all children (descendants) of a given taxonget_taxon_children <-function(con_spp, scientific_name) { query_sql <-" WITH RECURSIVE taxon_children AS ( -- Base case: find the parent taxon SELECT taxonID, scientificNameID, acceptedNameUsageID, parentNameUsageID, scientificName, acceptedNameUsage, parentNameUsage, taxonRank, kingdom, phylum, class,\"order\", family, genus, taxonomicStatus, 0 as depth_level FROM worms WHERE scientificName = ? UNION ALL -- Recursive case: find children taxa SELECT w.taxonID, w.scientificNameID, w.acceptedNameUsageID, w.parentNameUsageID, w.scientificName, w.acceptedNameUsage, w.parentNameUsage, w.taxonRank, w.kingdom, w.phylum, w.class, w.\"order\", w.family, w.genus, w.taxonomicStatus, tc.depth_level + 1 as depth_level FROM worms w INNER JOIN taxon_children tc ON w.parentNameUsageID = tc.taxonID WHERE w.parentNameUsageID IS NOT NULL ) SELECT tc.*, COALESCE(tr.rank_order, 99) as rank_order FROM taxon_children tc LEFT JOIN taxa_ranks tr ON tc.taxonRank = tr.taxonRank ORDER BY tc.depth_level, COALESCE(tr.rank_order, 99), tc.scientificName " result <-dbGetQuery(con_spp, query_sql, params =list(scientific_name))return(as_tibble(result))}# Function to get only direct children (one level down)get_direct_children <-function(con_spp, scientific_name) { query_sql <-" WITH parent_taxon AS ( SELECT taxonID FROM worms WHERE scientificName = ? ) SELECT w.taxonID, w.scientificNameID, w.acceptedNameUsageID, w.parentNameUsageID, w.scientificName, w.acceptedNameUsage, w.parentNameUsage, w.taxonRank, w.kingdom, w.phylum, w.class, w.\"order\", w.family, w.genus, w.taxonomicStatus, COALESCE(tr.rank_order, 99) as rank_order FROM worms w INNER JOIN parent_taxon pt ON w.parentNameUsageID = pt.taxonID LEFT JOIN taxa_ranks tr ON w.taxonRank = tr.taxonRank ORDER BY COALESCE(tr.rank_order, 99), w.scientificName " result <-dbGetQuery(con_spp, query_sql, params =list(scientific_name))return(as_tibble(result))}# Function to get children up to a specific depth levelget_taxon_children_depth <-function(con_spp, scientific_name, max_depth =NULL) {# Build the depth condition depth_condition <-if (!is.null(max_depth)) {paste0("WHERE depth_level <= ", max_depth) } else {"" } query_sql <-paste0(" WITH RECURSIVE taxon_children AS ( -- Base case: find the parent taxon SELECT taxonID, scientificNameID, acceptedNameUsageID, parentNameUsageID, scientificName, acceptedNameUsage, parentNameUsage, taxonRank, kingdom, phylum, class,\"order\", family, genus, taxonomicStatus, 0 as depth_level FROM worms WHERE scientificName = ? UNION ALL -- Recursive case: find children taxa SELECT w.taxonID, w.scientificNameID, w.acceptedNameUsageID, w.parentNameUsageID, w.scientificName, w.acceptedNameUsage, w.parentNameUsage, w.taxonRank, w.kingdom, w.phylum, w.class, w.\"order\", w.family, w.genus, w.taxonomicStatus, tc.depth_level + 1 as depth_level FROM worms w INNER JOIN taxon_children tc ON w.parentNameUsageID = tc.taxonID WHERE w.parentNameUsageID IS NOT NULL", if (!is.null(max_depth)) paste0(" AND tc.depth_level < ", max_depth) else"", " ) SELECT tc.*, COALESCE(tr.rank_order, 99) as rank_order FROM taxon_children tc LEFT JOIN taxa_ranks tr ON tc.taxonRank = tr.taxonRank ", depth_condition, " ORDER BY tc.depth_level, COALESCE(tr.rank_order, 99), tc.scientificName ") result <-dbGetQuery(con_spp, query_sql, params =list(scientific_name))return(as_tibble(result))}# Function to get children of a specific taxonomic rankget_children_by_rank <-function(con_spp, scientific_name, target_rank) { query_sql <-" WITH RECURSIVE taxon_children AS ( -- Base case: find the parent taxon SELECT taxonID, scientificNameID, acceptedNameUsageID, parentNameUsageID, scientificName, acceptedNameUsage, parentNameUsage, taxonRank, kingdom, phylum, class,\"order\", family, genus, taxonomicStatus, 0 as depth_level FROM worms WHERE scientificName = ? UNION ALL -- Recursive case: find children taxa SELECT w.taxonID, w.scientificNameID, w.acceptedNameUsageID, w.parentNameUsageID, w.scientificName, w.acceptedNameUsage, w.parentNameUsage, w.taxonRank, w.kingdom, w.phylum, w.class, w.\"order\", w.family, w.genus, w.taxonomicStatus, tc.depth_level + 1 as depth_level FROM worms w INNER JOIN taxon_children tc ON w.parentNameUsageID = tc.taxonID WHERE w.parentNameUsageID IS NOT NULL ) SELECT tc.*, COALESCE(tr.rank_order, 99) as rank_order FROM taxon_children tc LEFT JOIN taxa_ranks tr ON tc.taxonRank = tr.taxonRank WHERE tc.taxonRank = ? ORDER BY tc.scientificName " result <-dbGetQuery(con_spp, query_sql, params =list(scientific_name, target_rank))return(as_tibble(result))}# Helper function to summarize children by taxonomic ranksummarize_children_by_rank <-function(children_df) { children_df |>filter(depth_level >0) |># Exclude the parent taxon itselfcount(taxonRank, name ="count") |>left_join(data.frame(taxonRank =c("Kingdom", "Subkingdom", "Infrakingdom","Superphylum", "Phylum", "Phylum (Division)", "Subphylum", "Subphylum (Subdivision)", "Infraphylum", "Parvphylum","Gigaclass", "Megaclass", "Superclass", "Class", "Subterclass", "Subclass", "Infraclass","Superorder", "Order", "Suborder", "Infraorder", "Parvorder","Section", "Subsection","Superfamily", "Epifamily", "Family", "Subfamily","Supertribe", "Tribe", "Subtribe","Genus", "Subgenus","Series", "Subseries","Species", "Subspecies","Natio", "Mutatio","Forma", "Subforma","Variety", "Subvariety","Coll. sp.", "Aggr." ),rank_order =1:45,stringsAsFactors =FALSE ),by ="taxonRank" ) |>arrange(rank_order)}# Example usage:# First, create the taxa_ranks lookup table (run once)# create_taxa_ranks_table(con_spp)# Get all descendants of a family# family_children <- get_taxon_children(con_spp, "Sebastidae")# Get only direct children (genera within the family)# direct_children <- get_direct_children(con_spp, "Sebastidae")# Get children up to 2 levels deep# shallow_children <- get_taxon_children_depth(con_spp, "Sebastidae", max_depth = 2)# Get all species within a genus# species_in_genus <- get_children_by_rank(con_spp, "Sebastes", "Species")# Summarize the taxonomic diversity# children <- get_taxon_children(con_spp, "Sebastidae")# summary <- summarize_children_by_rank(children)# print(summary)
Note that you might need to sometimes restart your R session (under Session menu in RStudio) to connect to the local duckdb.
Source Code
---title: "taxa_worms"editor_options: chunk_output_type: console---## OverviewGoal: Get the full taxonomic hierarchy for all observed taxa in the larvaland egg counts from the NOAA CalCOFI database from the World Register of Marine Species (WoRMS; [marinespecies.org](https://www.marinespecies.org)).```{r setup}librarian::shelf( DBI, dplyr, DT, duckdb, fs, glue, here, stringr,quiet = T)url_cc <-"https://file.calcofi.io/calcofi.duckdb"cc_db <-here("data/tmp.duckdb")spp_db <-glue("~/My Drive/projects/msens/data/derived/spp.duckdb")is_ben_laptop <-Sys.info()[["nodename"]] =="Bens-MacBook-Air.local"con_cc <-dbConnect(duckdb(), dbdir = cc_db)res <-dbExecute(con_cc, glue("ATTACH IF NOT EXISTS '{url_cc}' AS calcofi; USE calcofi"))# helper functions ----id2url <-function(id, type ="worms") {# add urls for WoRMS and ITIS, show datatable with linksif (type =="worms") {glue("<a href='https://www.marinespecies.org/aphia.php?p=taxdetails&id={id}' _target='_blank'>{id}</a>") } elseif (type =="itis") {glue("<a href='https://itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value={id}' _target='_blank'>{id}</a>") } else {stop("Unknown type. Use 'worms' or 'itis'.") }}```## Fetch CalCOFI species```{r d_species}# dbListTables(con_cc)# [1] "cruise" "egg" "grid" "larva" "net" "ship" # [7] "site" "site_seg" "species" "tow" "tow_type"d_species <-tbl(con_cc, "calcofi.species") |>select( sp_id, scientific_name, common_name,itis_id_old = itis_tsn,worms_id_old = aphia_id) |>collect()# https://www.marinespecies.org/aphia.php?p=taxdetails&id=293496# https://itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=168736d_species |>mutate(worms_old_url =id2url(worms_id_old, "worms"),itis_old_url =id2url(itis_id_old, "itis")) |>select(sp_id, common_name, scientific_name, worms_id_old = worms_old_url, itis_id_old = itis_old_url) |>datatable(escape = F, rownames = F)```## Use pre-processed WoRMS databaseThe entire WoRMS database has already been downloaded from [World Register of Marine Species at ChecklistBank.org](https://www.checklistbank.org/dataset/2011/download), which seems much more extensive (2.7 GB) than the one downloadable directly from [marinespecies.org/download](https://www.marinespecies.org/download/) (547.8 MB).This ChecklistBank.org WoRMS database was ingested into a duckdb database inthe standard DarwinCore format as with other major taxonomic authorities(GBIF, ITIS, IUCN) with the notebook [ingest_taxon.qmd](https://github.com/MarineSensitivity/workflows/blob/9fdfdb6ee98148755c50c39e06d3b9b3469363be/ingest_taxon.qmd). This is only locally available on Ben's laptop for the moment, but can be recreated by running the `ingest_taxon.qmd` workflow.```{r con_spp}#| eval: !expr is_ben_laptopcon_spp <-dbConnect(duckdb(dbdir = spp_db, read_only = F))``````{r con_spp_tables}#| eval: falsedbListTables(con_spp)# [1] "gbif" "gbif_vernacular" "iucn_redlist"# [4] "iucn_vernacular" "worms" "worms_vernacular"dbListFields(con_spp, "worms")# [1] "taxonID" "scientificNameID"# [3] "acceptedNameUsageID" "parentNameUsageID"# [5] "namePublishedInID" "scientificName"# [7] "acceptedNameUsage" "parentNameUsage"# [9] "namePublishedIn" "namePublishedInYear"# [11] "kingdom" "phylum"# [13] "class" "order"# [15] "family" "genus"# [17] "subgenus" "specificEpithet"# [19] "infraspecificEpithet" "taxonRank"# [21] "scientificNameAuthorship" "nomenclaturalCode"# [23] "taxonomicStatus" "nomenclaturalStatus"# [25] "modified" "bibliographicCitation"# [27] "references" "license"# [29] "rightsHolder" "datasetName"# [31] "institutionCode" "datasetID"dbListFields(con_spp, "worms_vernacular")# "taxonID" "vernacularName" "source" "language" "isPreferredName"```## Update WoRMS ID```{r worms_id}#| eval: !expr is_ben_laptopd_species <- d_species |># 1,148 × 5left_join(tbl(con_spp, "worms") |>filter(taxonID %in% d_species$worms_id_old) |>select( taxonID,worms_id = acceptedNameUsageID,worms_scientific_name = acceptedNameUsage) |>collect(),by =join_by(worms_id_old == taxonID))d_worms_chgs <- d_species |>filter( (worms_id_old != worms_id) | (scientific_name != worms_scientific_name)) |>select( sp_id, common_name, scientific_name, worms_scientific_name, worms_id_old, worms_id)if (nrow(d_worms_chgs) ==0){message("No changes to WoRMS IDs or scientific_name found.")} else {message("Found ", nrow(d_worms_chgs), " changes to WoRMS IDs.") d_worms_chgs |>mutate(worms_old_url =id2url(worms_id_old, "worms"),worms_url =id2url(worms_id, "worms")) |>select( sp_id, common_name, scientific_name, worms_scientific_name,worms_id_old = worms_old_url, worms_id = worms_url) |>datatable(escape = F, rownames = F)}# https://www.marinespecies.org/aphia.php?p=taxdetails&id=293496```## Update ITIS ID```{r itis_id}#| eval: !expr is_ben_laptopd_species <- d_species |># 1,148 × 5left_join(tbl(con_spp, "itis") |>filter(taxonID %in% d_species$itis_id_old) |>select( taxonID,itis_id = acceptedNameUsageID) |>collect(),by =join_by(itis_id_old == taxonID))d_itis_chgs <- d_species |>filter( (itis_id_old != itis_id)) |>select( sp_id, common_name, scientific_name, itis_id_old, itis_id)if (nrow(d_itis_chgs) ==0){message("No changes to ITIS IDs found.")} else {message("Found ", nrow(d_itis_chgs), " changes to ITIS IDs.") d_itis_chgs |>mutate(itis_old_url =id2url(itis_id_old, "itis"),itis_url =id2url(itis_id, "itis")) |>select( sp_id, common_name, scientific_name,itis_id_old = itis_old_url, itis_id = itis_url) |>datatable(escape = F, rownames = F)}```# Get WoRMS common name```{r worms_common_name}#| eval: !expr is_ben_laptopd_species <- d_species |># 1,148 × 8left_join(tbl(con_spp, "worms_vernacular") |>filter( taxonID %in% d_species$worms_id, language =="ENG") |>select( taxonID,worms_common_name = vernacularName, isPreferredName) |>collect(),by =join_by(worms_id == taxonID),relationship ="many-to-many") |>slice_max(by = scientific_name,order_by = isPreferredName, with_ties = F) |>select(-isPreferredName)table(is.na(d_species$worms_common_name))``````{r itis_common_name}#| eval: !expr is_ben_laptopd_species <- d_species |># 1,148 × 8left_join(tbl(con_spp, "itis_vernacular") |>filter( taxonID %in% d_species$itis_id, language =="English") |>group_by(taxonID) |>summarize(# get one common name per taxonIDitis_common_name =first(vernacularName), .groups ="drop") |>collect(),by =join_by(itis_id == taxonID))table(is.na(d_species$itis_common_name))``````{r eval_f}knitr::opts_chunk$set(eval = F)```## Get parent taxa- [USING KEY in Recursive CTEs – DuckDB](https://duckdb.org/2025/05/23/using-key.html)```{r get_taxon_parentage}# Function to get taxon and all parent taxa using recursive CTEget_taxon_parentage <-function(con_spp, scientific_name) {# Method 1: Using dbGetQuery with raw SQL (most efficient) query_sql <-" WITH RECURSIVE taxon_hierarchy AS ( -- Base case: find the initial taxon SELECT taxonID, scientificNameID, acceptedNameUsageID, parentNameUsageID, scientificName, acceptedNameUsage, parentNameUsage, taxonRank, kingdom, phylum, class,\"order\", family, genus, taxonomicStatus, 0 as level FROM worms WHERE scientificName = ? UNION ALL -- Recursive case: find parent taxa SELECT w.taxonID, w.scientificNameID, w.acceptedNameUsageID, w.parentNameUsageID, w.scientificName, w.acceptedNameUsage, w.parentNameUsage, w.taxonRank, w.kingdom, w.phylum, w.class, w.\"order\", w.family, w.genus, w.taxonomicStatus, th.level + 1 as level FROM worms w INNER JOIN taxon_hierarchy th ON w.taxonID = th.parentNameUsageID WHERE w.taxonID IS NOT NULL ) SELECT * FROM taxon_hierarchy ORDER BY level, taxonRank " result <-dbGetQuery(con_spp, query_sql, params =list(scientific_name))return(as_tibble(result))}# Example usage:d_parentage <-get_taxon_parentage(con_spp, "Sebastes atrovirens")d_parentage# To see just the taxonomic path:d_parentage |>select(taxonID, parentNameUsageID, scientificName, taxonRank, level) |>left_join(tbl(con_spp, "worms_vernacular") |>filter( taxonID %in% d_parentage$taxonID, language =="ENG") |>select(taxonID, vernacularName, isPreferredName) |>collect(),by ="taxonID") |>arrange(level, taxonID, desc(isPreferredName))```## Get taxonomic children```{r taxon_children}# Function to get all children (descendants) of a given taxonget_taxon_children <-function(con_spp, scientific_name) { query_sql <-" WITH RECURSIVE taxon_children AS ( -- Base case: find the parent taxon SELECT taxonID, scientificNameID, acceptedNameUsageID, parentNameUsageID, scientificName, acceptedNameUsage, parentNameUsage, taxonRank, kingdom, phylum, class,\"order\", family, genus, taxonomicStatus, 0 as depth_level FROM worms WHERE scientificName = ? UNION ALL -- Recursive case: find children taxa SELECT w.taxonID, w.scientificNameID, w.acceptedNameUsageID, w.parentNameUsageID, w.scientificName, w.acceptedNameUsage, w.parentNameUsage, w.taxonRank, w.kingdom, w.phylum, w.class, w.\"order\", w.family, w.genus, w.taxonomicStatus, tc.depth_level + 1 as depth_level FROM worms w INNER JOIN taxon_children tc ON w.parentNameUsageID = tc.taxonID WHERE w.parentNameUsageID IS NOT NULL ) SELECT * FROM taxon_children ORDER BY depth_level, CASE taxonRank WHEN 'kingdom' THEN 1 WHEN 'phylum' THEN 2 WHEN 'class' THEN 3 WHEN 'order' THEN 4 WHEN 'family' THEN 5 WHEN 'genus' THEN 6 WHEN 'species' THEN 7 WHEN 'subspecies' THEN 8 WHEN 'variety' THEN 9 WHEN 'form' THEN 10 ELSE 11 END, scientificName " result <-dbGetQuery(con_spp, query_sql, params =list(scientific_name))return(as_tibble(result))}# Get all descendants of a familyd_children <-get_taxon_children(con_spp, "Sebastidae")d_childrentbl(con_spp, "worms") |>pull(taxonRank) |>table()d_children |>select(depth_level, taxonRank) |>table()d_children |>select(taxonID, parentNameUsageID, scientificName, taxonRank, depth_level) |>arrange(depth_level, taxonID)``````{r taxon_children_lookup}#| eval: false# Create taxonomic rank lookup tablecreate_taxa_ranks_table <-function(con_spp) { taxa_ranks_chr <-c("Kingdom", "Subkingdom", "Infrakingdom","Superphylum", "Phylum", "Phylum (Division)", "Subphylum", "Subphylum (Subdivision)", "Infraphylum", "Parvphylum","Gigaclass", "Megaclass", "Superclass", "Class", "Subterclass", "Subclass", "Infraclass","Superorder", "Order", "Suborder", "Infraorder", "Parvorder","Section", "Subsection","Superfamily", "Epifamily", "Family", "Subfamily","Supertribe", "Tribe", "Subtribe","Genus", "Subgenus","Series", "Subseries","Species", "Subspecies","Natio", "Mutatio","Forma", "Subforma","Variety", "Subvariety","Coll. sp.", "Aggr.")# Define the taxonomic hierarchy with proper ordering taxa_ranks <-data.frame(taxonRank = ,rank_order =1:length(taxa_ranks_chr),stringsAsFactors =FALSE )# Create the table in DuckDBdbWriteTable(con_spp, "taxa_ranks", taxa_ranks, overwrite =TRUE)return(taxa_ranks)}# Function to get all children (descendants) of a given taxonget_taxon_children <-function(con_spp, scientific_name) { query_sql <-" WITH RECURSIVE taxon_children AS ( -- Base case: find the parent taxon SELECT taxonID, scientificNameID, acceptedNameUsageID, parentNameUsageID, scientificName, acceptedNameUsage, parentNameUsage, taxonRank, kingdom, phylum, class,\"order\", family, genus, taxonomicStatus, 0 as depth_level FROM worms WHERE scientificName = ? UNION ALL -- Recursive case: find children taxa SELECT w.taxonID, w.scientificNameID, w.acceptedNameUsageID, w.parentNameUsageID, w.scientificName, w.acceptedNameUsage, w.parentNameUsage, w.taxonRank, w.kingdom, w.phylum, w.class, w.\"order\", w.family, w.genus, w.taxonomicStatus, tc.depth_level + 1 as depth_level FROM worms w INNER JOIN taxon_children tc ON w.parentNameUsageID = tc.taxonID WHERE w.parentNameUsageID IS NOT NULL ) SELECT tc.*, COALESCE(tr.rank_order, 99) as rank_order FROM taxon_children tc LEFT JOIN taxa_ranks tr ON tc.taxonRank = tr.taxonRank ORDER BY tc.depth_level, COALESCE(tr.rank_order, 99), tc.scientificName " result <-dbGetQuery(con_spp, query_sql, params =list(scientific_name))return(as_tibble(result))}# Function to get only direct children (one level down)get_direct_children <-function(con_spp, scientific_name) { query_sql <-" WITH parent_taxon AS ( SELECT taxonID FROM worms WHERE scientificName = ? ) SELECT w.taxonID, w.scientificNameID, w.acceptedNameUsageID, w.parentNameUsageID, w.scientificName, w.acceptedNameUsage, w.parentNameUsage, w.taxonRank, w.kingdom, w.phylum, w.class, w.\"order\", w.family, w.genus, w.taxonomicStatus, COALESCE(tr.rank_order, 99) as rank_order FROM worms w INNER JOIN parent_taxon pt ON w.parentNameUsageID = pt.taxonID LEFT JOIN taxa_ranks tr ON w.taxonRank = tr.taxonRank ORDER BY COALESCE(tr.rank_order, 99), w.scientificName " result <-dbGetQuery(con_spp, query_sql, params =list(scientific_name))return(as_tibble(result))}# Function to get children up to a specific depth levelget_taxon_children_depth <-function(con_spp, scientific_name, max_depth =NULL) {# Build the depth condition depth_condition <-if (!is.null(max_depth)) {paste0("WHERE depth_level <= ", max_depth) } else {"" } query_sql <-paste0(" WITH RECURSIVE taxon_children AS ( -- Base case: find the parent taxon SELECT taxonID, scientificNameID, acceptedNameUsageID, parentNameUsageID, scientificName, acceptedNameUsage, parentNameUsage, taxonRank, kingdom, phylum, class,\"order\", family, genus, taxonomicStatus, 0 as depth_level FROM worms WHERE scientificName = ? UNION ALL -- Recursive case: find children taxa SELECT w.taxonID, w.scientificNameID, w.acceptedNameUsageID, w.parentNameUsageID, w.scientificName, w.acceptedNameUsage, w.parentNameUsage, w.taxonRank, w.kingdom, w.phylum, w.class, w.\"order\", w.family, w.genus, w.taxonomicStatus, tc.depth_level + 1 as depth_level FROM worms w INNER JOIN taxon_children tc ON w.parentNameUsageID = tc.taxonID WHERE w.parentNameUsageID IS NOT NULL", if (!is.null(max_depth)) paste0(" AND tc.depth_level < ", max_depth) else"", " ) SELECT tc.*, COALESCE(tr.rank_order, 99) as rank_order FROM taxon_children tc LEFT JOIN taxa_ranks tr ON tc.taxonRank = tr.taxonRank ", depth_condition, " ORDER BY tc.depth_level, COALESCE(tr.rank_order, 99), tc.scientificName ") result <-dbGetQuery(con_spp, query_sql, params =list(scientific_name))return(as_tibble(result))}# Function to get children of a specific taxonomic rankget_children_by_rank <-function(con_spp, scientific_name, target_rank) { query_sql <-" WITH RECURSIVE taxon_children AS ( -- Base case: find the parent taxon SELECT taxonID, scientificNameID, acceptedNameUsageID, parentNameUsageID, scientificName, acceptedNameUsage, parentNameUsage, taxonRank, kingdom, phylum, class,\"order\", family, genus, taxonomicStatus, 0 as depth_level FROM worms WHERE scientificName = ? UNION ALL -- Recursive case: find children taxa SELECT w.taxonID, w.scientificNameID, w.acceptedNameUsageID, w.parentNameUsageID, w.scientificName, w.acceptedNameUsage, w.parentNameUsage, w.taxonRank, w.kingdom, w.phylum, w.class, w.\"order\", w.family, w.genus, w.taxonomicStatus, tc.depth_level + 1 as depth_level FROM worms w INNER JOIN taxon_children tc ON w.parentNameUsageID = tc.taxonID WHERE w.parentNameUsageID IS NOT NULL ) SELECT tc.*, COALESCE(tr.rank_order, 99) as rank_order FROM taxon_children tc LEFT JOIN taxa_ranks tr ON tc.taxonRank = tr.taxonRank WHERE tc.taxonRank = ? ORDER BY tc.scientificName " result <-dbGetQuery(con_spp, query_sql, params =list(scientific_name, target_rank))return(as_tibble(result))}# Helper function to summarize children by taxonomic ranksummarize_children_by_rank <-function(children_df) { children_df |>filter(depth_level >0) |># Exclude the parent taxon itselfcount(taxonRank, name ="count") |>left_join(data.frame(taxonRank =c("Kingdom", "Subkingdom", "Infrakingdom","Superphylum", "Phylum", "Phylum (Division)", "Subphylum", "Subphylum (Subdivision)", "Infraphylum", "Parvphylum","Gigaclass", "Megaclass", "Superclass", "Class", "Subterclass", "Subclass", "Infraclass","Superorder", "Order", "Suborder", "Infraorder", "Parvorder","Section", "Subsection","Superfamily", "Epifamily", "Family", "Subfamily","Supertribe", "Tribe", "Subtribe","Genus", "Subgenus","Series", "Subseries","Species", "Subspecies","Natio", "Mutatio","Forma", "Subforma","Variety", "Subvariety","Coll. sp.", "Aggr." ),rank_order =1:45,stringsAsFactors =FALSE ),by ="taxonRank" ) |>arrange(rank_order)}# Example usage:# First, create the taxa_ranks lookup table (run once)# create_taxa_ranks_table(con_spp)# Get all descendants of a family# family_children <- get_taxon_children(con_spp, "Sebastidae")# Get only direct children (genera within the family)# direct_children <- get_direct_children(con_spp, "Sebastidae")# Get children up to 2 levels deep# shallow_children <- get_taxon_children_depth(con_spp, "Sebastidae", max_depth = 2)# Get all species within a genus# species_in_genus <- get_children_by_rank(con_spp, "Sebastes", "Species")# Summarize the taxonomic diversity# children <- get_taxon_children(con_spp, "Sebastidae")# summary <- summarize_children_by_rank(children)# print(summary)```## Close db connection(s)```{r cleanup_dbs}dbDisconnect(con_cc, shutdown = T); duckdb_shutdown(duckdb()); rm(con_cc)if (is_ben_laptop)dbDisconnect(con_spp, shutdown = T); duckdb_shutdown(duckdb()); rm(con_spp)```Note that you might need to sometimes restart your R session (under Session menu in RStudio) to connect to the local duckdb.