Merge Ichthyo & Bottle to Working DuckLake

Published

2026-02-05

Overview

Goal: Merge parquet outputs from both upstream ingest workflows, perform cross-dataset reconciliation (ship matching, cruise bridge, taxonomy standardization), then load into Working DuckLake and create a frozen release.

Steps:

  1. Load parquet files from both upstream workflows into local temp DuckDB
  2. Fuzzy match ships between datasets
  3. Build cruise bridge (casts → cruise_key)
  4. Standardize taxonomy (species → WoRMS/ITIS/GBIF)
  5. Validate cross-dataset integrity
  6. Write modified parquet outputs + manifest with GCS references
  7. Ingest all tables into Working DuckLake with provenance
  8. Freeze release (strip provenance)
  9. Upload to GCS

Upstream workflows (must complete first):

  • ingest_swfsc.noaa.gov_calcofi-db.qmd → ichthyo tables (10)
  • ingest_calcofi.org_bottle-database.qmd → bottle/cast tables (5)
Code
flowchart LR
    %% data flow: upstream parquet outputs feed a local merge DuckDB,
    %% whose reconciled tables land in the Working DuckLake and are
    %% then frozen into an immutable public release
    subgraph upstream["Upstream Parquet"]
        s["swfsc<br/>10 tables"]
        b["bottle<br/>5 tables"]
    end
    subgraph merge["Merge DuckDB"]
        m1["Ship matching"]
        m2["Cruise bridge"]
        m3["Taxonomy"]
    end
    subgraph output["Output"]
        p["Modified parquet<br/>+ manifest"]
        w["Working DuckLake"]
        f["Frozen Release"]
    end
    s --> merge
    b --> merge
    merge --> p --> w --> f

    %% subgraph colors: blue = inputs, orange = processing, green = outputs
    style upstream fill:#e3f2fd,stroke:#1565c0
    style merge fill:#fff3e0,stroke:#ef6c00
    style output fill:#e8f4e8,stroke:#2e7d32
flowchart LR
    %% NOTE(review): byte-identical to the diagram immediately above —
    %% looks like a code-echo + render duplicate from the .qmd source;
    %% consider `echo: false` on the chunk to render it only once
    subgraph upstream["Upstream Parquet"]
        s["swfsc<br/>10 tables"]
        b["bottle<br/>5 tables"]
    end
    subgraph merge["Merge DuckDB"]
        m1["Ship matching"]
        m2["Cruise bridge"]
        m3["Taxonomy"]
    end
    subgraph output["Output"]
        p["Modified parquet<br/>+ manifest"]
        w["Working DuckLake"]
        f["Frozen Release"]
    end
    s --> merge
    b --> merge
    merge --> p --> w --> f

    style upstream fill:#e3f2fd,stroke:#1565c0
    style merge fill:#fff3e0,stroke:#ef6c00
    style output fill:#e8f4e8,stroke:#2e7d32
Figure 1: Integration workflow: upstream parquet → merge → Working DuckLake → Frozen Release

0.1 Setup

Code
# load local dev versions of the CalCOFI packages before attaching deps
devtools::load_all(here::here("../calcofi4db"))
devtools::load_all(here::here("../calcofi4R"))
librarian::shelf(
  CalCOFI/calcofi4db, DBI, dm, dplyr, DT, fs, glue, here,
  jsonlite, purrr, readr, stringr, tibble, tidyr,
  worrms, taxize,
  quiet = TRUE)  # TRUE, not T: T/F are ordinary (reassignable) variables
options(readr.show_col_types = FALSE)
options(DT.options = list(scrollX = TRUE))

# parquet input directories (from upstream workflows)
dir_pq_ichthyo <- here("data/parquet/swfsc.noaa.gov_calcofi-db")
dir_pq_bottle  <- here("data/parquet/calcofi.org_bottle-database")

# release version, e.g. "v2026.02"
release_version <- format(Sys.Date(), "v%Y.%m")

# local merge database
db_path   <- here("data/wrangling/merge_ichthyo_bottle.duckdb")
overwrite <- FALSE  # set TRUE to rebuild from scratch (takes ~1hr for taxonomy APIs)
if (overwrite && file_exists(db_path)) file_delete(db_path)
dir_create(dirname(db_path))
con <- get_duckdb_con(db_path)
load_duckdb_extension(con, "spatial")

# check if database already has tables (from previous run)
existing_tables <- DBI::dbListTables(con)
skip_load <- length(existing_tables) > 0
if (skip_load) {
  message(glue("Using existing database with {length(existing_tables)} tables"))
}

0.2 Check Upstream Workflows

Verify that parquet outputs and manifests exist from both upstream workflows.

Code
# helper: assert an upstream manifest exists, read it, and report stats;
# errors with the same guidance message as before when the file is absent
read_manifest_checked <- function(manifest_path, label, upstream_qmd) {
  if (!file.exists(manifest_path)) {
    stop(glue(
      "Missing {label} parquet outputs. Run {upstream_qmd} first."),
      call. = FALSE)
  }
  m <- read_json(manifest_path)
  message(glue(
    "{str_to_title(label)} parquet: {m$total_rows} total rows, ",
    "created {m$created_at}"))
  m
}

ichthyo_manifest_path <- file.path(dir_pq_ichthyo, "manifest.json")
ichthyo_manifest <- read_manifest_checked(
  ichthyo_manifest_path, "ichthyo", "ingest_swfsc.noaa.gov_calcofi-db.qmd")

bottle_manifest_path <- file.path(dir_pq_bottle, "manifest.json")
bottle_manifest <- read_manifest_checked(
  bottle_manifest_path, "bottle", "ingest_calcofi.org_bottle-database.qmd")

# side-by-side manifest stats for the two upstream workflows
tibble(
  source  = c("ichthyo", "bottle"),
  tables  = c(
    length(ichthyo_manifest$tables),
    length(bottle_manifest$tables)),
  rows    = c(
    ichthyo_manifest$total_rows,
    bottle_manifest$total_rows),
  created = c(
    ichthyo_manifest$created_at,
    bottle_manifest$created_at)) |>
  datatable(caption = "Upstream workflow manifest stats")

0.3 Load SWFSC Tables

Load all ichthyo parquet files into the local merge DuckDB. Tables with geometry columns (grid, site, segment) need WKB→GEOMETRY conversion.

Code
ichthyo_files <- list.files(
  dir_pq_ichthyo, pattern = "\\.parquet$", full.names = TRUE)

# tables with geometry columns that need WKB conversion
geom_tables <- c("grid", "site", "segment")

# helper: row count of a table in the merge database
# (was duplicated verbatim in three places below)
count_table_rows <- function(tbl_name) {
  DBI::dbGetQuery(con, glue("SELECT COUNT(*) AS n FROM {tbl_name}"))$n
}

if (skip_load) {
  # tables already present from a previous run: only re-count them
  message("Using existing ichthyo tables from database")
  ichthyo_stats <- map_dfr(ichthyo_files, function(pqt_path) {
    tbl_name <- tools::file_path_sans_ext(basename(pqt_path))
    tibble(table = tbl_name, rows = count_table_rows(tbl_name),
           source = "ichthyo")
  })
} else {
  ichthyo_stats <- map_dfr(ichthyo_files, function(pqt_path) {
    tbl_name <- tools::file_path_sans_ext(basename(pqt_path))

    # load parquet as-is for every table; geometry tables get WKB
    # conversion below (dedupes the identical CREATE TABLE both
    # branches previously ran)
    DBI::dbExecute(con, glue(
      "CREATE OR REPLACE TABLE {tbl_name} AS
       SELECT * FROM read_parquet('{pqt_path}')"))

    if (tbl_name %in% geom_tables) {
      # detect WKB BLOB columns by type + name, then convert each to a
      # native GEOMETRY column in place (add tmp, fill, drop, rename)
      blob_cols <- DBI::dbGetQuery(con, glue(
        "SELECT column_name FROM information_schema.columns
         WHERE table_name = '{tbl_name}'
           AND data_type = 'BLOB'
           AND column_name LIKE '%geom%'"))$column_name
      for (gc in blob_cols) {
        tmp_col <- paste0(gc, "_tmp")
        DBI::dbExecute(con, glue(
          'ALTER TABLE {tbl_name} ADD COLUMN {tmp_col} GEOMETRY'))
        DBI::dbExecute(con, glue(
          'UPDATE {tbl_name} SET {tmp_col} = ST_GeomFromWKB({gc})'))
        DBI::dbExecute(con, glue(
          'ALTER TABLE {tbl_name} DROP COLUMN {gc}'))
        DBI::dbExecute(con, glue(
          'ALTER TABLE {tbl_name} RENAME COLUMN {tmp_col} TO {gc}'))
      }
    }

    tibble(table = tbl_name, rows = count_table_rows(tbl_name),
           source = "ichthyo")
  })
}

ichthyo_stats |> datatable(caption = "SWFSC ichthyo tables loaded")

0.4 Load Bottle Tables

Load bottle parquet files (grid excluded from bottle export — canonical from swfsc).

Code
bottle_files <- list.files(
  dir_pq_bottle, pattern = "\\.parquet$", full.names = TRUE)

# belt-and-suspenders: drop grid.parquet if it still exists
# (grid is canonical from the swfsc export)
bottle_files <- purrr::discard(
  bottle_files, \(p) grepl("grid\\.parquet$", p))

if (skip_load) message("Using existing bottle tables from database")

# one pass over the files: create the table only on a fresh load,
# always count rows for the stats table
bottle_stats <- map_dfr(bottle_files, \(pqt_path) {
  tbl_name <- tools::file_path_sans_ext(basename(pqt_path))

  if (!skip_load) {
    DBI::dbExecute(con, glue(
      "CREATE OR REPLACE TABLE {tbl_name} AS
       SELECT * FROM read_parquet('{pqt_path}')"))
  }

  n_rows <- DBI::dbGetQuery(
    con, glue("SELECT COUNT(*) AS n FROM {tbl_name}"))$n
  tibble(table = tbl_name, rows = n_rows, source = "bottle")
})

bottle_stats |> datatable(caption = "Bottle tables loaded")

0.5 Show Loaded Tables

Code
# combine per-source load stats into one table for display
all_stats <- bind_rows(ichthyo_stats, bottle_stats)

n_ichthyo <- sum(all_stats$source == "ichthyo")
n_bottle  <- sum(all_stats$source == "bottle")
message(glue(
  "Loaded {nrow(all_stats)} tables: ",
  "{n_ichthyo} ichthyo + ",
  "{n_bottle} bottle"))

all_stats |>
  arrange(source, table) |>
  datatable(caption = "All loaded tables")

0.6 Fuzzy Ship Matching

Reconcile ship codes between bottle casts and swfsc ship reference table. Uses calcofi4db::match_ships().

Code
# manual ship rename overrides from a previous reconciliation pass
ship_renames_csv <- here(
  "metadata/calcofi.org/bottle-database/ship_renames.csv")

# casts ship codes with no exact match in the swfsc ship reference
ships_unmatched <- dbGetQuery(con, "
  SELECT DISTINCT c.ship_code, c.ship_name
  FROM casts c
  LEFT JOIN ship s ON c.ship_code = s.ship_nodc
  WHERE s.ship_key IS NULL")

message(glue("{nrow(ships_unmatched)} unmatched ship codes in casts"))

# fuzzy-match the leftovers against the reference table
# (fetch_ices = TRUE also consults the ICES ship registry)
ship_matches <- match_ships(
  unmatched_ships  = ships_unmatched,
  reference_ships  = dbReadTable(con, "ship"),
  ship_renames_csv = ship_renames_csv,
  fetch_ices       = TRUE)

ship_matches |>
  datatable(caption = "Ship matching results")

0.6.1 Match Statistics

Code
# distribution of match outcomes by type
count(ship_matches, match_type) |>
  datatable(caption = "Ship match type counts")

0.7 Build Cruise Bridge

Link bottle casts to swfsc cruise table via ship_key → cruise_key.

0.7.1 Step 1: Add ship_key to casts

Code
# add ship_key column to casts (IF NOT EXISTS makes the chunk re-runnable)
dbExecute(con, "ALTER TABLE casts ADD COLUMN IF NOT EXISTS ship_key TEXT")
[1] 0
Code
# exact match: casts.ship_code = ship.ship_nodc
# correlated subquery per cast row; LIMIT 1 guards against the subquery
# returning multiple rows if ship_nodc is ever duplicated in ship
dbExecute(con, "
  UPDATE casts SET ship_key = (
    SELECT s.ship_key FROM ship s
    WHERE s.ship_nodc = casts.ship_code
    LIMIT 1)")
[1] 35644
Code
n_exact <- dbGetQuery(con,
  "SELECT COUNT(*) AS n FROM casts WHERE ship_key IS NOT NULL")$n
message(glue("Exact ship_nodc match: {n_exact} casts"))

# apply fuzzy match results for previously unmatched ships
matched_ships <- ship_matches |>
  filter(match_type != "unmatched", !is.na(matched_ship_key))

if (nrow(matched_ships) > 0) {
  # parameterized UPDATE instead of glue()-interpolating values into the
  # SQL string: safe for ship codes/keys containing quotes, and avoids
  # SQL injection via the match table
  for (i in seq_len(nrow(matched_ships))) {
    m <- matched_ships[i, ]
    dbExecute(con, "
      UPDATE casts SET ship_key = ?
      WHERE ship_code = ?
        AND ship_key IS NULL",
      params = list(m$matched_ship_key, m$ship_code))
  }
  # additional casts matched beyond the exact pass
  n_fuzzy <- dbGetQuery(con,
    "SELECT COUNT(*) AS n FROM casts WHERE ship_key IS NOT NULL")$n - n_exact
  message(glue("Fuzzy/manual match: {n_fuzzy} additional casts"))
}

# summary: how many casts now carry a ship_key
ship_key_stats <- dbGetQuery(con, "
  SELECT
    CASE WHEN ship_key IS NULL THEN 'no_ship_match'
         ELSE 'matched' END AS status,
    COUNT(*) AS n_casts
  FROM casts
  GROUP BY status")
ship_key_stats |> datatable(caption = "ship_key assignment stats")

0.7.2 Step 2: Derive cruise_key

Cruise key format: YYMMKK (2-digit year, 2-digit month, 2-letter ship key).

Code
# add cruise_key column to casts (IF NOT EXISTS makes the chunk re-runnable)
dbExecute(con, "ALTER TABLE casts ADD COLUMN IF NOT EXISTS cruise_key TEXT")
[1] 0
Code
# derive YYMMKK from datetime_utc + ship_key:
# 2-digit year (% 100 drops the century) + zero-padded month + ship key;
# only casts with a matched ship_key get a cruise_key
dbExecute(con, "
  UPDATE casts SET cruise_key = CONCAT(
    LPAD(CAST(EXTRACT(YEAR FROM datetime_utc) % 100 AS VARCHAR), 2, '0'),
    LPAD(CAST(EXTRACT(MONTH FROM datetime_utc) AS VARCHAR), 2, '0'),
    ship_key)
  WHERE ship_key IS NOT NULL")
[1] 35595
Code
# report how many casts received a derived cruise_key
n_cruise <- dbGetQuery(
  con, "SELECT COUNT(*) AS n FROM casts WHERE cruise_key IS NOT NULL")$n
message(glue("Derived cruise_key for {n_cruise} casts"))

0.7.3 Step 3: Validate against swfsc cruise table

Code
# classify each cast by how far it got through the bridge:
# no ship match -> no cruise_key derived -> cruise_key not in swfsc
# cruise table -> fully matched (CASE evaluates in that order)
cruise_stats <- dbGetQuery(con, "
  SELECT
    CASE
      WHEN c.ship_key IS NULL THEN 'no_ship_match'
      WHEN c.cruise_key IS NULL THEN 'no_cruise_key'
      WHEN cr.cruise_key IS NULL THEN 'no_cruise_match'
      ELSE 'matched'
    END AS status,
    COUNT(*) AS n_casts
  FROM casts c
  LEFT JOIN cruise cr ON c.cruise_key = cr.cruise_key
  GROUP BY status
  ORDER BY status")

cruise_stats |> datatable(caption = "Cruise bridge match statistics")

0.7.4 Step 4: Report unmatched ship codes

Code
# ship codes still unmatched after exact + fuzzy passes, with cast
# counts and date range for triage
# (removed redundant DISTINCT: GROUP BY already yields one row per group)
unmatched_report <- dbGetQuery(con, "
  SELECT
    c.ship_code, c.ship_name,
    COUNT(*) AS n_casts,
    MIN(c.datetime_utc) AS first_cast,
    MAX(c.datetime_utc) AS last_cast
  FROM casts c
  WHERE c.ship_key IS NULL
  GROUP BY c.ship_code, c.ship_name
  ORDER BY n_casts DESC")

if (nrow(unmatched_report) > 0) {
  unmatched_report |>
    datatable(caption = "Unmatched ship codes (no ship_key)")
} else {
  message("All ship codes matched!")
}

0.8 Standardize Taxonomy

Update species table with WoRMS/ITIS/GBIF identifiers using standardize_species() and build taxonomy hierarchy using build_taxon_table() from calcofi4db/R/taxonomy.R.

Code
# taxonomy id columns expected on species after standardization
tax_cols <- c("worms_id", "itis_id", "gbif_id")

# reuse previous results if all id columns exist and worms_id is populated
sp_cols <- DBI::dbGetQuery(con,
  "SELECT column_name FROM information_schema.columns
   WHERE table_name = 'species'")$column_name
sp_standardized <- all(tax_cols %in% sp_cols) &&
  DBI::dbGetQuery(con,
    "SELECT COUNT(*) AS n FROM species WHERE worms_id IS NOT NULL")$n > 0

if (sp_standardized && !overwrite) {
  message("Species already standardized, skipping API calls")
  sp_results <- DBI::dbGetQuery(con,
    "SELECT species_id, scientific_name, worms_id, itis_id, gbif_id
     FROM species")
} else {
  # slow path: WoRMS / ITIS / GBIF API lookups per species
  sp_results <- standardize_species(
    con           = con,
    species_tbl   = "species",
    include_gbif  = TRUE)
}

sp_results |>
  datatable(caption = "Species standardization results")
Code
# reuse a cached taxon table unless rebuilding from scratch
taxon_cached <- "taxon" %in% existing_tables &&
  DBI::dbGetQuery(con, "SELECT COUNT(*) AS n FROM taxon")$n > 0

if (taxon_cached && !overwrite) {
  message("Taxon table already exists, skipping API calls")
  taxon_rows <- DBI::dbReadTable(con, "taxon")
} else {
  # slow path: build the hierarchy via WoRMS/ITIS API calls
  taxon_rows <- build_taxon_table(
    con          = con,
    species_tbl  = "species",
    taxon_tbl    = "taxon",
    include_itis = TRUE)
}

# summarize the hierarchy by authority and rank when non-empty
if (nrow(taxon_rows) > 0) {
  taxon_rows |>
    count(authority, taxonRank) |>
    arrange(authority, taxonRank) |>
    datatable(caption = "Taxon hierarchy by authority and rank")
}
Code
# display the taxa_rank lookup (ordering of ranks in the hierarchy)
datatable(
  dbReadTable(con, "taxa_rank"),
  caption = "Taxa rank ordering")

0.8.1 Taxonomy Statistics

Code
# helper: run a COUNT query and pull out the scalar n
count_n <- \(sql) dbGetQuery(con, sql)$n

n_species <- count_n("SELECT COUNT(*) AS n FROM species")
n_worms   <- count_n(
  "SELECT COUNT(*) AS n FROM species WHERE worms_id IS NOT NULL")
n_itis    <- count_n(
  "SELECT COUNT(*) AS n FROM species WHERE itis_id IS NOT NULL")
n_gbif    <- count_n(
  "SELECT COUNT(*) AS n FROM species WHERE gbif_id IS NOT NULL")
n_taxon   <- count_n("SELECT COUNT(*) AS n FROM taxon")

# one-row-per-metric summary of id coverage
tibble(
  metric = c("total species", "with worms_id", "with itis_id",
             "with gbif_id", "taxon hierarchy rows"),
  count  = c(n_species, n_worms, n_itis, n_gbif, n_taxon)) |>
  datatable(caption = "Taxonomy standardization summary")

0.9 Validate Cross-Dataset Integrity

Code
# grid_key integrity: casts.grid_key should all be in grid.grid_key
if ("grid_key" %in% dbListFields(con, "casts") &&
    "grid_key" %in% dbListFields(con, "grid")) {
  # NOT EXISTS instead of NOT IN: NOT IN (subquery) yields zero rows —
  # i.e. silently reports 0 orphans — if grid.grid_key ever contains NULL
  grid_orphans <- dbGetQuery(con, "
    SELECT COUNT(*) AS n FROM casts c
    WHERE c.grid_key IS NOT NULL
      AND NOT EXISTS (
        SELECT 1 FROM grid g WHERE g.grid_key = c.grid_key)")$n
  message(glue("Grid key orphans in casts: {grid_orphans}"))
}

# ship PK uniqueness
ship_dups <- dbGetQuery(con, "
  SELECT ship_key, COUNT(*) AS n FROM ship
  GROUP BY ship_key HAVING COUNT(*) > 1")
if (nrow(ship_dups) > 0)
  warning(glue("Duplicate ship_key values: {nrow(ship_dups)}"))

# cruise PK uniqueness
cruise_dups <- dbGetQuery(con, "
  SELECT cruise_key, COUNT(*) AS n FROM cruise
  GROUP BY cruise_key HAVING COUNT(*) > 1")
if (nrow(cruise_dups) > 0)
  warning(glue("Duplicate cruise_key values: {nrow(cruise_dups)}"))

# cruise bridge coverage: how many casts carry each derived key
bridge_stats <- dbGetQuery(con, "
  SELECT
    COUNT(*) AS total_casts,
    SUM(CASE WHEN ship_key IS NOT NULL THEN 1 ELSE 0 END) AS with_ship_key,
    SUM(CASE WHEN cruise_key IS NOT NULL THEN 1 ELSE 0 END) AS with_cruise_key
  FROM casts")
bridge_stats |> datatable(caption = "Cruise bridge coverage")
Code
message("Cross-dataset validation complete")

0.10 Show Combined Schema

Code
# build dm from connection (exclude internal tables)
tbls <- setdiff(dbListTables(con), c("_sp_update"))

# learn_keys = FALSE: DuckDB key introspection skipped; no FK arrows drawn
d <- dm::dm_from_con(con, table_names = tbls, learn_keys = FALSE)

# set table colors by group
# NOTE(review): dm_set_colors() takes bare (tidyselect) table names, so
# this call errors if any listed table is absent from `d` — assumes all
# core tables were loaded by the upstream chunks; confirm on a fresh run
d <- d |>
  dm::dm_set_colors(
    lightblue    = c(cruise, site, tow, net),      # ichthyo chain
    lightyellow  = c(ichthyo, species, lookup),     # species/lookup
    lightgreen   = c(grid, segment),                # spatial
    pink         = c(casts, bottle, bottle_measurement,
                     cast_condition, measurement_type))  # bottle chain

# add colors for new taxonomy tables if they exist
if ("taxon" %in% tbls)
  d <- d |> dm::dm_set_colors(lightyellow = taxon)
if ("taxa_rank" %in% tbls)
  d <- d |> dm::dm_set_colors(lightyellow = taxa_rank)
if ("ship" %in% tbls)
  d <- d |> dm::dm_set_colors(lightblue = ship)

# draw the full schema left-to-right ("all" includes keyless tables)
dm::dm_draw(d, rankdir = "LR", view_type = "all")

0.11 Write Modified Parquet Outputs

Only export tables that differ from upstream (modified or new). Unchanged tables reference upstream GCS URIs in the manifest.

Code
# tables created or altered by this workflow; everything else is
# referenced from upstream parquet via the manifest
modified_tables <- c(
  "casts",      # added ship_key, cruise_key columns
  "species",    # updated worms_id, itis_id, gbif_id
  "taxon",      # new table (taxonomy hierarchy)
  "taxa_rank")  # new table (rank ordering)

dir_pq_merged <- here("data/parquet/merge_ichthyo_bottle")

# keep provenance columns here: they are stripped only at freeze time
merge_stats <- write_parquet_outputs(
  con              = con,
  output_dir       = dir_pq_merged,
  tables           = modified_tables,
  strip_provenance = FALSE)

# show file basenames instead of full local paths
merge_stats |>
  mutate(file = basename(path)) |>
  select(-path) |>
  datatable(caption = "Modified parquet export statistics")

0.11.1 Build Combined Manifest

Reference GCS URIs for unchanged upstream tables + local parquet for modified tables.

Code
# read upstream manifests (re-read so refs carry the latest stamps)
ichthyo_manifest <- read_json(ichthyo_manifest_path)
bottle_manifest  <- read_json(bottle_manifest_path)

# build combined manifest: local parquet for modified tables, upstream
# references for everything untouched
# NOTE(review): as.list() on the selected data frame serializes it
# column-wise ({table: [...], rows: [...]}) rather than one record per
# table — confirm downstream manifest readers expect this shape
manifest <- list(
  created_at      = as.character(Sys.time()),
  release_version = release_version,
  modified_tables = merge_stats |>
    select(table, rows, file_size) |>
    as.list(),
  upstream_refs   = list(
    ichthyo = list(
      manifest = ichthyo_manifest$created_at,
      tables   = setdiff(
        ichthyo_manifest$tables,
        modified_tables)),
    bottle  = list(
      manifest = bottle_manifest$created_at,
      tables   = setdiff(
        bottle_manifest$tables,
        c(modified_tables, "grid")))))  # grid canonical from swfsc

# auto_unbox so scalars serialize as JSON scalars, not 1-element arrays
write_json(
  manifest,
  file.path(dir_pq_merged, "manifest.json"),
  auto_unbox = TRUE, pretty = TRUE)

message(glue(
  "Manifest written: {length(modified_tables)} modified tables, ",
  "upstream refs for {length(manifest$upstream_refs$ichthyo$tables)} ",
  "ichthyo + {length(manifest$upstream_refs$bottle$tables)} bottle"))

0.12 Ingest to Working DuckLake

Load ALL tables into Working DuckLake with provenance. Unchanged tables read directly from upstream parquet. Modified tables read from merge output dir.

Code
con_wdl <- get_working_ducklake()
load_duckdb_extension(con_wdl, "spatial")

# helper: read one parquet file and replace the table in Working
# DuckLake, recording source_label as provenance
ingest_pqt <- function(pqt_path, tbl_name, source_label) {
  data <- arrow::read_parquet(pqt_path)
  ingest_to_working(
    con         = con_wdl,
    data        = data,
    table       = tbl_name,
    source_file = source_label,
    mode        = "replace")
}

# helper: ingest table `tbl` from `dir_pq` when its parquet exists;
# warn (rather than silently skip, as before) when the file is missing
ingest_dir_tbl <- function(dir_pq, tbl, label_dir) {
  pqt <- file.path(dir_pq, paste0(tbl, ".parquet"))
  if (!file.exists(pqt)) {
    warning(glue("Expected parquet missing, skipping: {pqt}"), call. = FALSE)
    return(NULL)  # map_dfr() drops NULL results
  }
  ingest_pqt(pqt, tbl, glue("parquet/{label_dir}/{tbl}.parquet"))
}

# ingest ichthyo upstream tables (skip modified ones)
ichthyo_upstream <- setdiff(ichthyo_manifest$tables, modified_tables)
ichthyo_wdl <- map_dfr(
  ichthyo_upstream,
  \(tbl) ingest_dir_tbl(dir_pq_ichthyo, tbl, "swfsc.noaa.gov_calcofi-db"))

# ingest bottle upstream tables (skip modified + grid, canonical from swfsc)
bottle_upstream <- setdiff(
  bottle_manifest$tables, c(modified_tables, "grid"))
bottle_wdl <- map_dfr(
  bottle_upstream,
  \(tbl) ingest_dir_tbl(dir_pq_bottle, tbl, "calcofi.org_bottle-database"))

# ingest modified tables from merge output
merge_wdl <- map_dfr(
  modified_tables,
  \(tbl) ingest_dir_tbl(dir_pq_merged, tbl, "merge_ichthyo_bottle"))

# combine stats
wdl_stats <- bind_rows(
  ichthyo_wdl |> mutate(source = "ichthyo_upstream"),
  bottle_wdl  |> mutate(source = "bottle_upstream"),
  merge_wdl   |> mutate(source = "merge_modified"))

wdl_stats |> datatable(caption = "Working DuckLake ingestion stats")

0.13 List Working Tables

Code
# list every table now in the working lake, with provenance columns
working_tables <- list_working_tables(con_wdl)
datatable(
  working_tables,
  caption = "Working DuckLake tables with provenance")

0.14 Save Working DuckLake

Code
save_working_ducklake(con_wdl)
[1] "gs://calcofi-db/ducklake/working/calcofi.duckdb"
Code
message("Working DuckLake saved to GCS")

0.15 Create Frozen Release

Create a frozen (immutable) release for public access. Strips provenance columns and exports clean parquet files. See Frozen DuckLake pattern.

Code
dir_frozen <- here(glue("data/releases/{release_version}"))
message(glue("Creating frozen release: {release_version}"))

# freeze every working table except internal metadata
tables_to_freeze <- setdiff(DBI::dbListTables(con_wdl), c("_meta"))

# provenance columns are stripped for the public release;
# zstd gives good compression for archival parquet
freeze_stats <- write_parquet_outputs(
  con              = con_wdl,
  output_dir       = file.path(dir_frozen, "parquet"),
  tables           = tables_to_freeze,
  strip_provenance = TRUE,
  compression      = "zstd")

datatable(
  freeze_stats,
  caption = glue("Frozen release {release_version} statistics"))

0.15.1 Release Notes

Code
# build release notes using paste() to avoid glue() parsing issues with code blocks
# one markdown bullet per frozen table, with comma-formatted row counts
tables_list <- paste0(
  "- ", freeze_stats$table, " (",
  format(freeze_stats$rows, big.mark = ","), " rows)")

# assemble the full RELEASE_NOTES.md body; the embedded ```r blocks give
# readers copy-pastable access snippets pinned to this release_version
release_notes <- paste0(
  "# CalCOFI Database Release ", release_version, "\n\n",
  "**Release Date**: ", Sys.Date(), "\n\n",
  "## Tables Included\n\n",
  paste(tables_list, collapse = "\n"), "\n\n",
  "## Total\n\n",
  "- **Tables**: ", nrow(freeze_stats), "\n",
  "- **Total Rows**: ", format(sum(freeze_stats$rows), big.mark = ","), "\n",
  "- **Total Size**: ", round(sum(freeze_stats$file_size) / 1024 / 1024, 1), " MB\n\n",
  "## Data Sources\n\n",
  "- `ingest_swfsc.noaa.gov_calcofi-db.qmd` - Ichthyo tables (cruise, ship, site, tow, net, species, ichthyo, grid, segment, lookup)\n",
  "- `ingest_calcofi.org_bottle-database.qmd` - Bottle/cast tables (casts, bottle, bottle_measurement, cast_condition, measurement_type)\n\n",
  "## Cross-Dataset Integration\n\n",
  "- **Ship matching**: Reconciled ship codes between bottle casts and swfsc ship reference\n",
  "- **Cruise bridge**: Derived cruise_key (YYMMKK) for bottle casts via ship_key + datetime\n",
  "- **Taxonomy**: Standardized species with WoRMS AphiaID, ITIS TSN, GBIF backbone key\n",
  "- **Taxon hierarchy**: Built taxon + taxa_rank tables from WoRMS/ITIS classification\n\n",
  "## Access\n\n",
  "Parquet files can be queried directly from GCS:\n\n",
  "```r\n",
  "library(duckdb)\n",
  "con <- dbConnect(duckdb())\n",
  "dbExecute(con, 'INSTALL httpfs; LOAD httpfs;')\n",
  "dbGetQuery(con, \"\n",
  "  SELECT * FROM read_parquet(\n",
  "    'https://storage.googleapis.com/calcofi-db/ducklake/releases/", release_version, "/parquet/ichthyo.parquet')\n",
  "  LIMIT 10\")\n",
  "```\n\n",
  "Or use calcofi4r:\n\n",
  "```r\n",
  "library(calcofi4r)\n",
  "con <- cc_get_db(version = '", release_version, "')\n",
  "```\n")

# write to disk and echo into the rendered document
writeLines(release_notes, file.path(dir_frozen, "RELEASE_NOTES.md"))
message(glue("Release notes written to {file.path(dir_frozen, 'RELEASE_NOTES.md')}"))
cat(release_notes)

1 CalCOFI Database Release v2026.02

Release Date: 2026-02-05

1.1 Tables Included

  • bottle ( 895,371 rows)
  • bottle_measurement (11,135,600 rows)
  • cast_condition ( 235,513 rows)
  • casts ( 35,644 rows)
  • cruise ( 691 rows)
  • grid ( 218 rows)
  • ichthyo ( 830,873 rows)
  • lookup ( 26 rows)
  • measurement_type ( 47 rows)
  • net ( 76,512 rows)
  • segment ( 60,413 rows)
  • ship ( 48 rows)
  • site ( 61,104 rows)
  • species ( 1,144 rows)
  • taxa_rank ( 41 rows)
  • taxon ( 1,671 rows)
  • tow ( 75,506 rows)

1.2 Total

  • Tables: 17
  • Total Rows: 13,410,422
  • Total Size: 80.9 MB

1.3 Data Sources

  • ingest_swfsc.noaa.gov_calcofi-db.qmd - Ichthyo tables (cruise, ship, site, tow, net, species, ichthyo, grid, segment, lookup)
  • ingest_calcofi.org_bottle-database.qmd - Bottle/cast tables (casts, bottle, bottle_measurement, cast_condition, measurement_type)

1.4 Cross-Dataset Integration

  • Ship matching: Reconciled ship codes between bottle casts and swfsc ship reference
  • Cruise bridge: Derived cruise_key (YYMMKK) for bottle casts via ship_key + datetime
  • Taxonomy: Standardized species with WoRMS AphiaID, ITIS TSN, GBIF backbone key
  • Taxon hierarchy: Built taxon + taxa_rank tables from WoRMS/ITIS classification

1.5 Access

Parquet files can be queried directly from GCS:

library(duckdb)
con <- dbConnect(duckdb())
dbExecute(con, 'INSTALL httpfs; LOAD httpfs;')
dbGetQuery(con, "
  SELECT * FROM read_parquet(
    'https://storage.googleapis.com/calcofi-db/ducklake/releases/v2026.02/parquet/ichthyo.parquet')
  LIMIT 10")

Or use calcofi4r:

library(calcofi4r)
con <- cc_get_db(version = 'v2026.02')

1.5.1 Upload Frozen Release to GCS

Code
gcs_release_path <- glue("gs://calcofi-db/ducklake/releases/{release_version}")
message(glue("Uploading frozen release to {gcs_release_path}..."))

# upload every parquet file under the frozen release directory
parquet_files <- list.files(
  file.path(dir_frozen, "parquet"),
  pattern = "\\.parquet$",
  full.names = TRUE)

walk(parquet_files, \(pqt_file)
  put_gcs_file(
    pqt_file,
    glue("{gcs_release_path}/parquet/{basename(pqt_file)}")))

# manifest and release notes sit alongside the parquet tree
put_gcs_file(
  file.path(dir_frozen, "parquet", "manifest.json"),
  glue("{gcs_release_path}/manifest.json"))

put_gcs_file(
  file.path(dir_frozen, "RELEASE_NOTES.md"),
  glue("{gcs_release_path}/RELEASE_NOTES.md"))

# point the "latest" alias at this release's manifest
put_gcs_file(
  file.path(dir_frozen, "parquet", "manifest.json"),
  "gs://calcofi-db/ducklake/releases/latest/manifest.json")

message(glue("Frozen release {release_version} uploaded to GCS"))

1.6 Cleanup

Code
# close working ducklake connection
close_duckdb(con_wdl)
message("Working DuckLake connection closed")

# close local merge database
close_duckdb(con)
message("Local merge database connection closed")

# summary — plain message() for constant strings: glue() trims leading
# whitespace/newlines, so glue("\n=== Summary ===") silently dropped the
# intended blank line before the header
message("\n=== Summary ===")
message("Working DuckLake: saved to GCS")
message(glue("Frozen release: {release_version} created at {dir_frozen}"))
message(glue("Tables: {nrow(freeze_stats)}"))
message(glue("Total rows: {format(sum(freeze_stats$rows), big.mark = ',')}"))
Code
devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.5.2 (2025-10-31)
 os       macOS Sequoia 15.7.1
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/Mexico_City
 date     2026-02-05
 pandoc   3.8.3 @ /opt/homebrew/bin/ (via rmarkdown)
 quarto   1.8.25 @ /usr/local/bin/quarto

─ Packages ───────────────────────────────────────────────────────────────────
 !  package            * version    date (UTC) lib source
    abind                1.4-8      2024-09-12 [1] CRAN (R 4.5.0)
    ape                  5.8-1      2024-12-16 [1] CRAN (R 4.5.0)
    arrow                22.0.0.1   2025-12-23 [1] CRAN (R 4.5.2)
    assertthat           0.2.1      2019-03-21 [1] CRAN (R 4.5.0)
    backports            1.5.0      2024-05-23 [1] CRAN (R 4.5.0)
    base64enc            0.1-6      2026-02-02 [1] CRAN (R 4.5.2)
    bit                  4.6.0      2025-03-06 [1] CRAN (R 4.5.0)
    bit64                4.6.0-1    2025-01-16 [1] CRAN (R 4.5.0)
    blob                 1.3.0      2026-01-14 [1] CRAN (R 4.5.2)
    broom                1.0.10     2025-09-13 [1] CRAN (R 4.5.0)
    bslib                0.10.0     2026-01-26 [1] CRAN (R 4.5.2)
    cachem               1.1.0      2024-05-16 [1] CRAN (R 4.5.0)
 VP calcofi4db         * 2.3.0      2026-02-04 [?] load_all() (on disk 2.1.0)
 VP calcofi4r          * 1.1.2      2025-10-02 [?] Github (CalCOFI/calcofi4r@dbe213f) (on disk 0.8.1)
    chromote             0.5.1      2025-04-24 [1] CRAN (R 4.5.0)
    class                7.3-23     2025-01-01 [1] CRAN (R 4.5.2)
    classInt             0.4-11     2025-01-08 [1] CRAN (R 4.5.0)
    cli                  3.6.5      2025-04-23 [1] CRAN (R 4.5.0)
    codetools            0.2-20     2024-03-31 [1] CRAN (R 4.5.2)
    crayon               1.5.3      2024-06-20 [1] CRAN (R 4.5.0)
    crosstalk            1.2.2      2025-08-26 [1] CRAN (R 4.5.0)
    crul                 1.6.0      2025-07-23 [1] CRAN (R 4.5.0)
    curl                 7.0.0      2025-08-19 [1] CRAN (R 4.5.0)
    data.table           1.18.0     2025-12-24 [1] CRAN (R 4.5.2)
    DBI                * 1.2.3      2024-06-02 [1] CRAN (R 4.5.0)
    dbplyr               2.5.1      2025-09-10 [1] CRAN (R 4.5.0)
    desc                 1.4.3      2023-12-10 [1] CRAN (R 4.5.0)
    devtools             2.4.6      2025-10-03 [1] CRAN (R 4.5.0)
    DiagrammeR           1.0.11     2024-02-02 [1] CRAN (R 4.5.0)
    DiagrammeRsvg        0.1        2016-02-04 [1] CRAN (R 4.5.0)
    digest               0.6.39     2025-11-19 [1] CRAN (R 4.5.2)
    dm                 * 1.0.12     2025-07-02 [1] CRAN (R 4.5.0)
    dplyr              * 1.2.0      2026-02-03 [1] CRAN (R 4.5.2)
    DT                 * 0.34.0     2025-09-02 [1] CRAN (R 4.5.0)
    duckdb               1.4.4      2026-01-28 [1] CRAN (R 4.5.2)
    dygraphs             1.1.1.6    2018-07-11 [1] CRAN (R 4.5.0)
    e1071                1.7-17     2025-12-18 [1] CRAN (R 4.5.2)
    ellipsis             0.3.2      2021-04-29 [1] CRAN (R 4.5.0)
    evaluate             1.0.5      2025-08-27 [1] CRAN (R 4.5.0)
    farver               2.1.2      2024-05-13 [1] CRAN (R 4.5.0)
    fastmap              1.2.0      2024-05-15 [1] CRAN (R 4.5.0)
    fs                 * 1.6.6      2025-04-12 [1] CRAN (R 4.5.0)
    fuzzyjoin            0.1.6.1    2025-07-10 [1] CRAN (R 4.5.0)
    gargle               1.6.1      2026-01-29 [1] CRAN (R 4.5.2)
    generics             0.1.4      2025-05-09 [1] CRAN (R 4.5.0)
    geojsonsf            2.0.5      2025-11-26 [1] CRAN (R 4.5.2)
    ggplot2              4.0.1      2025-11-14 [1] CRAN (R 4.5.2)
    glue               * 1.8.0      2024-09-30 [1] CRAN (R 4.5.0)
    googledrive          2.1.2      2025-09-10 [1] CRAN (R 4.5.0)
    gtable               0.3.6      2024-10-25 [1] CRAN (R 4.5.0)
    here               * 1.0.2      2025-09-15 [1] CRAN (R 4.5.0)
    highcharter          0.9.4      2022-01-03 [1] CRAN (R 4.5.0)
    hms                  1.1.4      2025-10-17 [1] CRAN (R 4.5.0)
    htmltools            0.5.9      2025-12-04 [1] CRAN (R 4.5.2)
    htmlwidgets          1.6.4      2023-12-06 [1] CRAN (R 4.5.0)
    httpcode             0.3.0      2020-04-10 [1] CRAN (R 4.5.0)
    httpuv               1.6.16     2025-04-16 [1] CRAN (R 4.5.0)
    httr                 1.4.7      2023-08-15 [1] CRAN (R 4.5.0)
    httr2                1.2.2      2025-12-08 [1] CRAN (R 4.5.2)
    igraph               2.2.1      2025-10-27 [1] CRAN (R 4.5.0)
    isoband              0.3.0      2025-12-07 [1] CRAN (R 4.5.2)
    janitor              2.2.1      2024-12-22 [1] CRAN (R 4.5.0)
    jquerylib            0.1.4      2021-04-26 [1] CRAN (R 4.5.0)
    jsonlite           * 2.0.0      2025-03-27 [1] CRAN (R 4.5.0)
    KernSmooth           2.23-26    2025-01-01 [1] CRAN (R 4.5.2)
    knitr                1.51       2025-12-20 [1] CRAN (R 4.5.2)
    later                1.4.5      2026-01-08 [1] CRAN (R 4.5.2)
    lattice              0.22-7     2025-04-02 [1] CRAN (R 4.5.2)
    lazyeval             0.2.2      2019-03-15 [1] CRAN (R 4.5.0)
    leafem               0.2.5      2025-08-28 [1] CRAN (R 4.5.0)
    leaflet              2.2.3      2025-09-04 [1] CRAN (R 4.5.0)
    librarian            1.8.1      2021-07-12 [1] CRAN (R 4.5.0)
    lifecycle            1.0.5      2026-01-08 [1] CRAN (R 4.5.2)
    lubridate            1.9.4      2024-12-08 [1] CRAN (R 4.5.0)
    magrittr             2.0.4      2025-09-12 [1] CRAN (R 4.5.0)
    mapgl                0.4.4      2026-01-12 [1] CRAN (R 4.5.2)
    mapview              2.11.4     2025-09-08 [1] CRAN (R 4.5.0)
    markdown             2.0        2025-03-23 [1] CRAN (R 4.5.0)
    Matrix               1.7-4      2025-08-28 [1] CRAN (R 4.5.2)
    memoise              2.0.1      2021-11-26 [1] CRAN (R 4.5.0)
    mgcv                 1.9-3      2025-04-04 [1] CRAN (R 4.5.2)
    mime                 0.13       2025-03-17 [1] CRAN (R 4.5.0)
    nlme                 3.1-168    2025-03-31 [1] CRAN (R 4.5.2)
    otel                 0.2.0      2025-08-29 [1] CRAN (R 4.5.0)
    pillar               1.11.1     2025-09-17 [1] CRAN (R 4.5.0)
    pkgbuild             1.4.8      2025-05-26 [1] CRAN (R 4.5.0)
    pkgconfig            2.0.3      2019-09-22 [1] CRAN (R 4.5.0)
    pkgload              1.4.1      2025-09-23 [1] CRAN (R 4.5.0)
    plotly               4.11.0     2025-06-19 [1] CRAN (R 4.5.0)
    png                  0.1-8      2022-11-29 [1] CRAN (R 4.5.0)
    processx             3.8.6      2025-02-21 [1] CRAN (R 4.5.0)
    promises             1.5.0      2025-11-01 [1] CRAN (R 4.5.0)
    proxy                0.4-29     2025-12-29 [1] CRAN (R 4.5.2)
    ps                   1.9.1      2025-04-12 [1] CRAN (R 4.5.0)
    purrr              * 1.2.1      2026-01-09 [1] CRAN (R 4.5.2)
    quantmod             0.4.28     2025-06-19 [1] CRAN (R 4.5.0)
    R6                   2.6.1      2025-02-15 [1] CRAN (R 4.5.0)
    rappdirs             0.3.4      2026-01-17 [1] CRAN (R 4.5.2)
    raster               3.6-32     2025-03-28 [1] CRAN (R 4.5.0)
    RColorBrewer         1.1-3      2022-04-03 [1] CRAN (R 4.5.0)
    Rcpp                 1.1.1      2026-01-10 [1] CRAN (R 4.5.2)
    readr              * 2.1.6      2025-11-14 [1] CRAN (R 4.5.2)
    remotes              2.5.0      2024-03-17 [1] CRAN (R 4.5.0)
    rlang                1.1.7      2026-01-09 [1] CRAN (R 4.5.2)
    rlist                0.4.6.2    2021-09-03 [1] CRAN (R 4.5.0)
    rmarkdown            2.30       2025-09-28 [1] CRAN (R 4.5.0)
    rnaturalearth        1.1.0      2025-07-28 [1] CRAN (R 4.5.0)
    rnaturalearthhires   1.0.0.9000 2025-10-02 [1] Github (ropensci/rnaturalearthhires@e4736f6)
    RPostgres            1.4.8      2025-02-25 [1] CRAN (R 4.5.0)
    rprojroot            2.1.1      2025-08-26 [1] CRAN (R 4.5.0)
    rstudioapi           0.18.0     2026-01-16 [1] CRAN (R 4.5.2)
    rvest                1.0.5      2025-08-29 [1] CRAN (R 4.5.0)
    S7                   0.2.1      2025-11-14 [1] CRAN (R 4.5.2)
    sass                 0.4.10     2025-04-11 [1] CRAN (R 4.5.0)
    satellite            1.0.6      2025-08-21 [1] CRAN (R 4.5.0)
    scales               1.4.0      2025-04-24 [1] CRAN (R 4.5.0)
    selectr              0.4-2      2019-11-20 [1] CRAN (R 4.5.0)
    sessioninfo          1.2.3      2025-02-05 [1] CRAN (R 4.5.0)
    sf                   1.0-24     2026-01-13 [1] CRAN (R 4.5.2)
    shiny                1.11.1     2025-07-03 [1] CRAN (R 4.5.0)
    shinyWidgets         0.9.0      2025-02-21 [1] CRAN (R 4.5.0)
    snakecase            0.11.1     2023-08-27 [1] CRAN (R 4.5.0)
    sp                   2.2-0      2025-02-01 [1] CRAN (R 4.5.0)
    stars                0.6-8      2025-02-01 [1] CRAN (R 4.5.0)
    stringi              1.8.7      2025-03-27 [1] CRAN (R 4.5.0)
    stringr            * 1.6.0      2025-11-04 [1] CRAN (R 4.5.0)
    taxize             * 0.10.0     2025-02-07 [1] CRAN (R 4.5.0)
    terra                1.8-93     2026-01-12 [1] CRAN (R 4.5.2)
    tibble             * 3.3.1      2026-01-11 [1] CRAN (R 4.5.2)
    tidyr              * 1.3.2      2025-12-19 [1] CRAN (R 4.5.2)
    tidyselect           1.2.1      2024-03-11 [1] CRAN (R 4.5.0)
    timechange           0.4.0      2026-01-29 [1] CRAN (R 4.5.2)
    TTR                  0.24.4     2023-11-28 [1] CRAN (R 4.5.0)
    tzdb                 0.5.0      2025-03-15 [1] CRAN (R 4.5.0)
    units                1.0-0      2025-10-09 [1] CRAN (R 4.5.0)
    usethis              3.2.1      2025-09-06 [1] CRAN (R 4.5.0)
    uuid                 1.2-2      2026-01-23 [1] CRAN (R 4.5.2)
    V8                   8.0.1      2025-10-10 [1] CRAN (R 4.5.0)
    vctrs                0.7.1      2026-01-23 [1] CRAN (R 4.5.2)
    viridisLite          0.4.2      2023-05-02 [1] CRAN (R 4.5.0)
    visNetwork           2.1.4      2025-09-04 [1] CRAN (R 4.5.0)
    vroom                1.7.0      2026-01-27 [1] CRAN (R 4.5.2)
    websocket            1.4.4      2025-04-10 [1] CRAN (R 4.5.0)
    withr                3.0.2      2024-10-28 [1] CRAN (R 4.5.0)
    worrms             * 0.4.3      2023-06-20 [1] CRAN (R 4.5.0)
    xfun                 0.56       2026-01-18 [1] CRAN (R 4.5.2)
    xml2                 1.5.2      2026-01-17 [1] CRAN (R 4.5.2)
    xtable               1.8-4      2019-04-21 [1] CRAN (R 4.5.0)
    xts                  0.14.1     2024-10-15 [1] CRAN (R 4.5.0)
    yaml                 2.3.12     2025-12-10 [1] CRAN (R 4.5.2)
    zoo                  1.8-15     2025-12-15 [1] CRAN (R 4.5.2)

 [1] /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library

 * ── Packages attached to the search path.
 V ── Loaded and on-disk version mismatch.
 P ── Loaded and on-disk path mismatch.

──────────────────────────────────────────────────────────────────────────────