Ingest CalCOFI CTD Cast Data

Published

2026-04-02

1 Overview

This notebook ingests CalCOFI CTD data from https://calcofi.org/data/oceanographic-data/ctd-cast-files/, downloads and unzips final and preliminary CTD files, normalizes into tidy tables (ctd_cast, ctd_measurement, ctd_summary), and exports to Parquet for the CalCOFI integrated database.

1.1 Key Features

  1. Web Scraping: Scrapes all CTD .zip download links from calcofi.org

  2. Smart Filtering:

    • Downloads all .zip files for archival completeness
    • Only unzips final and preliminary files
    • Skips unzipping raw/cast/test files (they are still downloaded for archival completeness)
  3. Priority-based Selection:

    • For each cruise, selects final if available, otherwise preliminary
    • Excludes raw/test/prodo cast files
  4. Tidy Normalization:

    • ctd_cast: one row per unique cast (cruise/station/direction)
    • ctd_measurement: long-format sensor readings at each depth
    • ctd_summary: summary stats per station/depth/measurement_type across cast directions
    • measurement_type: reference table for measurement codes
  5. Standardized Workflow: follows patterns from ingest_calcofi_bottle.qmd — deterministic UUIDs, GCS parquet uploads, metadata sidecars, calcofi4db utilities

2 Setup

Code
# chunk timing hook — prints elapsed time for each chunk.
# knitr calls the hook twice per chunk: once with before = TRUE at chunk
# start, once with before = FALSE at chunk end; start time is stashed in
# a global (`<<-`) because the two calls share no other state.
knitr::knit_hooks$set(time_it = function(before, options) {
  if (before) {
    # record wall-clock start for this chunk
    .time_it_t0 <<- Sys.time()
  } else {
    # elapsed seconds (1 decimal) plus current clock time for the log line
    elapsed <- round(difftime(Sys.time(), .time_it_t0, units = "secs"), 1)
    tnow <- format(Sys.time(), "%H:%M:%S")
    message(glue::glue("R chunk {options$label}: {elapsed}s ~ {tnow}"))
  }
})
# opt every chunk into the timing hook
knitr::opts_chunk$set(time_it = TRUE)

# load the two project packages from local source checkouts (dev mode)
devtools::load_all(here::here("../calcofi4db"))
devtools::load_all(here::here("../calcofi4r"))
# install (if needed) and attach all dependencies;
# `CalCOFI / pkg` entries are GitHub packages
librarian::shelf(
  CalCOFI / calcofi4db,
  CalCOFI / calcofi4r,
  DBI,
  dplyr,
  DT,
  fs,
  glue,
  here,
  httr,
  janitor,
  lubridate,
  mapview,
  plotly,
  purrr,
  ps,
  readr,
  rvest,
  sf,
  stringr,
  tibble,
  tidyr,
  zip,
  quiet = TRUE  # spell out TRUE; `T` is a reassignable binding
)

# dataset identifiers used to label outputs and build paths
dataset_name <- "CalCOFI CTD Cast Database"
provider <- "calcofi"
dataset <- "ctd-cast"
dir_label <- glue("{provider}_{dataset}")
# shared drive folder holding raw downloads
dir_data <- "~/My Drive/projects/calcofi/data-public"
dir_dl <- path_expand(glue("{dir_data}/{provider}/{dataset}/download"))
# local outputs: parquet export dir, DuckDB working + checkpoint DBs, scratch dir
dir_parquet <- here(glue("data/parquet/{dir_label}"))
db_path <- here(glue("data/wrangling/{dir_label}.duckdb"))
db_checkpoint <- here(glue("data/wrangling/{dir_label}_checkpoint.duckdb"))
dir_tmp <- here(glue("data/tmp/{dir_label}"))
# source page listing the CTD .zip downloads (scraped elsewhere into d_zips)
url <- "https://calcofi.org/data/oceanographic-data/ctd-cast-files/"
dir_meta <- here(glue("metadata/{provider}/{dataset}"))

# start each run from a clean working DB; if a checkpoint exists,
# seed the working DB from it (skips expensive read+bind+filter steps)
if (file_exists(db_path)) {
  file_delete(db_path)
}
if (file_exists(db_checkpoint)) {
  file_copy(db_checkpoint, db_path, overwrite = TRUE)
  message(glue("Restored from checkpoint: {db_checkpoint}"))
}
dir_create(c(dir_dl, dirname(db_path), dir_parquet, dir_tmp), recurse = TRUE)

# open the working DB and enable the extensions used downstream
con <- get_duckdb_con(db_path)
for (ext in c("spatial", "icu")) {
  load_duckdb_extension(con, ext)
}

# limit DuckDB memory to half of system RAM so it spills to disk instead of
# consuming all RAM. floor() must wrap the whole expression: the native pipe
# binds tighter than `/`, so the previous `... / 2 |> floor()` applied
# floor() to the literal 2 and left mem_gb fractional.
mem_gb <- floor(ps::ps_system_memory()$total / 1024^3 / 2)
dbExecute(con, glue("SET memory_limit = '{mem_gb}GB'"))
[1] 0
Code
# point DuckDB's spill files at the dataset-specific scratch dir
dbExecute(con, glue("SET temp_directory = '{dir_tmp}'"))
[1] 0
Code
# load metadata: measurement-type reference plus field/table redefinitions
# (use FALSE rather than the reassignable shorthand F)
d_meas_type <- read_csv(
  here("metadata/measurement_type.csv"),
  show_col_types = FALSE
)
d_flds_rd <- read_csv(glue("{dir_meta}/flds_redefine.csv"), show_col_types = FALSE)
d_tbls_rd <- read_csv(glue("{dir_meta}/tbls_redefine.csv"), show_col_types = FALSE)

3 Check for Resumable State

Code
# detect if parquet outputs are already complete (e.g. prior run failed
# only during GCS upload). If so, skip ALL computation and jump to upload.
parquet_complete <- FALSE
manifest_path <- file.path(dir_parquet, "manifest.json")
if (file_exists(manifest_path)) {
  mf <- jsonlite::read_json(manifest_path)
  # supplemental tables are excluded from the completeness check
  expected <- setdiff(mf$tables, unlist(mf$supplemental))
  # a table may exist as a single .parquet file OR a partitioned directory
  parquet_ok <- all(vapply(expected, function(tbl) {
    p <- file.path(dir_parquet, paste0(tbl, ".parquet"))
    d <- file.path(dir_parquet, tbl)
    file_exists(p) || dir_exists(d)
  }, logical(1)))
  if (parquet_ok) {
    parquet_complete <- TRUE
    message(glue(
      "Parquet output already complete ({length(mf$tables)} tables, ",
      "{format(mf$total_rows, big.mark = ',')} rows) — ",
      "skipping computation, resuming at upload"))
  }
}

# when parquet output is incomplete, look for a DuckDB checkpoint in which
# ctd_raw was already materialized (lets us skip read+bind+filter chunks)
has_ctd_raw <- FALSE
if (!parquet_complete && "ctd_raw" %in% DBI::dbListTables(con)) {
  has_ctd_raw <- TRUE
  n_raw <- dbGetQuery(con, "SELECT COUNT(*) AS n FROM ctd_raw")$n
  message(glue(
    "Checkpoint: ctd_raw already loaded ",
    "({format(n_raw, big.mark = ',')} rows) — ",
    "skipping read+bind+filter"))
}

# either resume path (parquet done, or ctd_raw checkpointed) disables
# evaluation of the expensive read+bind+filter chunks that follow
skip_read_bind <- parquet_complete || has_ctd_raw
knitr::opts_chunk$set(eval = !skip_read_bind)

5 Download and Unzip Files

Code
#' Download a CTD .zip and (optionally) extract it.
#'
#' @param url      full URL of the .zip file to fetch
#' @param dest_dir directory receiving both the zip and its extraction folder
#' @param zip_type file classification; only "final"/"preliminary" archives
#'   are extracted, others are kept zipped for archival completeness
#' @param unzip    logical: if FALSE, download only. (Previously this flag
#'   was ignored for final/preliminary archives, which were extracted
#'   unconditionally.)
#' @return invisibly NULL; called for its download/extract side effects
download_and_unzip <- function(url, dest_dir, zip_type, unzip = TRUE) {
  file_zip <- basename(url)
  dest_file <- file.path(dest_dir, file_zip)
  dir_unzip <- file.path(dest_dir, str_remove(file_zip, "\\.zip$"))

  if (file_exists(dest_file)) {
    message(glue("Already exists: {file_zip}"))
  } else {
    message(glue("Downloading: {file_zip}"))
    tryCatch(
      download.file(url, dest_file, mode = "wb"),
      error = function(e) {
        warning(glue("Failed to download {file_zip}: {e$message}"))
        # remove any partial download so a later run retries cleanly
        if (file_exists(dest_file)) {
          file_delete(dest_file)
        }
      }
    )
    # download failed (error handler ran): nothing to unzip
    if (!file_exists(dest_file)) return(invisible(NULL))
  }

  # honor the unzip flag for ALL zip types, including final/preliminary
  if (!unzip) {
    return(invisible(NULL))
  }
  if (dir_exists(dir_unzip)) {
    message(glue("Already unzipped: {file_zip}"))
  } else if (zip_type %in% c("final", "preliminary")) {
    message(glue("Unzipping: {file_zip}"))
    dir_create(dir_unzip, recurse = TRUE)
    # `unzip(...)` here resolves to the unzip *function*, not the logical
    # argument: R skips non-function bindings when a name is called
    unzip(dest_file, exdir = dir_unzip)
  } else {
    message(glue("Skipping unzip (not final/preliminary): {file_zip}"))
  }
  invisible(NULL)
}

# fetch every scraped zip; extraction is gated inside download_and_unzip
# to final/preliminary archives only
pwalk(d_zips, \(url, file_zip, zip_type, ...) {
  download_and_unzip(url, dir_dl, zip_type, unzip = TRUE)
})

message("Download and extraction complete!")

6 Find and Prioritize CTD Data Files

Code
# discover every extracted CSV and classify it by cruise, data stage
# (final vs preliminary), and cast direction (up/down)
d_csv <- tibble(
  path = list.files(
    dir_dl,
    pattern = "\\.csv$",
    recursive = TRUE,
    full.names = TRUE
  )
) |>
  mutate(
    file_csv = basename(path),
    # path relative to the download root; first component = unzip folder
    path_unzip = str_replace(path, glue("{dir_dl}/"), ""),
    dir_unzip = str_extract(path_unzip, "^[^/]+"),
    # cruise key (e.g. "2111SR") from the "NN-YYMMKK_..." folder pattern
    cruise_key = str_extract(
      path_unzip,
      "\\d{2}-(\\d{4}[A-Z0-9]{2,4})_.*",
      group = 1
    ),
    # case_when is order-sensitive: standard db-csv layouts first,
    # then layout edge cases
    data_stage = case_when(
      str_detect(path_unzip, "Final.*db[_|-]csv")  ~ "final",
      str_detect(path_unzip, "Prelim.*db[_|-]csv") ~ "preliminary",
      # prelim CSVs directly in unzipped dir (no db-csv subfolder)
      str_detect(dir_unzip, "Prelim") &
        path_unzip == paste0(dir_unzip, "/", file_csv) ~ "preliminary",
      # edge case: 2111SR has CSVs only in csvs-plots subfolder
      cruise_key == "2111SR" &
        str_detect(
          path_unzip,
          "Prelim.*csvs-plots.*2111SR.*csv"
        ) ~ "preliminary",
      .default = NA_character_
    ),
    # cast direction from filename suffix (U = upcast, D = downcast);
    # files matching neither get NA
    cast_dir = case_when(
      str_detect(file_csv, regex("U\\.csv$", ignore_case = TRUE)) ~ "U",
      str_detect(file_csv, regex("D\\.csv$", ignore_case = TRUE)) ~ "D"
    ),
    # lower number = preferred stage for per-cruise selection below
    priority = case_when(
      data_stage == "final" ~ 1,
      data_stage == "preliminary" ~ 2,
      TRUE ~ 3
    )
  ) |>
  relocate(cruise_key, path_unzip) |>
  arrange(cruise_key, path_unzip) |>
  filter(data_stage %in% c("final", "preliminary"))

# per cruise: find the best (lowest) priority among available stages,
# then keep only files at that stage — final beats preliminary
d_priority <- d_csv |>
  distinct(cruise_key, priority) |>
  group_by(cruise_key) |>
  summarize(best_priority = min(priority), .groups = "drop")

d_csv <- d_csv |>
  left_join(d_priority, by = "cruise_key") |>
  filter(priority == best_priority) |>
  select(-best_priority)

# sanity check: every cruise with CSVs must trace back to a scraped zip
cruises_csv_notzip <- sort(
  setdiff(unique(d_csv$cruise_key), unique(d_zips$cruise_key))
)
stopifnot(length(cruises_csv_notzip) == 0)

# the reverse may legitimately be non-empty (zips we chose not to extract)
cruises_zip_notcsv <- sort(
  setdiff(unique(d_zips$cruise_key), unique(d_csv$cruise_key))
)

# show the final ingest list, minus bulky list-columns
d_csv |>
  select(-any_of(c("path", "data", "col_empties", "col_types"))) |>
  relocate(cruise_key, path_unzip) |>
  arrange(cruise_key, path_unzip) |>
  dt(
    caption = "Files to Ingest",
    fname = "ctd_files_to_ingest"
  )

7 Read and Standardize CTD Files

Remove repeat header rows that can occur within files.

Code
# read each CSV, stripping any repeated header rows that occur mid-file
# (some exports repeat the header line inside the data)
d_csv <- d_csv |>
  arrange(basename(path)) |>
  mutate(
    data = map2(path, seq_along(path), \(path, idx) {
      message(glue("Reading {idx}/{nrow(d_csv)}: {basename(path)}"))

      all_lines <- read_lines(path)
      header_line <- all_lines[1]

      # find repeat header rows (excluding row 1)
      repeat_header_rows <- which(all_lines[-1] == header_line)

      if (length(repeat_header_rows) > 0) {
        # drop the repeats (+1 re-offsets indices from the [-1] above),
        # round-trip through a temp file so read_csv re-parses cleanly
        all_lines <- all_lines[-(repeat_header_rows + 1)]
        tmp_file <- tempfile(fileext = ".csv")
        write_lines(all_lines, tmp_file)
        data <- read_csv(tmp_file, guess_max = Inf, show_col_types = FALSE) |>
          clean_names()
        file_delete(tmp_file)
      } else {
        data <- read_csv(path, guess_max = Inf, show_col_types = FALSE) |>
          clean_names()
      }
      data
    }),
    # row count per file (0 if the read produced nothing)
    nrows = map_int(data, ~ if (is.null(.x)) 0 else nrow(.x))
  )

# report how many rows were read from each file
d_csv |>
  select(cruise_key, file_csv, nrows) |>
  arrange(cruise_key, file_csv) |>
  dt(
    caption = "Number of rows read per file",
    fname = "ctd_rows_per_file"
  ) |>
  formatCurrency("nrows", currency = "", digits = 0, mark = ",")

8 Detect and Correct Column Type Mismatches

Code
# per file: record columns that are entirely NA (col_empties), and the
# class of every remaining column (col_types) for type reconciliation
d_csv <- d_csv |>
  mutate(
    col_empties = map(data, \(x) {
      names(x)[map_lgl(x, \(col) all(is.na(col)))]
    }),
    col_types = map2(data, col_empties, \(x, empties) {
      tibble(
        col_name = names(x),
        col_type = map_chr(x, \(col) class(col)[1])
      ) |>
        filter(!col_name %in% empties)
    })
  )

# find most common type for each column across all files
# (majority vote; with_ties = FALSE picks a single winner deterministically)
d_types <- d_csv |>
  select(path, col_types) |>
  unnest(col_types) |>
  count(col_name, col_type) |>
  group_by(col_name) |>
  slice_max(n, n = 1, with_ties = FALSE) |>
  ungroup() |>
  select(col_name, expected_type = col_type)

# every (file, column) whose parsed type disagrees with the consensus
d_mismatches <- d_csv |>
  select(cruise_key, path, col_types) |>
  unnest(col_types) |>
  left_join(d_types, by = "col_name") |>
  filter(col_type != expected_type) |>
  arrange(col_name, path)

if (nrow(d_mismatches) > 0) {
  message("Type mismatches detected - converting columns...")
}

# bind data, converting mismatched columns to expected type.
# NOTE(review): the `<<-` below mutates the global d_mismatches from inside
# map2 (records NAs generated per coercion) — order-sensitive side effect.
d_bind <- d_csv |>
  mutate(
    data = map2(data, path, \(x, p) {
      # mismatches recorded for this specific file
      x_mismatches <- d_mismatches |>
        filter(path == p)

      if (nrow(x_mismatches) > 0) {
        # seq_len() is safe here (and idiomatic); nrow > 0 already checked
        for (i in seq_len(nrow(x_mismatches))) {
          col <- x_mismatches$col_name[i]
          expected <- x_mismatches$expected_type[i]
          na_before <- sum(is.na(x[[col]]))

          # coerce to the consensus type; warnings (e.g. NAs introduced
          # by coercion) are tallied explicitly below instead
          suppressWarnings({
            x[[col]] <- switch(
              expected,
              "numeric" = as.numeric(x[[col]]),
              "integer" = as.integer(x[[col]]),
              "logical" = as.logical(x[[col]]),
              "character" = as.character(x[[col]]),
              x[[col]]
            )
          })

          na_after <- sum(is.na(x[[col]]))
          na_generated <- na_after - na_before

          if (na_generated > 0) {
            message(glue(
              "  {basename(p)}: {col} ({x_mismatches$col_type[i]} -> {expected}) generated {na_generated} NAs"
            ))
          }

          # record NA count back into the global mismatch table for reporting
          d_mismatches[
            d_mismatches$path == p & d_mismatches$col_name == col,
            "nas_generated"
          ] <<- na_generated
        }
      }
      x
    })
  ) |>
  unnest(data)

# summarize coercions and the NAs they introduced, per column
if (nrow(d_mismatches) > 0) {
  d_mismatches |>
    group_by(col_name, expected_type, col_type) |>
    summarize(
      n_files = n(),
      total_nas = sum(nas_generated, na.rm = TRUE),
      files = paste(basename(path), collapse = "; "),
      .groups = "drop"
    ) |>
    arrange(desc(total_nas)) |>
    dt(
      caption = "Type mismatches by column",
      fname = "ctd_type_mismatches_by_column"
    ) |>
    formatCurrency(
      c("n_files", "total_nas"),
      currency = "",
      digits = 0,
      mark = ","
    )
}

# reconcile cruise_key from study where mismatched
# (trust the in-file `study` column over the filename-derived key,
#  except for the literal placeholder "Study")
d_bind <- d_bind |>
  mutate(
    cruise_key = if_else(
      !is.na(study) &
        cruise_key != study &
        study != "Study",
      study,
      cruise_key
    ),
    `_source_file` = path_unzip
  ) |>
  relocate(`_source_file`, .after = cruise_key) |>
  # drop file-bookkeeping and now-redundant columns before analysis
  select(
    -path_unzip,
    -path,
    -file_csv,
    -dir_unzip,
    -priority,
    -project,
    -study,
    -nrows,
    -col_empties,
    -col_types
  )

# fill missing ord_occ from cast_id (3-digit group after the underscore)
d_bind <- d_bind |>
  mutate(
    ord_occ = if_else(
      is.na(ord_occ) & !is.na(cast_id),
      str_extract(cast_id, "_(\\d{3})", group = 1),
      ord_occ
    )
  )

# save intermediate for resumability
write_rds(d_bind, glue("{dir_tmp}/d_bind_pre_datetime.rds"), compress = "gz")

9 Format Date-Time Column

Code
# resume from the pre-datetime intermediate
d_bind <- read_rds(glue("{dir_tmp}/d_bind_post_datetime.rds")) # NOTE(review): original reads d_bind_pre_datetime.rds — see body below
d_bind <- read_rds(glue("{dir_tmp}/d_bind_pre_datetime.rds"))

# classify each raw timestamp string by its format, then parse each format
# with the matching lubridate parser. datetime_utc starts as logical NA and
# is promoted by the first if_else that assigns a POSIXct branch.
d_bind <- d_bind |>
  mutate(
    date_time_utc = trimws(date_time_utc),
    date_time_format = case_when(
      # e.g. "05-Jan-2021 03:04:05"
      str_detect(
        date_time_utc,
        "^\\d{1,2}-[A-z]{3}-\\d{4} \\d{1,2}:\\d{1,2}:\\d{1,2}$"
      ) ~ "dmy_hms",
      # e.g. "1/5/2021 03:04"
      str_detect(
        date_time_utc,
        "^\\d{1,2}/\\d{1,2}/\\d{4} \\d{1,2}:\\d{1,2}$"
      ) ~ "mdy_hm",
      TRUE ~ "unknown"
    ),
    datetime_utc = NA,
    datetime_utc = if_else(
      date_time_format == "dmy_hms",
      suppressWarnings(dmy_hms(date_time_utc, tz = "UTC")),
      datetime_utc
    ),
    datetime_utc = if_else(
      date_time_format == "mdy_hm",
      suppressWarnings(mdy_hm(date_time_utc, tz = "UTC")),
      datetime_utc
    )
  ) |>
  relocate(date_time_format, datetime_utc, .after = date_time_utc) |>
  arrange(datetime_utc, depth)

# every row must have parsed; an "unknown" format would fail here
stopifnot(all(!is.na(d_bind$datetime_utc)))

write_rds(d_bind, glue("{dir_tmp}/d_bind_post_datetime.rds"), compress = "gz")

# report which format(s) each source file used, with its time span
d_bind |>
  group_by(cruise_key, `_source_file`, date_time_format) |>
  summarize(
    n_records = n(),
    datetime_min = min(datetime_utc),
    datetime_max = max(datetime_utc),
    .groups = "drop"
  ) |>
  arrange(cruise_key, `_source_file`, date_time_format) |>
  pivot_wider(
    names_from = date_time_format,
    values_from = n_records,
    values_fill = NA_real_
  ) |>
  relocate(dmy_hms, mdy_hm, .after = cruise_key) |>
  dt(
    caption = "Date-Time formats detected by cruise_key and source_file",
    fname = "ctd_datetime_formats"
  ) |>
  formatDate(c("datetime_min", "datetime_max"), method = "toLocaleString")

10 Pseudo-NA Values and Fill Missing Coordinates

Code
d_bind <- read_rds(glue("{dir_tmp}/d_bind_post_datetime.rds"))

# sentinel values used by instrument exports to encode "missing"
pseudoNA_values <- c(-9.99e-29, -99)

# project line/station to lon/lat ONCE for all rows (previously the same
# sf_project() call was issued twice, once per output coordinate); rows
# with NA or pseudo-NA line/sta simply get NA coordinates below
lnst_xy <- sf_project(
  from = "+proj=calcofi",
  to = "+proj=longlat +datum=WGS84",
  pts = cbind(
    x = as.numeric(d_bind$line),
    y = as.numeric(d_bind$sta)
  )
)

d_bind <- d_bind |>
  mutate(
    # a logical condition IS the flag; ifelse(cond, TRUE, FALSE) was a no-op
    # wrapper (NA propagates identically either way)
    is_lnsta_pseudoNA = as.numeric(line) == 0 & as.numeric(sta) == 0,
    is_lon_pseudoNA = map_lgl(lon_dec, ~ some(pseudoNA_values, near, .x)),
    is_lat_pseudoNA = map_lgl(lat_dec, ~ some(pseudoNA_values, near, .x)),
    # blank out the coordinate pair when either member is a sentinel
    lon_dec = if_else(
      is_lon_pseudoNA | is_lat_pseudoNA,
      NA_real_,
      lon_dec
    ),
    lat_dec = if_else(
      is_lon_pseudoNA | is_lat_pseudoNA,
      NA_real_,
      lat_dec
    ),
    # line/station-derived coordinates, valid only where line/sta are usable
    lnst_ok = !is.na(line) & !is.na(sta) & !is_lnsta_pseudoNA,
    lon_lnst = if_else(lnst_ok, as.numeric(lnst_xy[, 1]), NA_real_),
    lat_lnst = if_else(lnst_ok, as.numeric(lnst_xy[, 2]), NA_real_),
    # backfill missing decimal coordinates from line/station positions
    lon_dec = if_else(is.na(lon_dec), lon_lnst, lon_dec),
    lat_dec = if_else(is.na(lat_dec), lat_lnst, lat_dec)
  ) |>
  select(-lnst_ok)

# after backfill, every row must have usable coordinates
stopifnot(sum(is.na(d_bind$lon_dec)) == 0)
stopifnot(sum(is.na(d_bind$lat_dec)) == 0)

# tally pseudo-NA coordinate rows per cruise (already backfilled above)
d_pseudoNA_smry <- d_bind |>
  filter(is_lon_pseudoNA | is_lat_pseudoNA) |>
  group_by(cruise_key) |>
  summarize(
    n_lon_pseudoNA = sum(is_lon_pseudoNA),
    n_lat_pseudoNA = sum(is_lat_pseudoNA),
    .groups = "drop"
  )

d_pseudoNA_smry |>
  dt(
    caption = glue(
      "Cruises with pseudo-NAs ({paste(pseudoNA_values, collapse = ',')}) ",
      "set to NA, then filled from line/station coordinates."
    ),
    fname = "ctd_cruises_pseudoNA_lonlat"
  ) |>
  formatCurrency(
    c("n_lon_pseudoNA", "n_lat_pseudoNA"),
    currency = "",
    digits = 0,
    mark = ","
  )

11 Distance Filtering

Code
# maximum allowed distance between reported lon/lat and the position
# implied by line/station before a row is considered suspect
max_dist_dec_lnst_km <- 10

pts <- d_bind |>
  # primary geometry from reported decimal coordinates (TRUE/FALSE spelled
  # out instead of the reassignable T/F shorthands)
  st_as_sf(coords = c("lon_dec", "lat_dec"), remove = FALSE, crs = 4326) |>
  mutate(
    # secondary geometry from the line/station-derived coordinates
    geom_lnst = purrr::map2(lon_lnst, lat_lnst, \(x, y) st_point(c(x, y))) |>
      st_sfc(crs = 4326),
    # pairwise (row-by-row) distance between the two geometries, in km
    dist_dec_lnst_km = st_distance(geometry, geom_lnst, by_element = TRUE) |>
      units::set_units(km) |>
      units::drop_units(),
    is_dist_dec_lnst_within_max = dist_dec_lnst_km <= max_dist_dec_lnst_km
  ) |>
  # tag each point with its CalCOFI grid cell (NA = outside the grid)
  st_join(
    calcofi4r::cc_grid |>
      rename(any_of(c(site_key = "sta_key"))) |>
      select(grid_site = site_key),
    join = st_intersects
  )
# declare attributes constant over geometries to silence st_* warnings
st_agr(pts) <- "constant"

11.1 View sample cruise with excess distance points

Code
# known cruise with points far from their line/station positions,
# used as a worked example of the distance filter
badcr_id <- "1507OC"
pts_badcr <- pts |>
  filter(cruise_key == badcr_id)

bb_badcr <- st_bbox(pts_badcr) |> st_as_sfc()

# map a 1,000-point sample of the in-range points plus ALL out-of-range
# points, colored by distance from the line/station position
mapView(bb_badcr) +
  mapView(
    pts_badcr |>
      filter(is_dist_dec_lnst_within_max) |>
      slice_sample(n = 1000) |>
      bind_rows(
        pts_badcr |>
          filter(!is_dist_dec_lnst_within_max)
      ) |>
      select(
        cruise_key,
        datetime_utc,
        lon_dec,
        lat_dec,
        sta_id,
        line,
        sta,
        dist_dec_lnst_km,
        is_dist_dec_lnst_within_max
      ),
    layer.name = glue(
      "Cruise {badcr_id}<br>distance (km)<br>lon/lat to line/station"
    ),
    zcol = "dist_dec_lnst_km",
    cex = 5,
    alpha = 0.5
  )

11.2 Filter points

Code
# per-cruise summary of rows to be removed: outside the CalCOFI grid, or
# farther than the cutoff from their line/station position
# (na.rm spelled TRUE rather than the reassignable T)
d_pts_cruise_filt_smry <- pts |>
  st_drop_geometry() |>
  group_by(cruise_key) |>
  # only cruises that have at least one out-of-range point
  filter(any(!is_dist_dec_lnst_within_max)) |>
  summarize(
    n_all = n(),
    n_outside_grid = sum(is.na(grid_site)),
    pct_outside_grid = n_outside_grid / n_all,
    n_gt_cutoff = sum(!is_dist_dec_lnst_within_max, na.rm = TRUE),
    avg_dist_gt_cutoff_km = if_else(
      n_gt_cutoff > 0,
      mean(dist_dec_lnst_km[!is_dist_dec_lnst_within_max], na.rm = TRUE),
      NA_real_
    ),
    pct_gt_cutoff = n_gt_cutoff / n_all,
    # removed = beyond cutoff OR outside the grid
    n_rm = sum(!is_dist_dec_lnst_within_max | is.na(grid_site), na.rm = TRUE),
    pct_rm = n_rm / n_all,
    .groups = "drop"
  ) |>
  filter(n_rm > 0) |>
  arrange(desc(pct_rm))

# keep only in-range points that fall inside a CalCOFI grid cell
pts_filt <- pts |>
  filter(
    is_dist_dec_lnst_within_max,
    !is.na(grid_site)
  )

d_pts_cruise_filt_smry |>
  dt(
    caption = glue(
      "Cruises with rows filtered: outside CalCOFI grid or exceeded ",
      "{max_dist_dec_lnst_km} km cutoff from line/station coordinates."
    ),
    escape = F,
    fname = "ctd_cruises_distance_from_lnst"
  ) |>
  formatCurrency(
    c(
      "n_all",
      "n_gt_cutoff",
      "avg_dist_gt_cutoff_km",
      "n_outside_grid",
      "n_rm"
    ),
    currency = "",
    digits = 0,
    mark = ","
  ) |>
  formatPercentage(
    c("pct_gt_cutoff", "pct_outside_grid", "pct_rm"),
    digits = 2
  )

11.3 View filtered points

Code
# overview map: up to 100 sampled points per cruise after filtering
bb_cr <- st_bbox(pts_filt) |> st_as_sfc()

mapView(bb_cr) +
  mapView(
    pts_filt |>
      group_by(cruise_key) |>
      slice_sample(n = 100) |>
      ungroup() |>
      select(
        cruise_key,
        lon_dec,
        lat_dec,
        sta_id,
        line,
        sta,
        dist_dec_lnst_km,
        datetime_utc
      ),
    zcol = "cruise_key",
    cex = 5,
    alpha = 0.5
  )

11.4 Check for duplicates by datetime and depth

Code
# (datetime_utc, depth) pairs occurring on more than one row
d_dupes <- pts_filt |>
  st_drop_geometry() |>
  select(datetime_utc, depth) |>
  group_by(datetime_utc, depth) |>
  summarize(n = n(), .groups = "drop") |>
  filter(n > 1) |>
  arrange(desc(n))

n_distinct_dtime_depth <- pts_filt |>
  st_drop_geometry() |>
  select(datetime_utc, depth) |>
  n_distinct()

# show the first and last 10 duplicate pairs; head/tail + unique() avoids
# the old `(n() - 9):n()` which produced invalid (negative) indices when
# fewer than 10 duplicates existed and double-listed rows for 10 < n < 20
d_dupes |>
  slice(unique(c(
    head(seq_len(n()), 10),
    tail(seq_len(n()), 10)
  ))) |>
  dt(
    caption = glue(
      "First and last 10 duplicates by datetime_utc, depth ",
      "(nrows = {format(nrow(d_dupes), big.mark = ',')}; ",
      "n_distinct = {format(n_distinct_dtime_depth, big.mark = ',')})"
    ),
    fname = "ctd_duplicates_by_datetime_depth"
  )

12 Rename Fields

Apply field renames from flds_redefine.csv.

Code
# apply column renames from flds_redefine (fld_old -> fld_new)
renames <- d_flds_rd |>
  filter(fld_old != fld_new) |>
  select(fld_old, fld_new)

d_filt <- pts_filt |>
  st_drop_geometry() |>
  select(-geom_lnst)

# vectorized equivalent of renaming one column at a time:
# rename(any_of()) takes a `new = old` named vector and silently
# skips names not present in the data — same semantics as the old
# per-column `old %in% names(d_filt)` guard
rename_lookup <- setNames(renames$fld_old, renames$fld_new)
d_filt <- d_filt |>
  rename(any_of(rename_lookup))

# drop intermediate/QA columns that should not land in the database
cols_drop <- c(
  "date_time_pst",
  "date_time_utc",
  "date_time_format",
  "is_lon_pseudoNA",
  "is_lat_pseudoNA",
  "is_lnsta_pseudoNA",
  "lon_lnst",
  "lat_lnst",
  "dist_dec_lnst_km",
  "is_dist_dec_lnst_within_max",
  "grid_site"
)
d_filt <- d_filt |>
  select(-any_of(cols_drop))

# materialize the wide raw table in DuckDB for SQL-side normalization
dbWriteTable(con, "ctd_raw", d_filt, overwrite = TRUE)
message(glue("Loaded {nrow(d_filt)} rows into ctd_raw"))
Code
# if parquet already complete, skip computation chunks entirely;
# the upload_gcs and cleanup chunks re-enable themselves later
knitr::opts_chunk$set(eval = !parquet_complete)
if (parquet_complete) {
  message("Parquet complete — skipping to upload")
}

13 Cross-Dataset Bridge

Load reference tables (ship, cruise, grid) from the ichthyo ingest, derive ship_key, and validate against known ships/cruises.

Code
# load reference tables from ichthyo workflow's local parquet output
load_prior_tables(
  con = con,
  tables = c("ship", "cruise", "grid"),
  parquet_dir = here("data/parquet/swfsc_ichthyo")
)

# detect whether cruise_key already in YYYY-MM-NODC format (from checkpoint);
# one sampled key is assumed representative of the whole table
sample_ck <- dbGetQuery(
  con, "SELECT cruise_key FROM ctd_raw LIMIT 1")$cruise_key
cruise_key_already_converted <- grepl("^\\d{4}-\\d{2}-", sample_ck)

if (!cruise_key_already_converted) {
  # 0404NHJD is a combined New Horizon + Jordan Davis cruise; use primary ship NH
  dbExecute(
    con,
    "UPDATE ctd_raw SET cruise_key = LEFT(cruise_key, 6)
     WHERE LENGTH(cruise_key) > 6"
  )

  # derive ship_key from last 2 chars of cruise_key (YYMMKK format from filename)
  dbExecute(con, "ALTER TABLE ctd_raw ADD COLUMN IF NOT EXISTS ship_key VARCHAR")
  dbExecute(con, "UPDATE ctd_raw SET ship_key = RIGHT(cruise_key, 2)")

  # validate ship_key against ship reference table
  ctd_ships <- dbGetQuery(con, "SELECT DISTINCT ship_key FROM ctd_raw")
  ref_ships <- dbGetQuery(con, "SELECT ship_key FROM ship")
  orphan_ships <- setdiff(ctd_ships$ship_key, ref_ships$ship_key)

  if (length(orphan_ships) > 0) {
    message(glue(
      "{length(orphan_ships)} ship_key(s) in CTD not in ship table: ",
      "{paste(orphan_ships, collapse = ', ')}"
    ))

    # run match_ships() with centralized renames for orphan ships
    # (ship_name is NULL here: only the 2-char code is known from filenames)
    orphan_tbl <- dbGetQuery(con, glue(
      "SELECT DISTINCT ship_key AS ship_code, NULL AS ship_name
       FROM ctd_raw
       WHERE ship_key IN ({paste(shQuote(orphan_ships, type = 'sh'), collapse = ', ')})")) |>
      mutate(ship_name = as.character(ship_name))

    ship_result <- match_ships(
      unmatched_ships  = orphan_tbl,
      reference_ships  = dbReadTable(con, "ship"),
      ship_renames_csv = here("metadata/ship_renames.csv"),
      fetch_ices       = FALSE)

    # insert interim entries for still-unmatched ships (ship_nodc = "?XX?")
    ensure_interim_ships(con, ship_result)

    # report how many cruises/rows each orphan ship_key affects
    dbGetQuery(
      con,
      glue(
        "SELECT ship_key, COUNT(DISTINCT cruise_key) AS n_cruises,
                COUNT(*) AS n_rows
         FROM ctd_raw
         WHERE ship_key IN ({paste(shQuote(orphan_ships, type = 'sh'), collapse = ', ')})
         GROUP BY ship_key
         ORDER BY ship_key"
      )
    ) |>
      dt(
        caption = "Orphan ship_keys (in CTD but not in ship table)",
        fname = "ctd_orphan_ship_keys"
      )
  } else {
    message("All CTD ship_keys found in ship reference table")
  }

  # convert cruise_key from YYMMKK → YYYY-MM-NODC format, then swap the
  # converted column into place
  convert_cruise_key_format(con, "ctd_raw", old_key_col = "cruise_key")
  dbExecute(con, "ALTER TABLE ctd_raw DROP COLUMN cruise_key")
  dbExecute(con, "ALTER TABLE ctd_raw RENAME COLUMN cruise_key_new TO cruise_key")
} else {
  message("cruise_key already in YYYY-MM-NODC format (from checkpoint)")
}

# validate cruise_key against cruise reference table (runs on both paths)
ctd_cruises <- dbGetQuery(con, "SELECT DISTINCT cruise_key FROM ctd_raw")
ref_cruises <- dbGetQuery(con, "SELECT cruise_key FROM cruise")
orphan_cruises <- setdiff(ctd_cruises$cruise_key, ref_cruises$cruise_key)

if (length(orphan_cruises) > 0) {
  message(glue(
    "{length(orphan_cruises)} cruise_key(s) in CTD not in cruise table: ",
    "{paste(head(orphan_cruises, 10), collapse = ', ')}",
    "{if (length(orphan_cruises) > 10) '...' else ''}"
  ))
  dbGetQuery(
    con,
    glue(
      "SELECT cruise_key, COUNT(*) AS n_rows
       FROM ctd_raw
       WHERE cruise_key IN ({paste(shQuote(orphan_cruises, type = 'sh'), collapse = ', ')})
       GROUP BY cruise_key
       ORDER BY cruise_key"
    )
  ) |>
    dt(
      caption = "Orphan cruise_keys (in CTD but not in cruise table)",
      fname = "ctd_orphan_cruise_keys"
    )
} else {
  message("All CTD cruise_keys found in cruise reference table")
}

14 Save Checkpoint

Save a DuckDB checkpoint after the expensive read+bind+filter+bridge steps so subsequent runs can resume from here.

Code
if (file_exists(db_checkpoint)) {
  message(glue("Checkpoint already exists: {db_checkpoint}"))
} else {
  # close first so the DB file is flushed/consistent before copying
  close_duckdb(con)
  file_copy(db_path, db_checkpoint, overwrite = TRUE)
  # reopen and restore the extensions used by the remaining steps
  con <- get_duckdb_con(db_path)
  for (ext in c("spatial", "icu")) {
    load_duckdb_extension(con, ext)
  }
  message(glue("Saved checkpoint: {db_checkpoint}"))
}

15 Wide-Format Output (for ERDDAP)

Create a wide-format table preserving the original column layout for serving into ERDDAP. This is a supplemental output — not part of the normalized database.

Code
# identify measurement and quality columns from measurement_type metadata
# (rows registered to this dataset via the _source_datasets column)
d_meas_ctd <- d_meas_type |>
  filter(str_detect(`_source_datasets`, "calcofi_ctd-cast"))
meas_cols <- d_meas_ctd$`_source_column`
# quality-flag columns: drop NAs and empty strings
qual_cols <- d_meas_ctd$`_qual_column` |> na.omit() |> as.character()
qual_cols <- qual_cols[qual_cols != ""]

# get raw columns
raw_cols <- dbGetQuery(
  con,
  "SELECT column_name FROM information_schema.columns WHERE table_name = 'ctd_raw'"
)$column_name

# wide-format: cast metadata + depth + all measurement/quality columns
# exclude provenance and intermediate columns
# (intersect keeps only columns actually present in ctd_raw)
wide_keep <- intersect(
  raw_cols,
  c("cruise_key", "site_key", "cast_key", "cast_dir",
    "datetime_utc", "lon_dec", "lat_dec", "ship_key",
    "data_stage", "depth_m", meas_cols, qual_cols))

dbExecute(
  con,
  glue(
    "CREATE OR REPLACE TABLE ctd_wide AS
     SELECT {paste(wide_keep, collapse = ', ')}
     FROM ctd_raw
     ORDER BY cruise_key, site_key, cast_dir, depth_m")
)

n_wide <- dbGetQuery(con, "SELECT COUNT(*) AS n FROM ctd_wide")$n
message(glue("ctd_wide: {format(n_wide, big.mark = ',')} rows (supplemental, for ERDDAP)"))

16 Split into Tidy Tables

16.1 ctd_cast — cast-level metadata

Code
# d_meas_ctd, meas_cols, qual_cols already defined in ctd_wide chunk above

# use ctd_raw columns (includes ship_key and data_stage added via SQL)
raw_cols <- dbGetQuery(
  con,
  "SELECT column_name FROM information_schema.columns WHERE table_name = 'ctd_raw'"
)$column_name

# cast-level columns = everything except per-depth measurements/flags,
# the depth itself, and the provenance column
cast_cols <- setdiff(
  raw_cols,
  c(meas_cols, qual_cols, "depth_m", "_source_file")
)

# extract unique casts (SELECT DISTINCT collapses the per-depth rows)
dbExecute(
  con,
  glue(
    "
  CREATE OR REPLACE TABLE ctd_cast AS
  SELECT DISTINCT {paste(cast_cols, collapse = ', ')}
  FROM ctd_raw
  ORDER BY datetime_utc, cruise_key"
  )
)

n_cast <- dbGetQuery(con, "SELECT COUNT(*) AS n FROM ctd_cast")$n
message(glue("ctd_cast: {format(n_cast, big.mark = ',')} rows"))

# assign deterministic UUID from composite natural key (md5-based, DuckDB-native)
assign_deterministic_uuids_md5(
  con = con,
  table_name = "ctd_cast",
  id_col = "ctd_cast_uuid",
  key_cols = c("cruise_key", "cast_key", "cast_dir", "datetime_utc")
)

16.2 ctd_measurement — long-format sensor readings

Code
# add ctd_cast_uuid to ctd_raw for FK linkage
dbExecute(
  con,
  "
  ALTER TABLE ctd_raw ADD COLUMN IF NOT EXISTS ctd_cast_uuid VARCHAR"
)
# join back on the same natural key used to generate the cast UUIDs
dbExecute(
  con,
  "
  UPDATE ctd_raw AS r
  SET ctd_cast_uuid = c.ctd_cast_uuid
  FROM ctd_cast AS c
  WHERE r.cruise_key   = c.cruise_key
    AND r.cast_key     = c.cast_key
    AND r.cast_dir     = c.cast_dir
    AND r.datetime_utc = c.datetime_utc"
)

# pivot wide-to-long one measurement type at a time to avoid OOM
dbExecute(con, "DROP TABLE IF EXISTS ctd_measurement")

for (i in seq_len(nrow(d_meas_ctd))) {
  row <- d_meas_ctd[i, ]
  src_col <- row$`_source_column`
  meas_type <- row$measurement_type
  qual_col <- row$`_qual_column`

  # quality flag column is optional; emit SQL NULL when absent
  qual_expr <- if (!is.na(qual_col) && qual_col != "") {
    glue("CAST({qual_col} AS VARCHAR)")
  } else {
    "NULL"
  }

  # keep only non-NULL, finite, non-NaN values of this measurement
  sql_select <- glue(
    "
    SELECT ctd_cast_uuid, depth_m,
      '{meas_type}' AS measurement_type,
      CAST({src_col} AS DOUBLE) AS measurement_value,
      {qual_expr} AS measurement_qual
    FROM ctd_raw
    WHERE {src_col} IS NOT NULL
      AND NOT isnan(CAST({src_col} AS DOUBLE))
      AND isfinite(CAST({src_col} AS DOUBLE))
    "
  )

  # first type creates the table; later types append
  if (i == 1) {
    dbExecute(con, glue("CREATE TABLE ctd_measurement AS {sql_select}"))
  } else {
    dbExecute(con, glue("INSERT INTO ctd_measurement {sql_select}"))
  }
  message(glue("  {i}/{nrow(d_meas_ctd)}: {meas_type}"))
}

n_meas <- dbGetQuery(con, "SELECT COUNT(*) AS n FROM ctd_measurement")$n
message(glue("ctd_measurement: {format(n_meas, big.mark = ',')} rows"))

# assign deterministic UUID (md5-based, DuckDB-native — single SQL, no R data transfer)
assign_deterministic_uuids_md5(
  con = con,
  table_name = "ctd_measurement",
  id_col = "ctd_measurement_uuid",
  key_cols = c("ctd_cast_uuid", "depth_m", "measurement_type")
)

# add cruise_key for partitioned parquet output
dbExecute(
  con,
  "ALTER TABLE ctd_measurement ADD COLUMN IF NOT EXISTS cruise_key VARCHAR"
)
dbExecute(
  con,
  "UPDATE ctd_measurement AS m
   SET cruise_key = c.cruise_key
   FROM ctd_cast AS c
   WHERE m.ctd_cast_uuid = c.ctd_cast_uuid"
)

16.3 ctd_summary — summary stats across cast directions

Code
# summary stats per station/depth/measurement_type across cast directions;
# a single observation gets stddev 0 (STDDEV_SAMP would return NULL)
dbExecute(
  con,
  "
  CREATE OR REPLACE TABLE ctd_summary AS
  SELECT
    c.cruise_key,
    c.site_key,
    m.depth_m,
    m.measurement_type,
    AVG(m.measurement_value)    AS avg,
    CASE
      WHEN COUNT(*) = 1 THEN 0
      ELSE COALESCE(STDDEV_SAMP(m.measurement_value), 0)
    END                          AS stddev,
    COUNT(*)                     AS n_obs
  FROM ctd_measurement m
  INNER JOIN ctd_cast c ON m.ctd_cast_uuid = c.ctd_cast_uuid
  WHERE NOT isnan(m.measurement_value)
    AND isfinite(m.measurement_value)
  GROUP BY c.cruise_key, c.site_key, m.depth_m, m.measurement_type"
)

n_summ <- dbGetQuery(con, "SELECT COUNT(*) AS n FROM ctd_summary")$n
message(glue("ctd_summary: {format(n_summ, big.mark = ',')} rows"))

# deterministic UUID from the summary's natural key
assign_deterministic_uuids_md5(
  con = con,
  table_name = "ctd_summary",
  id_col = "ctd_summary_uuid",
  key_cols = c("cruise_key", "site_key", "depth_m", "measurement_type")
)

16.4 measurement_type — reference table

Code
# register the measurement_type reference table in DuckDB
dbWriteTable(con, "measurement_type", d_meas_type, overwrite = TRUE)
message(glue("measurement_type: {nrow(d_meas_type)} rows"))

# validate: every measurement_type emitted by this dataset must be registered
# for "calcofi_ctd-cast" in the reference table's `_source_datasets` column
ctd_types_used <- dbGetQuery(
  con,
  "SELECT DISTINCT measurement_type FROM ctd_measurement"
)$measurement_type

registered <- d_meas_type |>
  filter(str_detect(`_source_datasets`, "calcofi_ctd-cast")) |>
  pull(measurement_type)

# fail loudly AND name the offenders — a bare stopifnot() would only echo the
# condition, leaving the maintainer to hunt down which types are missing
unregistered <- setdiff(ctd_types_used, registered)
if (length(unregistered) > 0) {
  stop(
    glue(
      "measurement_type(s) used by calcofi_ctd-cast but not registered in ",
      "_source_datasets: {paste(unregistered, collapse = ', ')}"
    ),
    call. = FALSE
  )
}

16.5 Drop intermediate table

Code
# ctd_raw was only a staging table for normalization; drop it before export
dbExecute(con, "DROP TABLE IF EXISTS ctd_raw")

17 Schema Diagram

Code
# PK/FK relationship metadata, consumed by the ERD renderer below and later by
# build_relationships_json() when writing the relationships.json sidecar
ctd_rels <- list(
  primary_keys = list(
    ctd_cast         = "ctd_cast_uuid",
    ctd_measurement  = "ctd_measurement_uuid",
    ctd_summary      = "ctd_summary_uuid",
    measurement_type = "measurement_type"
  ),
  foreign_keys = list(
    list(
      table      = "ctd_measurement",
      column     = "ctd_cast_uuid",
      ref_table  = "ctd_cast",
      ref_column = "ctd_cast_uuid"
    )
  )
)

# render the entity-relationship diagram from the live schema + declared rels
cc_erd(con, rels = ctd_rels)

18 Add Spatial

Code
# add a POINT geometry column to ctd_cast built from decimal lon/lat columns
add_point_geom(con, "ctd_cast", lon_col = "lon_dec", lat_col = "lat_dec")

# grid already loaded in Cross-Dataset Bridge section
# assign each cast a grid cell key for cross-dataset spatial joins
assign_grid_key(con, "ctd_cast")

19 Derived View

Code
# derived view: local-time and calendar fields computed from datetime_utc.
# NOTE(review): 'US/Pacific' observes DST, so datetime_pst is actually local
# PST/PDT rather than fixed PST; and julian_day is day-of-year (DOY), not an
# astronomical Julian day — confirm naming with downstream consumers.
dbExecute(
  con,
  "
  CREATE OR REPLACE VIEW ctd_cast_derived AS
  SELECT *,
    datetime_utc AT TIME ZONE 'UTC' AT TIME ZONE 'US/Pacific' AS datetime_pst,
    EXTRACT(YEAR FROM datetime_utc)  AS year,
    EXTRACT(MONTH FROM datetime_utc) AS month,
    EXTRACT(DOY FROM datetime_utc)   AS julian_day
  FROM ctd_cast"
)

20 Load Dataset Metadata

Code
# register dataset-level metadata (from the repo's metadata CSV) into DuckDB
d_dataset <- here("metadata/dataset.csv") |>
  read_csv()
dbWriteTable(con, "dataset", d_dataset, overwrite = TRUE)
message(glue("dataset: {nrow(d_dataset)} datasets registered"))

21 Validate and Enforce Types

Code
# run calcofi4db release-gate validations against the assembled tables
validate_for_release(con)
# coerce columns to the types declared in the field metadata (d_flds_rd)
enforce_column_types(con, d_flds_rd = d_flds_rd)

22 Preview Tables

Code
# interactive previews of the final tables (ctd_wide is the supplemental
# wide-format export, see the parquet-writing step)
tables_to_preview <- c(
  "ctd_cast",
  "ctd_measurement",
  "ctd_summary",
  "measurement_type",
  "ctd_wide"
)
preview_tables(con, tables = tables_to_preview)

23 Write Parquet

Code
# gather known key mismatches so they can be recorded in the parquet manifest
ship_mismatches <- collect_ship_mismatches(con, "ctd_cast")
mtype_mismatches <- collect_measurement_type_mismatches(
  con, here("metadata/measurement_type.csv"))
cruise_mismatches <- collect_cruise_key_mismatches(con, "ctd_cast")

mismatches <- list(
  ships             = ship_mismatches,
  measurement_types = mtype_mismatches,
  cruise_keys       = cruise_mismatches
)

# write parquet outputs; the large long-format tables are hive-partitioned
# by cruise_key, and ctd_wide is flagged as a supplemental product
parquet_stats <- write_parquet_outputs(
  con          = con,
  output_dir   = dir_parquet,
  tables       = c("ctd_cast", "ctd_measurement", "ctd_summary",
                    "measurement_type", "ctd_wide"),
  partition_by = list(
    ctd_measurement = "cruise_key",
    ctd_summary     = "cruise_key",
    ctd_wide        = "cruise_key"
  ),
  strip_provenance = FALSE,
  mismatches       = mismatches,
  supplemental     = "ctd_wide"
)

# interactive table of per-file parquet stats
parquet_stats |>
  dt(fname = "ctd_parquet_stats")

24 Write Metadata JSON

Code
# write the metadata JSON sidecar (table/field descriptions, provider,
# dataset info) alongside the parquet outputs
build_metadata_json(
  con = con,
  d_tbls_rd = d_tbls_rd,
  d_flds_rd = d_flds_rd,
  metadata_derived_csv = glue("{dir_meta}/metadata_derived.csv"),
  output_dir = dir_parquet,
  provider = provider,
  dataset = dataset
)

# write relationships.json sidecar with PKs/FKs
# (ctd_rels declared in the Schema Diagram section above)
build_relationships_json(
  rels       = ctd_rels,
  output_dir = dir_parquet,
  provider   = provider,
  dataset    = dataset
)

25 Upload to GCS

Code
# always re-enable eval for upload and cleanup
# (presumably earlier chunks toggle eval off when resuming from a
# checkpoint — confirm against the Setup/checkpoint sections)
knitr::opts_chunk$set(eval = TRUE)
Code
# mirror local parquet/metadata outputs to the calcofi-db GCS bucket;
# per the action/local_md5 columns in the output below, unchanged files
# (matching md5) are skipped and only new/changed files are uploaded
sync_to_gcs(
  local_dir = dir_parquet,
  gcs_prefix = glue("ingest/{dir_label}"),
  bucket = "calcofi-db"
)
# A tibble: 294 × 3
   file                                                         action local_md5
   <chr>                                                        <chr>  <chr>    
 1 ctd_cast.parquet                                             skipp… 10d056e0…
 2 ctd_measurement/cruise_key=__HIVE_DEFAULT_PARTITION__/data_… uploa… 8b00aa99…
 3 ctd_measurement/cruise_key=1998-02-31JD/data_0.parquet       uploa… b3ef155b…
 4 ctd_measurement/cruise_key=1998-04-31JD/data_0.parquet       uploa… 296864d1…
 5 ctd_measurement/cruise_key=1998-07-32NM/data_0.parquet       uploa… 653a324a…
 6 ctd_measurement/cruise_key=2003-02-31JD/data_0.parquet       uploa… 6905484c…
 7 ctd_measurement/cruise_key=2003-07-32NM/data_0.parquet       uploa… db861d74…
 8 ctd_measurement/cruise_key=2003-10-32NM/data_0.parquet       uploa… 80cf2e3d…
 9 ctd_measurement/cruise_key=2004-01-31JD/data_0.parquet       uploa… aa76e504…
10 ctd_measurement/cruise_key=2004-04-31JD/data_0.parquet       uploa… 576a961e…
# ℹ 284 more rows

26 Cleanup

Code
# shut down the primary DuckDB connection
close_duckdb(con)

# remove checkpoint after successful completion (fresh next run)
# a leftover checkpoint signals an interrupted run; clearing it here means
# the next invocation rebuilds from scratch rather than resuming
if (file_exists(db_checkpoint)) {
  file_delete(db_checkpoint)
  message(glue("Removed checkpoint: {db_checkpoint}"))
}

27 Appendix: Interactive Gantt Chart

Code
# reconnect for gantt chart
con_pq <- get_duckdb_con()
load_duckdb_extension(con_pq, "spatial")

# read ctd_cast from parquet (exclude GEOMETRY columns for R driver compat)
# LIMIT 0 makes a zero-row "probe" view — schema only, no data scanned — so
# column names/types can be inspected via information_schema below
dbExecute(con_pq, glue("
  CREATE OR REPLACE VIEW _ctd_probe AS
  SELECT * FROM read_parquet('{dir_parquet}/ctd_cast.parquet') LIMIT 0"))
[1] 0
Code
# columns whose DuckDB type is GEOMETRY (not transferable via the R driver)
pq_geom_cols <- dbGetQuery(con_pq,
  "SELECT column_name FROM information_schema.columns
   WHERE table_name = '_ctd_probe' AND data_type LIKE 'GEOMETRY%'")$column_name
# full column list in declared order
pq_all_cols <- dbGetQuery(con_pq,
  "SELECT column_name FROM information_schema.columns
   WHERE table_name = '_ctd_probe'
   ORDER BY ordinal_position")$column_name
# driver-safe columns = everything except geometry
pq_safe_cols <- setdiff(pq_all_cols, pq_geom_cols)
# probe served its purpose once the column lists are captured
dbExecute(con_pq, "DROP VIEW IF EXISTS _ctd_probe")
[1] 0
Code
# view over the parquet file selecting only driver-safe (non-geometry) columns
dbExecute(con_pq, glue("
  CREATE VIEW ctd_cast AS
  SELECT {paste(pq_safe_cols, collapse = ', ')}
  FROM read_parquet('{dir_parquet}/ctd_cast.parquet')"))
[1] 0
Code
# per cruise/stage date span, computed in DuckDB (dbplyr) then collected;
# spans with unparseable dates are dropped
cruise_spans <- tbl(con_pq, "ctd_cast") |>
  group_by(cruise_key, data_stage) |>
  summarize(
    beg_date = min(datetime_utc, na.rm = TRUE) |> as.Date(),
    end_date = max(datetime_utc, na.rm = TRUE) |> as.Date(),
    .groups = "drop"
  ) |>
  collect() |>
  filter(!is.na(beg_date), !is.na(end_date))

# split multi-year cruises into one row per calendar year, clipping each
# row's span to its year so gantt bars stay within day-of-year [1, 365]
cruise_spans <- cruise_spans |>
  rowwise() |>
  mutate(years_spanned = list(year(beg_date):year(end_date))) |>
  unnest(years_spanned) |>
  mutate(year = years_spanned) |>
  mutate(
    # first year keeps the true begin date; later years start Jan 1
    adj_beg_date = if_else(
      years_spanned == year(beg_date),
      beg_date,
      as.Date(paste0(years_spanned, "-01-01"))
    ),
    # last year keeps the true end date; earlier years end Dec 31
    adj_end_date = if_else(
      years_spanned == year(end_date),
      end_date,
      as.Date(paste0(years_spanned, "-12-31"))
    )
  ) |>
  mutate(
    begin_jday  = yday(adj_beg_date),
    end_jday    = yday(adj_end_date),
    stage_label = data_stage,
    # hover shows the ORIGINAL (unclipped) cruise span and duration
    hover_text = glue(
      "Cruise: {cruise_key}<br>",
      "Stage: {data_stage}<br>",
      "Begin: {format(beg_date, '%Y-%m-%d')}<br>",
      "End: {format(end_date, '%Y-%m-%d')}<br>",
      "Duration: {as.numeric(difftime(end_date, beg_date, units = 'days'))} days"
    )
  ) |>
  ungroup() |>
  arrange(adj_beg_date)

# stage → bar color
colors <- c("final" = "#23d355ff", "preliminary" = "#A23B72")

p <- plot_ly()

# one line trace per cruise-year row; legend entries for each stage are added
# separately below, so these traces all have showlegend = FALSE.
# seq_len() instead of 1:nrow(): 1:nrow yields c(1, 0) on an empty frame and
# the loop body would then error — seq_len(0) correctly skips the loop.
for (i in seq_len(nrow(cruise_spans))) {
  row <- cruise_spans[i, ]
  p <- p |>
    add_trace(
      type = "scatter",
      mode = "lines",
      x = c(row$begin_jday, row$end_jday),
      y = c(row$year, row$year),
      line = list(
        color = colors[row$stage_label],
        width = 8
      ),
      text = row$hover_text,
      hoverinfo = "text",
      showlegend = FALSE,
      legendgroup = row$stage_label
    )
}

# invisible all-NA traces exist only to create one legend entry per stage
# (the per-cruise traces above are drawn with showlegend = FALSE)
add_legend_entry <- function(plt, stage_key, label) {
  plt |>
    add_trace(
      type = "scatter",
      mode = "lines",
      x = c(NA, NA),
      y = c(NA, NA),
      line = list(color = colors[stage_key], width = 8),
      name = label,
      showlegend = TRUE
    )
}

p <- p |>
  add_legend_entry("final", "Final") |>
  add_legend_entry("preliminary", "Preliminary")

# final layout: day-of-year on x, year on y (reversed so earliest year is
# at the top), legend floated to the right of the plot area
p <- p |>
  layout(
    title = "CalCOFI CTD Cruise Timeline",
    xaxis = list(
      title = "Day of Year",
      range = c(1, 365),
      dtick = 30,  # one tick roughly per month
      tickangle = 0
    ),
    yaxis = list(
      title = "Year",
      autorange = "reversed",  # earliest year on top, like a schedule
      dtick = 1
    ),
    hovermode = "closest",
    plot_bgcolor = "#f8f9fa",
    paper_bgcolor = "white",
    legend = list(
      x = 1.02,  # just outside the right edge of the plot area
      y = 1,
      xanchor = "left",
      yanchor = "top"
    )
  )

p
Code
# release the parquet-reading DuckDB connection used for the appendix chart
close_duckdb(con_pq)
Code
devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.5.2 (2025-10-31)
 os       macOS Sequoia 15.7.1
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       Europe/Rome
 date     2026-04-02
 pandoc   3.8.3 @ /opt/homebrew/bin/ (via rmarkdown)
 quarto   1.8.25 @ /usr/local/bin/quarto

─ Packages ───────────────────────────────────────────────────────────────────
 !  package            * version    date (UTC) lib source
    abind                1.4-8      2024-09-12 [1] CRAN (R 4.5.0)
    arrow                23.0.1.1   2026-02-24 [1] CRAN (R 4.5.2)
    assertthat           0.2.1      2019-03-21 [1] CRAN (R 4.5.0)
    backports            1.5.0      2024-05-23 [1] CRAN (R 4.5.0)
    base64enc            0.1-6      2026-02-02 [1] CRAN (R 4.5.2)
    bit                  4.6.0      2025-03-06 [1] CRAN (R 4.5.0)
    bit64                4.6.0-1    2025-01-16 [1] CRAN (R 4.5.0)
    blob                 1.3.0      2026-01-14 [1] CRAN (R 4.5.2)
    broom                1.0.12     2026-01-27 [1] CRAN (R 4.5.2)
    bslib                0.10.0     2026-01-26 [1] CRAN (R 4.5.2)
    cachem               1.1.0      2024-05-16 [1] CRAN (R 4.5.0)
 VP calcofi4db         * 2.5.1      2026-04-02 [?] load_all() (on disk 2.5.0)
 VP calcofi4r          * 1.1.5      2026-03-26 [?] Github (calcofi/calcofi4r@a1d14ec) (on disk 1.1.4)
    chromote             0.5.1      2025-04-24 [1] CRAN (R 4.5.0)
    class                7.3-23     2025-01-01 [1] CRAN (R 4.5.2)
    classInt             0.4-11     2025-01-08 [1] CRAN (R 4.5.0)
    cli                  3.6.5      2025-04-23 [1] CRAN (R 4.5.0)
    codetools            0.2-20     2024-03-31 [1] CRAN (R 4.5.2)
    crayon               1.5.3      2024-06-20 [1] CRAN (R 4.5.0)
    crosstalk            1.2.2      2025-08-26 [1] CRAN (R 4.5.0)
    curl                 7.0.0      2025-08-19 [1] CRAN (R 4.5.0)
    data.table           1.18.2.1   2026-01-27 [1] CRAN (R 4.5.2)
    DBI                * 1.3.0      2026-02-25 [1] CRAN (R 4.5.2)
    dbplyr               2.5.2      2026-02-13 [1] CRAN (R 4.5.2)
    desc                 1.4.3      2023-12-10 [1] CRAN (R 4.5.0)
    devtools             2.5.0      2026-03-14 [1] CRAN (R 4.5.2)
    DiagrammeR           1.0.11     2024-02-02 [1] CRAN (R 4.5.0)
    DiagrammeRsvg        0.1        2016-02-04 [1] CRAN (R 4.5.0)
    digest               0.6.39     2025-11-19 [1] CRAN (R 4.5.2)
    dm                   1.1.1      2026-03-05 [1] CRAN (R 4.5.2)
    dplyr              * 1.2.0      2026-02-03 [1] CRAN (R 4.5.2)
    DT                 * 0.34.0     2025-09-02 [1] CRAN (R 4.5.0)
    duckdb               1.5.0      2026-03-14 [1] CRAN (R 4.5.2)
    dygraphs             1.1.1.6    2018-07-11 [1] CRAN (R 4.5.0)
    e1071                1.7-17     2025-12-18 [1] CRAN (R 4.5.2)
    ellipsis             0.3.2      2021-04-29 [1] CRAN (R 4.5.0)
    evaluate             1.0.5      2025-08-27 [1] CRAN (R 4.5.0)
    farver               2.1.2      2024-05-13 [1] CRAN (R 4.5.0)
    fastmap              1.2.0      2024-05-15 [1] CRAN (R 4.5.0)
    fs                 * 2.0.1      2026-03-24 [1] CRAN (R 4.5.2)
    fuzzyjoin            0.1.8      2026-02-20 [1] CRAN (R 4.5.2)
    gargle               1.6.1      2026-01-29 [1] CRAN (R 4.5.2)
    generics             0.1.4      2025-05-09 [1] CRAN (R 4.5.0)
    geojsonsf            2.0.5      2025-11-26 [1] CRAN (R 4.5.2)
    ggplot2            * 4.0.2      2026-02-03 [1] CRAN (R 4.5.2)
    glue               * 1.8.0      2024-09-30 [1] CRAN (R 4.5.0)
    googledrive          2.1.2      2025-09-10 [1] CRAN (R 4.5.0)
    gtable               0.3.6      2024-10-25 [1] CRAN (R 4.5.0)
    here               * 1.0.2      2025-09-15 [1] CRAN (R 4.5.0)
    highcharter          0.9.4      2022-01-03 [1] CRAN (R 4.5.0)
    hms                  1.1.4      2025-10-17 [1] CRAN (R 4.5.0)
    htmltools            0.5.9      2025-12-04 [1] CRAN (R 4.5.2)
    htmlwidgets          1.6.4      2023-12-06 [1] CRAN (R 4.5.0)
    httpuv               1.6.17     2026-03-18 [1] CRAN (R 4.5.2)
    httr               * 1.4.8      2026-02-13 [1] CRAN (R 4.5.2)
    httr2                1.2.2      2025-12-08 [1] CRAN (R 4.5.2)
    igraph               2.2.2      2026-02-12 [1] CRAN (R 4.5.2)
    isoband              0.3.0      2025-12-07 [1] CRAN (R 4.5.2)
    janitor            * 2.2.1      2024-12-22 [1] CRAN (R 4.5.0)
    jquerylib            0.1.4      2021-04-26 [1] CRAN (R 4.5.0)
    jsonlite             2.0.0      2025-03-27 [1] CRAN (R 4.5.0)
    KernSmooth           2.23-26    2025-01-01 [1] CRAN (R 4.5.2)
    knitr                1.51       2025-12-20 [1] CRAN (R 4.5.2)
    later                1.4.8      2026-03-05 [1] CRAN (R 4.5.2)
    lattice              0.22-9     2026-02-09 [1] CRAN (R 4.5.2)
    lazyeval             0.2.2      2019-03-15 [1] CRAN (R 4.5.0)
    leafem               0.2.5      2025-08-28 [1] CRAN (R 4.5.0)
    leaflet              2.2.3      2025-09-04 [1] CRAN (R 4.5.0)
    librarian            1.8.1      2021-07-12 [1] CRAN (R 4.5.0)
    lifecycle            1.0.5      2026-01-08 [1] CRAN (R 4.5.2)
    lubridate          * 1.9.5      2026-02-04 [1] CRAN (R 4.5.2)
    magrittr             2.0.4      2025-09-12 [1] CRAN (R 4.5.0)
    mapgl                0.4.5.9000 2026-03-31 [1] Github (bbest/mapgl@74df3b1)
    mapview            * 2.11.4     2025-09-08 [1] CRAN (R 4.5.0)
    markdown             2.0        2025-03-23 [1] CRAN (R 4.5.0)
    Matrix               1.7-5      2026-03-21 [1] CRAN (R 4.5.2)
    memoise              2.0.1      2021-11-26 [1] CRAN (R 4.5.0)
    mgcv                 1.9-4      2025-11-07 [1] CRAN (R 4.5.0)
    mime                 0.13       2025-03-17 [1] CRAN (R 4.5.0)
    nlme                 3.1-168    2025-03-31 [1] CRAN (R 4.5.2)
    otel                 0.2.0      2025-08-29 [1] CRAN (R 4.5.0)
    pillar               1.11.1     2025-09-17 [1] CRAN (R 4.5.0)
    pkgbuild             1.4.8      2025-05-26 [1] CRAN (R 4.5.0)
    pkgconfig            2.0.3      2019-09-22 [1] CRAN (R 4.5.0)
    pkgload              1.5.0      2026-02-03 [1] CRAN (R 4.5.2)
    plotly             * 4.12.0     2026-01-24 [1] CRAN (R 4.5.2)
    png                  0.1-9      2026-03-15 [1] CRAN (R 4.5.2)
    processx             3.8.6      2025-02-21 [1] CRAN (R 4.5.0)
    promises             1.5.0      2025-11-01 [1] CRAN (R 4.5.0)
    proxy                0.4-29     2025-12-29 [1] CRAN (R 4.5.2)
    ps                 * 1.9.1      2025-04-12 [1] CRAN (R 4.5.0)
    purrr              * 1.2.1      2026-01-09 [1] CRAN (R 4.5.2)
    quantmod             0.4.28     2025-06-19 [1] CRAN (R 4.5.0)
    R6                   2.6.1      2025-02-15 [1] CRAN (R 4.5.0)
    rappdirs             0.3.4      2026-01-17 [1] CRAN (R 4.5.2)
    raster               3.6-32     2025-03-28 [1] CRAN (R 4.5.0)
    RColorBrewer         1.1-3      2022-04-03 [1] CRAN (R 4.5.0)
    Rcpp                 1.1.1      2026-01-10 [1] CRAN (R 4.5.2)
    readr              * 2.2.0      2026-02-19 [1] CRAN (R 4.5.2)
    rlang                1.1.7      2026-01-09 [1] CRAN (R 4.5.2)
    rlist                0.4.6.2    2021-09-03 [1] CRAN (R 4.5.0)
    rmarkdown            2.31       2026-03-26 [1] CRAN (R 4.5.2)
    rnaturalearth        1.2.0      2026-01-19 [1] CRAN (R 4.5.2)
    rnaturalearthhires   1.0.0.9000 2025-10-02 [1] Github (ropensci/rnaturalearthhires@e4736f6)
    RPostgres            1.4.10     2026-02-16 [1] CRAN (R 4.5.2)
    rprojroot            2.1.1      2025-08-26 [1] CRAN (R 4.5.0)
    rstudioapi           0.18.0     2026-01-16 [1] CRAN (R 4.5.2)
    rvest              * 1.0.5      2025-08-29 [1] CRAN (R 4.5.0)
    S7                   0.2.1      2025-11-14 [1] CRAN (R 4.5.2)
    sass                 0.4.10     2025-04-11 [1] CRAN (R 4.5.0)
    satellite            1.0.6      2025-08-21 [1] CRAN (R 4.5.0)
    scales               1.4.0      2025-04-24 [1] CRAN (R 4.5.0)
    sessioninfo          1.2.3      2025-02-05 [1] CRAN (R 4.5.0)
    sf                 * 1.1-0      2026-02-24 [1] CRAN (R 4.5.2)
    shiny                1.13.0     2026-02-20 [1] CRAN (R 4.5.2)
    shinyWidgets         0.9.1      2026-03-09 [1] CRAN (R 4.5.2)
    snakecase            0.11.1     2023-08-27 [1] CRAN (R 4.5.0)
    sp                   2.2-1      2026-02-13 [1] CRAN (R 4.5.2)
    stars                0.7-1      2026-02-13 [1] CRAN (R 4.5.2)
    stringi              1.8.7      2025-03-27 [1] CRAN (R 4.5.0)
    stringr            * 1.6.0      2025-11-04 [1] CRAN (R 4.5.0)
    terra                1.9-11     2026-03-26 [1] CRAN (R 4.5.2)
    tibble             * 3.3.1      2026-01-11 [1] CRAN (R 4.5.2)
    tidyr              * 1.3.2      2025-12-19 [1] CRAN (R 4.5.2)
    tidyselect           1.2.1      2024-03-11 [1] CRAN (R 4.5.0)
    timechange           0.4.0      2026-01-29 [1] CRAN (R 4.5.2)
    TTR                  0.24.4     2023-11-28 [1] CRAN (R 4.5.0)
    tzdb                 0.5.0      2025-03-15 [1] CRAN (R 4.5.0)
    units                1.0-1      2026-03-11 [1] CRAN (R 4.5.2)
    usethis              3.2.1      2025-09-06 [1] CRAN (R 4.5.0)
    utf8                 1.2.6      2025-06-08 [1] CRAN (R 4.5.0)
    uuid                 1.2-2      2026-01-23 [1] CRAN (R 4.5.2)
    V8                   8.0.1      2025-10-10 [1] CRAN (R 4.5.0)
    vctrs                0.7.2      2026-03-21 [1] CRAN (R 4.5.2)
    viridisLite          0.4.3      2026-02-04 [1] CRAN (R 4.5.2)
    visNetwork           2.1.4      2025-09-04 [1] CRAN (R 4.5.0)
    vroom                1.7.0      2026-01-27 [1] CRAN (R 4.5.2)
    websocket            1.4.4      2025-04-10 [1] CRAN (R 4.5.0)
    withr                3.0.2      2024-10-28 [1] CRAN (R 4.5.0)
    xfun                 0.57       2026-03-20 [1] CRAN (R 4.5.2)
    xml2                 1.5.2      2026-01-17 [1] CRAN (R 4.5.2)
    xtable               1.8-8      2026-02-22 [1] CRAN (R 4.5.2)
    xts                  0.14.2     2026-02-28 [1] CRAN (R 4.5.2)
    yaml                 2.3.12     2025-12-10 [1] CRAN (R 4.5.2)
    zip                * 2.3.3      2025-05-13 [1] CRAN (R 4.5.0)
    zoo                  1.8-15     2025-12-15 [1] CRAN (R 4.5.2)

 [1] /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library

 * ── Packages attached to the search path.
 V ── Loaded and on-disk version mismatch.
 P ── Loaded and on-disk path mismatch.

──────────────────────────────────────────────────────────────────────────────