Ingest NOAA CalCOFI Database

Published

2025-07-02

Overview

Goal: Generate the database from source files with workflow scripts to make updating easier and provenance fully transparent. This allows us to:

  • Rename tables and column names, control data types and use Unicode encoding for a consistent database ingestion strategy, per Database naming conventions in Database – CalCOFI.io Docs.

  • Differentiate between raw and derived or updated tables and columns. For instance, the taxonomy for any given species can change over time, such as the lumping or splitting of a given taxon, and varies by taxonomic authority (e.g., WoRMS, ITIS or GBIF). These taxonomic identifiers and the full taxonomic hierarchy should be updated regularly regardless of the source observational data, and can either be updated in the table directly or joined one-to-one with a separate table in a materialized view (so as not to slow down queries, as a regular view would); see the sketch below.
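
For example, a minimal sketch of the materialized-view approach (using the con_dev connection created below; the taxon_worms lookup table and its columns are hypothetical, for illustration only):

Code
# Sketch only: keep the taxonomy in its own, independently updated table and
# join it one-to-one in a materialized view, so routine queries avoid the
# per-query join cost of a regular view.
dbExecute(con_dev, '
  CREATE MATERIALIZED VIEW IF NOT EXISTS species_taxonomy AS
  SELECT
    s.sp_id,
    t.aphia_id,                        -- e.g., WoRMS identifier
    t.kingdom, t.phylum, t.class, t."order", t.family, t.genus
  FROM species s
  LEFT JOIN taxon_worms t USING (sp_id)')

# refresh after the taxonomy lookup changes
dbExecute(con_dev, "REFRESH MATERIALIZED VIEW species_taxonomy")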

This workflow processes NOAA CalCOFI database CSV files and updates the PostgreSQL database. The workflow:

  1. Reads CSV files from source directory
  2. Compares with lookup tables for field descriptions
  3. Initializes or updates database tables
  4. Generates summary statistics
Code
graph TD
    A[Read CSV Files] --> B[Extract Column Names]
    B --> C[Compare with Lookups]
    C --> D[Process Field Descriptions]
    D --> E{Table Exists?}
    E -->|Yes| F[Update Table]
    E -->|No| G[Initialize Table]
    F --> H[Summary Stats]
    G --> H

Overview diagram of CSV ingestion process into the database.

See also 5.3 Ingest datasets with documentation – Database – CalCOFI.io Docs for a generic overview of the ingestion process.

Code
# devtools::install_local(here::here("../calcofi4db"), force = T)
# devtools::load_all(here::here("../calcofi4db"))
# options(error=NULL)
librarian::shelf(
  calcofi4db, DBI, dm, dplyr, DT, fs, glue, gargle, googledrive, here, janitor,
  jsonlite, knitr, lubridate, purrr, readr, rlang, stringr, tibble, tidyr, uuid,
  quiet = T)
options(readr.show_col_types = F)

# get database connection
con     <- get_db_con(c("dev", "dev_ref"))
con_dev <- get_db_con(c("dev"))

# define paths ---
dataset  <- "calcofi-db"
provider <- "swfsc.noaa.gov"

# data directory with CSV files
# dir_data <- "/Users/bbest/My Drive/projects/calcofi/data"
# same data folder as Google Drive URL
# gdir_data <- "https://drive.google.com/drive/u/0/folders/13A9yDJCdV0M-lrMLqurzIb3G3ozknr9O"

# Load data using calcofi4db package
d <- read_csv_files(provider, dataset)

# Extract variables from data_info for backward compatibility
# workflow_info <- data_info$workflow_info
# workflow      <- workflow_info$workflow
# workflow_qmd  <- workflow_info$workflow_qmd
# workflow_url  <- workflow_info$workflow_url
# 
# dir_csv <- data_info$paths$dir_csv
# dir_ingest <- dirname(data_info$paths$dir_ingest)
# tbls_in_csv <- data_info$paths$tbls_in_csv
# flds_in_csv <- data_info$paths$flds_in_csv
# tbls_rn_csv <- data_info$paths$tbls_rn_csv
# flds_rn_csv <- data_info$paths$flds_rn_csv
# 
# d_gdir_data <- data_info$d_gdir_data
# d_tbls_rn <- data_info$d_tbls_rn
# d_flds_rn <- data_info$d_flds_rn
# d <- data_info$csv_metadata$data

0.1 Check for any mismatched tables and fields

Code
# detect changes between csv files and redefinitions
changes <- detect_csv_changes(d)

# print summary statistics
print_csv_change_stats(changes, verbose = TRUE)
CSV Change Summary:
==================
Tables added:    3
Tables removed:  0
Fields added:    1 (across 1 tables)
Fields removed:  1 (across 1 tables)
Type mismatches: 0 (across 0 tables)

Tables Added:
  + eggstage
  + larvaesize
  + larvaestage

Fields Added:
  cruise:
    + ship

Fields Removed:
  cruise:
    - Ship
Code
# display interactive table of changes
if (nrow(changes$summary) > 0) {
  display_csv_changes(changes, format = "DT", title = "CSV vs Redefinition Mismatches")
}
Code
# check if there are any mismatches that need to be resolved
if (nrow(changes$summary) > 0) {
  # disable evaluation of remaining chunks
  knitr::opts_chunk$set(eval = FALSE)
  
  # output markdown warning
  cat(glue("
    
    ## ⚠️ Data Integrity Check Failed
    
    ### Workflow Halted
    
    Mismatches have been detected between the CSV files and redefinition metadata. 
    These must be resolved before proceeding with database ingestion to ensure data integrity.
    
    ### Required Actions
    
    Please review the changes detected above and update the following redefinition files:
    
    - **Tables redefinition**: `{d$paths$tbls_rd_csv}`
    - **Fields redefinition**: `{d$paths$flds_rd_csv}`
    
    ### Common Resolutions
    
    1. **New tables/fields in CSV**: Add them to the appropriate redefinition file
    2. **Removed tables/fields from CSV**: Remove obsolete entries from redefinition files
    3. **Type mismatches**: Update field types in redefinition files to match CSV data types
    4. **Field name changes**: Update `fld_old` entries to match current CSV field names
    
    ### Next Steps
    
    After updating the redefinition files, re-run this workflow. The remaining code chunks 
    have been disabled and will not execute until all mismatches are resolved.
    
    ---
    
    *Note: The remainder of this document contains code that will not be executed due to 
    data integrity issues.*
    "))
} else {
  # no mismatches found - enable evaluation and show success message
  knitr::opts_chunk$set(eval = TRUE)
  
  cat(glue("
    
    ## ✅ Data Integrity Check Passed
    
    ### All Systems Go
    
    No mismatches were found between the CSV files and redefinition metadata. 
    The data structures are properly aligned and ready for database ingestion.
    
    ### Proceeding with Workflow
    
    The workflow will now continue with the following steps:
    
    1. Display CSV file metadata from Google Drive
    2. Show tables and fields to be ingested
    3. Apply data transformations based on redefinitions
    4. Load transformed data into the database
    5. Create indexes and relationships
    6. Add spatial data and generate reports
    
    ---
    
    "))
}

1 ⚠️ Data Integrity Check Failed

1.1 Workflow Halted

Mismatches have been detected between the CSV files and redefinition metadata. These must be resolved before proceeding with database ingestion to ensure data integrity.

1.2 Required Actions

Please review the changes detected above and update the following redefinition files:

  • Tables redefinition: /Users/bbest/Github/CalCOFI/workflows/ingest/swfsc.noaa.gov/calcofi-db/tbls_redefine.csv
  • Fields redefinition: /Users/bbest/Github/CalCOFI/workflows/ingest/swfsc.noaa.gov/calcofi-db/flds_redefine.csv

1.3 Common Resolutions

  1. New tables/fields in CSV: Add them to the appropriate redefinition file
  2. Removed tables/fields from CSV: Remove obsolete entries from redefinition files
  3. Type mismatches: Update field types in redefinition files to match CSV data types
  4. Field name changes: Update fld_old entries to match current CSV field names

1.4 Next Steps

After updating the redefinition files, re-run this workflow. The remaining code chunks have been disabled and will not execute until all mismatches are resolved.


Note: The remainder of this document contains code that will not be executed due to data integrity issues.
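
For example, the mismatch detected above is a rename of the cruise table's Ship column to ship in the source CSV, so the fields redefinition file needs its fld_old entry updated to match. A minimal sketch of that fix (assuming the redefinition file stores the table name in a tbl column and the original CSV column name in fld_old; exact column names may differ):

Code
# Sketch only: point fld_old at the current CSV header ("Ship" -> "ship")
flds_rd_csv <- d$paths$flds_rd_csv

read_csv(flds_rd_csv) |>
  mutate(
    fld_old = if_else(tbl == "cruise" & fld_old == "Ship", "ship", fld_old)) |>
  write_csv(flds_rd_csv)

The three newly detected tables (eggstage, larvaesize, larvaestage) would likewise need rows added to both redefinition files before re-running the workflow.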

1.5 Show CSV Files on Google Drive

Code
show_googledrive_files(d)

1.6 Show CSV Tables and Fields to Ingest

Code
d$d_csv$tables |> 
  datatable(caption = "Tables to ingest.")
Code
d$d_csv$fields |> 
  datatable(caption = "Fields to ingest.")

2 Show tables and fields redefined

Code
show_tables_redefine(d)
Code
show_fields_redefine(d)

3 Apply remappings to data

Code
# Use calcofi4db to transform data
transformed_data <- transform_data(d)

# For backward compatibility
d <- transformed_data

4 Load Tables into Database

Code
schema    <- "dev"
overwrite <- T

# Use calcofi4db to ingest tables and generate metadata
# First, check for changes (this is now done above)
# changes <- detect_csv_changes(
#   con = con,
#   schema = schema,
#   transformed_data = transformed_data,
#   d_flds_rd = d$d_flds_rd
# )

# Display changes
if (length(changes$new_tables) > 0) {
  message("New tables to be added:")
  message(paste(" -", changes$new_tables, collapse = "\n"))
}

if (length(changes$field_changes) > 0) {
  message("\nTables with field changes:")
  for (tbl in names(changes$field_changes)) {
    message(glue(" - {tbl}:"))
    if (length(changes$field_changes[[tbl]]$added) > 0) {
      message(glue("   Added: {paste(changes$field_changes[[tbl]]$added, collapse = ', ')}"))
    }
    if (length(changes$field_changes[[tbl]]$removed) > 0) {
      message(glue("   Removed: {paste(changes$field_changes[[tbl]]$removed, collapse = ', ')}"))
    }
  }
}

if (length(changes$type_changes) > 0) {
  message("\nTables with type changes:")
  for (tbl in names(changes$type_changes)) {
    message(glue(" - {tbl}:"))
    for (fld in names(changes$type_changes[[tbl]])) {
      message(glue("   {fld}: {changes$type_changes[[tbl]][[fld]]$from} -> {changes$type_changes[[tbl]][[fld]]$to}"))
    }
  }
}

# Ingest data to database
tbl_stats <- ingest_csv_to_db(
  con = con,
  schema = schema,
  transformed_data = transformed_data,
  d_flds_rd = d$d_flds_rd,
  d_gdir_data = d$d_gdata,
  workflow_info = d$workflow_info,
  overwrite = overwrite
)

# Display summary statistics
tbl_stats |> 
  datatable(rownames = FALSE, filter = "top")

TODO:

5 Create Indexes, Relationships

Code
# source(here("../apps_dev/libs/db.R"))

tbls_dev <- dbListTables(con_dev) |> sort()

dm_dev <- dm_from_con(
  con_dev, 
  table_names = tbls_dev,
  learn_keys  = T)
dm_draw(dm_dev, view_type = "all")

5.1 Show existing unique keys across tables

Code
dm_get_all_uks(dm_dev)

5.2 Add candidate keys, automatically

Code
# Function to get primary key constraints for a table
tbl_pkeys <- function(con, tbl) {
  dbGetQuery(con, glue("
    SELECT a.attname as column_name
    FROM pg_index i
    JOIN pg_attribute a ON a.attrelid = i.indrelid
    AND a.attnum = ANY(i.indkey)
    WHERE i.indrelid = '{tbl}'::regclass
    AND i.indisprimary;
  "))$column_name
}

d_pk <- tibble(
  tbl = tbls_dev) |> 
  mutate(
    d        = map(tbl, \(x) eval(bquote(dm_enum_pk_candidates(dm_dev, .(x)))) ),
    n        = map_int(d, \(x) x |> filter(candidate == T) |> nrow()),
    flds     = map_chr(d, \(x){
      x |> filter(candidate == T) |> pull(columns) |> 
        unlist() |> paste(collapse = "; ") }),
    fld      = map_chr(d, \(x){
      y <- x |> filter(candidate == T) |> pull(columns) |> 
        unlist()
      if (length(y) == 0)
        return(NA)
      if (length(y) > 1)
        return(y[1])
      y }),
    add_pkey = map2_lgl(tbl, fld, \(tbl, fld){
      if (is.na(fld))
        return(F)
      # e.g., tbl <- "cruise"; fld <- "cruise_uuid"
      pkey_exists <- tbl_pkeys(con_dev, tbl) |> length() > 0
      if (pkey_exists)
        return(F)
      dbExecute(con_dev, glue("ALTER TABLE {tbl} ADD PRIMARY KEY ({fld})"))
      return(T) }) ) |> 
  select(-d)
    
d_pk |> 
  datatable()

5.3 Add primary keys, manually

Noticing that the egg and larva tables are missing primary keys, add them manually.

Code
dbExecute(con_dev, "ALTER TABLE egg ADD PRIMARY KEY (net_uuid, sp_id)")
dbExecute(con_dev, "ALTER TABLE larva ADD PRIMARY KEY (net_uuid, sp_id)")

tibble(
  tbl = tbls_dev) |> 
  mutate(
    pkeys = map_chr(tbl, \(x) tbl_pkeys(x, con = con_dev) |> paste(collapse = ", "))) |> 
  datatable()

5.4 Add other indexes

Add indexes on identifier fields that are not already the first field of a primary key (the leading column of a composite primary-key index is already usable for lookups, so indexing it again would be redundant).

Code
d_idx <- tibble(
  tbl = tbls_dev) |>
  mutate(
    fld   = map(tbl, ~dbListFields(.x, conn = con_dev)),
    pkeys = map(tbl, ~tbl_pkeys(.x, con = con_dev))) |> 
  unnest(fld) |>
  mutate(
    is_pkey  = map2_lgl(fld, pkeys, `%in%`),
    is_pkey1 = map2_lgl(fld, pkeys, \(fld, pkeys) length(pkeys) > 0 && fld == pkeys[1]),
    is_id    = map_lgl(fld, ~str_detect(.x, "_(id|uuid|tsn)$")),
    idx_todo = !is_pkey1 & is_id,
    mk_idx   = pmap_lgl(list(tbl, fld, idx_todo), \(tbl, fld, idx_todo){
      if (idx_todo){
        q <- glue("CREATE INDEX IF NOT EXISTS idx_{tbl}_{fld} ON {tbl} ({fld})")
        # message(q)
        dbExecute(con_dev, q)
        return(T)
      }
      return(F)
    }))

d_idx |> 
  select(-pkeys) |> 
  datatable()

5.5 Add foreign keys

Foreign keys follow the chain:

  • egg.net_uuid -> net.net_uuid
  • net.tow_uuid -> tow.tow_uuid
  • tow.site_uuid -> site.site_uuid
  • site.cruise_uuid -> cruise.cruise_uuid
  • cruise.ship_key -> ship.ship_key

Code
dm_dev <- dm_from_con(
  con_dev, 
  table_names = tbls_dev,
  learn_keys  = T)
dm_draw(dm_dev, view_type = "all")

dm_dev_fk <- dm_dev |> 
  dm_add_fk(egg, net_uuid, net) |> 
  dm_add_fk(larva, net_uuid, net) |> 
  dm_add_fk(net, tow_uuid, tow) |> 
  dm_add_fk(tow, site_uuid, site) |> 
  dm_add_fk(site, cruise_uuid, cruise) |> 
  dm_add_fk(cruise, ship_key, ship) |> 
  dm_add_fk(tow, tow_type_key, tow_type) |> 
  dm_add_fk(egg, sp_id, species) |> 
  dm_add_fk(larva, sp_id, species) 
dm_draw(dm_dev_fk, view_type = "all")
Code
add_fkey <- function(tbl_m, fld_m, tbl_1, fld_1 = fld_m, schema = "dev"){
  q <- glue(
    "ALTER TABLE {schema}.{tbl_m} ADD FOREIGN KEY ({fld_m}) 
    REFERENCES {schema}.{tbl_1} ({fld_1})")
  dbExecute(con_dev, q)
}

add_fkey("egg", "net_uuid", "net")
add_fkey("larva", "net_uuid", "net") 
add_fkey("net", "tow_uuid", "tow") 
add_fkey("tow", "site_uuid", "site")
add_fkey("site", "cruise_uuid", "cruise")
add_fkey("cruise", "ship_key", "ship")
add_fkey("tow", "tow_type_key", "tow_type")
add_fkey("egg", "sp_id", "species")
add_fkey("larva", "sp_id", "species")

dm_dev <- dm_from_con(
  con_dev, 
  table_names = tbls_dev,
  learn_keys  = T)
dm_draw(dm_dev, view_type = "all")
Error: Failed to fetch row : ERROR:  insert or update on table "larva" violates foreign key constraint "larva_sp_id_fkey"
DETAIL:  Key (sp_id)=(3023) is not present in table "species".
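
This foreign key fails because some larva records reference sp_id values with no match in species; the next chunk exports those orphan records for review, deletes them, and re-applies the constraint.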
Code
sp_missing_csv <- here(
    "data/ingest/noaa-calcofi-db/larvae-species_not-in-species.csv")

d_sp_missing <- tbl(con_dev, "larva") |> 
  anti_join(
    tbl(con_dev, "species"),
    by = "sp_id") |> 
  group_by(sp_id) |> 
  summarize(
    n_rows = n()) |> 
  left_join(
    dm_dev |>
      dm_select_tbl(larva, net, tow, site) |> 
      dm_flatten_to_tbl(.start = larva, .recursive = TRUE) |> 
      select(sp_id, tally, time_start, line, station, cruise_uuid) |> 
      group_by(sp_id) |> 
      summarize(
        sum_tally  = sum(tally),
        date_beg   = min(as.Date(time_start)),
        date_end   = max(as.Date(time_start)),
        # ship_names = str_flatten(distinct(name), collapse = "; ")), 
        n_cruises = n_distinct(cruise_uuid)), 
    by = "sp_id") |> 
  collect()

write_csv(d_sp_missing, sp_missing_csv)

datatable(d_sp_missing)
Code
dbExecute(con_dev, glue(
  "DELETE FROM larva WHERE sp_id IN ({paste(d_sp_missing$sp_id, collapse = ',')})"))
add_fkey("larva", "sp_id", "species")

dm_dev <- dm_from_con(
  con_dev, 
  table_names = tbls_dev,
  learn_keys  = T)
dm_draw(dm_dev, view_type = "all")

6 Add Spatial

6.1 Add site.geom

Code
# source(here("../apps_dev/libs/db.R"))
# dbGetQuery(con_dev, "SELECT postgis_full_version()")

dbExecute(con_dev, "ALTER TABLE dev.site ADD COLUMN geom geometry(Point, 4326)")
dbExecute(con_dev, "UPDATE dev.site SET geom = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)") # 61,220 rows

6.2 Fix calcofi4r grid

Problems with calcofi4r::cc_grid:

  • uses old station (line, position) vs newer site (line, station)
  • sta_lin, sta_pos: integer, so drops necessary decimal that is found in sta_key
  • sta_lin == 90, sta_pos == 120 repeats for:
    • sta_pattern == ‘historical’ (sta_dpos == 20); and
    • sta_pattern == ‘standard’ (sta_dpos == 10)
Code
librarian::shelf(
  calcofi4r, dplyr, mapview, sf, tidyr, units)

cc_grid_v2 <- calcofi4r::cc_grid |> 
  select(sta_key, shore = sta_shore, pattern = sta_pattern, spacing = sta_dpos) |>
  separate_wider_delim(
    sta_key, ",", names = c("line", "station"), cols_remove = F) |> 
  mutate(
    line    = as.double(line),
    station = as.double(station),
    grid_key = ifelse(
      pattern == "historical", 
      glue("st{station}-ln{line}_hist"),
      glue("st{station}-ln{line}")),
    zone = glue("{shore}-{pattern}")) |> 
  relocate(grid_key, station) |> 
  st_as_sf() |> 
  mutate(
    area_km2 = st_area(geom) |>
      set_units(km^2) |>
      as.numeric())

cc_grid_ctrs_v2 <- calcofi4r::cc_grid_ctrs |> 
  select(sta_key, pattern = sta_pattern) |> 
  left_join(
    cc_grid_v2 |> 
      st_drop_geometry(),
    by = c("sta_key", "pattern")) |> 
  select(-sta_key) |> 
  relocate(grid_key)
  
cc_grid_v2 <- cc_grid_v2 |> 
  select(-sta_key)

cc_grid_v2 |> 
  st_drop_geometry() |> 
  datatable()

mapview(cc_grid_v2, zcol="zone") +
  mapview(cc_grid_ctrs_v2, cex = 1)
Code
grid <- cc_grid_v2 |> 
  as.data.frame() |> 
  left_join(
    cc_grid_ctrs_v2 |> 
      as.data.frame() |> 
      select(grid_key, geom_ctr = geom),
    by = "grid_key") |> 
  st_as_sf(sf_column_name = "geom")

grid |> 
  st_write(con_dev, "grid")
dbExecute(con_dev, glue(
  "CREATE INDEX IF NOT EXISTS idx_grid_geom ON grid USING gist(geom);"))
dbExecute(con_dev, "ALTER TABLE grid ADD PRIMARY KEY (grid_key)")

6.3 Update site.grid_key

Code
dbExecute(con_dev, "ALTER TABLE site ADD COLUMN grid_key text")

dbExecute(con_dev, "
  UPDATE site
  SET
    grid_key = g.grid_key
  FROM (
    SELECT s.site_uuid, g.grid_key
    FROM site s
    INNER JOIN grid g ON ST_Intersects(s.geom, g.geom)) g
  WHERE site.site_uuid = g.site_uuid") # 59,136

# tbl(con_dev, "site") |>
#   mutate(is_in_grd = !is.na(grid_key)) |>
#   pull(is_in_grd) |>
#   table()
#  FALSE    TRUE 
#  2,084  59,136

dbExecute(con_dev, glue(
  "CREATE INDEX IF NOT EXISTS idx_site_geom_key ON grid (grid_key);"))
add_fkey("site", "grid_key", "grid")

dm_from_con(
  con_dev, 
  # table_names = dbListTables(con_dev_only),
  table_names = c("site", "grid"),
  learn_keys  = T) |> 
  dm_set_colors(green = grid) |>
  dm_draw(view_type = "all")

6.4 Add site_seg: segments from site

Code
site_seg <- tbl(con_dev, "site") |> 
  select(cruise_uuid, orderocc, site_uuid, lon = longitude, lat = latitude) |> 
  left_join(
    tbl(con_dev, "tow") |> 
      select(site_uuid, time_start),
    by = "site_uuid") |>
  arrange(cruise_uuid, orderocc) |> 
  group_by(
    cruise_uuid, orderocc, site_uuid, lon, lat) |> 
  summarize(
    time_beg = min(time_start, na.rm = T),
    time_end = max(time_start, na.rm = T),
    .groups = "drop") |> 
  collect()
# table(is.na(site_seg$time_beg))
#   FALSE   TRUE 
#  57,935  3,285
site_seg <- site_seg |>   
  arrange(cruise_uuid, orderocc, time_beg) |> 
  group_by(cruise_uuid) |> 
  mutate(
    site_uuid_beg = lag(site_uuid),
    lon_beg       = lag(lon),
    lat_beg       = lag(lat),
    time_beg      = lag(time_beg)) |> 
  ungroup() |> 
  filter(!is.na(lon_beg), !is.na(lat_beg)) |> 
  mutate(
    m    = pmap(
      list(lon_beg, lat_beg, lon, lat), 
      \(x1, y1, x2, y2){
        matrix(c(x1, y1, x2, y2), nrow = 2, byrow = T) }),
    geom = map(m, st_linestring)) |> 
  select(
    cruise_uuid,
    site_uuid_beg, 
    site_uuid_end = site_uuid, 
    lon_beg, 
    lat_beg,
    lon_end = lon, 
    lat_end = lat,
    time_beg, 
    time_end, 
    geom) |> 
  st_as_sf(
    sf_column_name = "geom", 
    crs = 4326) |> 
  mutate(
    time_hr   = as.numeric(difftime(time_end, time_beg, units = "hours")),
    length_km = st_length(geom) |>
      set_units(km) |>
      as.numeric(),
    km_per_hr = length_km / time_hr)

# TODO: check that time_end of previous segment is time_beg of next
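# A possible continuity check (sketch, commented out like the TODOs above):
# within each cruise, a segment's time_beg should equal the previous
# segment's time_end.
# site_seg |>
#   st_drop_geometry() |>
#   arrange(cruise_uuid, time_beg) |>
#   group_by(cruise_uuid) |>
#   mutate(
#     gap_hr = as.numeric(difftime(time_beg, lag(time_end), units = "hours"))) |>
#   ungroup() |>
#   filter(!is.na(gap_hr), gap_hr != 0)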

# TODO: investigate too slow, too fast
# site_seg |> 
#   mutate(
#     km_per_hr_flag = ifelse(
#       km_per_hr < 0.01,
#       "lt0.01",
#       ifelse(
#         km_per_hr > 30,
#         "gt30",
#         NA))) |> 
#   pull(km_per_hr_flag) |> 
#   table()
#   gt30 lt0.01 
#    928      9

fld_types <- tibble(
  lst = lapply(site_seg, class)) |> 
  mutate(
    fld    = names(lst),
    type_r = map_chr(lst, pluck, 1),
    type   = map2_chr(fld, type_r, \(fld, type_r){
      case_when(
        str_detect(fld, "uuid") ~ "uuid",
        fld    == "geom"        ~ "geometry",
        type_r == "POSIXct"     ~ "timestamp",
        type_r == "character"   ~ "varchar",
        .default = "numeric") })) |> 
  select(fld, type) |> 
  deframe()
st_write(site_seg, con_dev, "site_seg", field.types = fld_types)
dbExecute(con_dev, "CREATE INDEX IF NOT EXISTS idx_site_seg_geom ON grid USING gist(geom)")
dbExecute(con_dev, "ALTER TABLE site_seg ADD PRIMARY KEY (site_uuid_beg)")
dbExecute(con_dev, 'CREATE EXTENSION IF NOT EXISTS "uuid-ossp"')
add_fkey("site_seg", "site_uuid_beg", "site", "site_uuid")
add_fkey("site_seg", "site_uuid_end", "site", "site_uuid")
add_fkey("site_seg", "cruise_uuid", "cruise")

dm_from_con(
  con_dev, 
  # table_names = dbListTables(con_dev_only),
  table_names = c("site_seg", "site", "cruise"),
  learn_keys  = T) |> 
  dm_set_colors(green = site_seg) |>
  dm_draw(view_type = "all")

site_seg <- st_read(con_dev, "site_seg") |> 
  mutate(
    year = year(time_beg))

mapView(site_seg, zcol = "year")

7 Report

Code
dm_from_con(
  con_dev, 
  table_names = dbListTables(con_dev),
  # table_names = c("site_seg", "site", "cruise"),
  learn_keys  = T) |> 
  dm_set_colors(lightgreen = c(site_seg, grid)) |>
  dm_draw(view_type = "all")

d_eff <- tbl(con_dev, "site_seg") |> 
  mutate(
    year = year(time_beg)) |> 
  group_by(year) |> 
  summarize(
    time_hr   = sum(time_hr, na.rm = T),
    length_km = sum(length_km, na.rm = T)) |> 
  collect()

sum(d_eff$time_hr, na.rm = T)    # 302,516 hours; 12,604 days; 34.5 years
sum(d_eff$length_km, na.rm = T)  # 3,672,794 km

8 Cleanup

Code
# Close database connections
dbDisconnect(con)
dbDisconnect(con_dev)

9 TODO