Ingest CalCOFI Bird & Mammal Census

Published

2026-06-08

1 Overview

Source: whales-seabirds-turtles/bird-mammal-census/ (CCE-LTER DataZoo 255, PI Bill Sydeman). Bird & mammal observations along CalCOFI/NMFS/CPR cruise transects, 1987-2021.

  • Provider: calcofi (tentative — curated via CCE-LTER; see questions)
  • Tables: bird_mammal_transect (effort) ⨝ bird_mammal_observation (counts) on gis_key, plus bird_mammal_species (ITIS taxonomy) and bird_mammal_behavior lookups.
Code
graph LR
  T[transects] --> TR[bird_mammal_transect<br/>effort + position]
  O[observations] --> OB[bird_mammal_observation<br/>counts]
  OB -.gis_key.-> TR
  OB -.species_code.-> SP[bird_mammal_species]
  OB -.behavior_code.-> BH[bird_mammal_behavior]

gis_key

species_code

behavior_code

transects

bird_mammal_transect
effort + position

observations

bird_mammal_observation
counts

bird_mammal_species

bird_mammal_behavior

2 Setup

Code
devtools::load_all(here::here("../calcofi4db"))
devtools::load_all(here::here("../calcofi4r"))
librarian::shelf(
  CalCOFI/calcofi4db, CalCOFI/calcofi4r,
  DBI, dplyr, DT, fs, glue, here, janitor, jsonlite, knitr,
  lubridate, purrr, readr, sf, stringr, tibble, tidyr, units, quiet = T)
options(readr.show_col_types = F)
options(DT.options = list(scrollX = TRUE))
source(here("libs/ingest.R"))

cc           <- read_calcofi_meta(here("ingest_calcofi_bird_mammal_census.qmd"))
provider     <- cc$provider
dataset      <- cc$dataset
tables_owned <- cc$tables_owned
dir_label    <- glue("{provider}_{dataset}")
dir_parquet  <- here(glue("data/parquet/{dir_label}"))
db_path      <- here(glue("data/wrangling/{dir_label}.duckdb"))
dir_src      <- path_expand(glue("{dir_data}/whales-seabirds-turtles/bird-mammal-census"))

if (overwrite) {
  if (file_exists(db_path))                 file_delete(db_path)
  if (file_exists(paste0(db_path, ".wal"))) file_delete(paste0(db_path, ".wal"))
}
dir_create(dirname(db_path))
con <- get_duckdb_con(db_path)
load_duckdb_extension(con, "spatial")

3 Read Source Data

Code
stopifnot("source dir not found" = dir_exists(dir_src))
sync_to_gcs(local_dir = dir_src, gcs_prefix = glue("archive/{provider}/{dataset}"),
            bucket = "calcofi-files-public", exclude = c(".DS_Store", "*.tmp", "*.gdoc"))
# A tibble: 5 × 4
  file                                          action     size reason        
  <chr>                                         <chr>     <dbl> <chr>         
1 CalCOFI_bird-mammal-census_allspecieslist.csv skipped   12560 checksum match
2 CalCOFI_bird-mammal-census_behaviorcodes.csv  skipped      74 checksum match
3 CalCOFI_bird-mammal-census_observations.csv   skipped 2073695 checksum match
4 CalCOFI_bird-mammal-census_transects.csv      skipped 9728860 checksum match
5 README.txt                                    skipped    3462 checksum match
Code
rd <- function(f) read_csv(file.path(dir_src, glue("CalCOFI_bird-mammal-census_{f}.csv")),
                           col_types = cols(.default = "c")) |> clean_names()
d_tr <- rd("transects"); d_ob <- rd("observations")
d_sp <- rd("allspecieslist"); d_bh <- rd("behaviorcodes")
cat(glue("transects {nrow(d_tr)}, observations {nrow(d_ob)}, species {nrow(d_sp)}, behaviors {nrow(d_bh)}"), "\n")
transects 60715, observations 82418, species 200, behaviors 4 
Code
cat("transect clean names:", paste(names(d_tr), collapse=", "), "\n")
transect clean names: gis_key, cruise, transect_number, bin_number, date, time_sec, latitude_start_o, longitude_start_o, latitude_mid_o, longitude_mid_o, latitude_stop_o, longitude_stop_o, length_m, width_m, area_m2, depth_m, julian_date, julian_day, svy, season 

4 Build Lookups (species, behavior)

Code
b01 <- function(x) !is.na(x) & x %in% c("1","TRUE","true","Y","y")

d_species <- d_sp |>
  transmute(
    species_code = species, common_name, scientific_name = latin_name,
    itis_id = suppressWarnings(as.integer(itis)),
    is_bird = b01(bird), is_mammal = b01(mammal), is_fish = b01(fish),
    is_large_bird = b01(large_bird), is_unidentified = b01(unidentified),
    include_flag = b01(include), nmfs_code = nmfs, comment)
dbWriteTable(con, "bird_mammal_species", d_species, overwrite = TRUE)

d_behavior <- d_bh |> transmute(behavior_code = id, behavior, description)
dbWriteTable(con, "bird_mammal_behavior", d_behavior, overwrite = TRUE)
cat(glue("species {nrow(d_species)}, behaviors {nrow(d_behavior)}"), "\n")
species 200, behaviors 4 

5 Build Transect (effort) Table

Code
d_transect <- d_tr |>
  transmute(
    gis_key,
    cruise_label    = cruise,
    transect_number = suppressWarnings(as.integer(transect_number)),
    bin_number      = suppressWarnings(as.integer(bin_number)),
    date            = suppressWarnings(as.Date(date)),
    time_sec        = suppressWarnings(as.numeric(time_sec)),
    latitude        = suppressWarnings(as.numeric(latitude_mid_o)),
    longitude       = suppressWarnings(as.numeric(longitude_mid_o)),
    latitude_start  = suppressWarnings(as.numeric(latitude_start_o)),
    longitude_start = suppressWarnings(as.numeric(longitude_start_o)),
    latitude_stop   = suppressWarnings(as.numeric(latitude_stop_o)),
    longitude_stop  = suppressWarnings(as.numeric(longitude_stop_o)),
    length_m        = suppressWarnings(as.numeric(length_m)),
    width_m         = suppressWarnings(as.numeric(width_m)),
    area_m2         = suppressWarnings(as.numeric(area_m2)),
    bottom_depth_m  = suppressWarnings(as.numeric(depth_m)),
    julian_date     = suppressWarnings(as.numeric(julian_date)),
    julian_day      = suppressWarnings(as.integer(julian_day)),
    svy, season,
    # datetime from date + time_sec (Q01: tz unconfirmed, treated as given)
    datetime_start_utc = as_datetime(date) + dseconds(coalesce(time_sec, 0)))

dbWriteTable(con, "bird_mammal_transect", d_transect, overwrite = TRUE)
cat(glue("bird_mammal_transect: {nrow(d_transect)} rows"), "\n")
bird_mammal_transect: 60715 rows 

6 Build Observation (counts) Table

Code
d_observation <- d_ob |>
  transmute(gis_key, species_code = species, behavior_code = behavior,
            count = suppressWarnings(as.integer(count))) |>
  mutate(observation_id = row_number(), .before = 1)
dbWriteTable(con, "bird_mammal_observation", d_observation, overwrite = TRUE)
cat(glue("bird_mammal_observation: {nrow(d_observation)} rows"), "\n")
bird_mammal_observation: 82418 rows 

7 Resolve cruise_key + Spatial

Code
# cruise_key: best-effort YYYY-MM from cruise_label/date (no ship column -> Q02).
# leave NULL for now; record format for the provider question.
dbExecute(con, "ALTER TABLE bird_mammal_transect ADD COLUMN IF NOT EXISTS cruise_key VARCHAR")
[1] 0
Code
# geometry + grid from the transect midpoint
load_prior_tables(con, parquet_dir = here("data/parquet/swfsc_ichthyo"),
                  tables = c("grid"), geom_tables = c("grid"), as_view = TRUE)
# A tibble: 1 × 3
  table  rows has_geom
  <chr> <dbl> <lgl>   
1 grid    218 TRUE    
Code
add_point_geom(con, "bird_mammal_transect", lon_col = "longitude", lat_col = "latitude")
assign_grid_key(con, "bird_mammal_transect") |> datatable(caption = "Grid assignment")

8 Load Dataset Metadata + Schema

Code
d_dataset <- ingest_yaml_to_dataset_df(read_ingest_yaml(here()))
dbWriteTable(con, "dataset", d_dataset, overwrite = TRUE)

bmc_rels <- list(
  primary_keys = list(
    bird_mammal_transect = "gis_key", bird_mammal_observation = "observation_id",
    bird_mammal_species = "species_code", bird_mammal_behavior = "behavior_code"),
  foreign_keys = list(
    list(table="bird_mammal_observation", column="gis_key",       ref_table="bird_mammal_transect", ref_column="gis_key"),
    list(table="bird_mammal_observation", column="species_code",  ref_table="bird_mammal_species",  ref_column="species_code"),
    list(table="bird_mammal_observation", column="behavior_code", ref_table="bird_mammal_behavior", ref_column="behavior_code")))
cc_erd(con, tables = c("bird_mammal_transect","bird_mammal_observation","bird_mammal_species","bird_mammal_behavior","dataset"),
       rels = bmc_rels,
       colors = list(lightblue = c("bird_mammal_transect","bird_mammal_observation"),
                     lightyellow = c("bird_mammal_species","bird_mammal_behavior"), white = "dataset"))

Code
build_relationships_json(rels = bmc_rels, output_dir = dir_parquet, provider = provider, dataset = dataset)
[1] "/Users/bbest/Github/CalCOFI/workflows/data/parquet/calcofi_bird_mammal_census/relationships.json"

9 Validate + Preview

Code
results <- validate_for_release(con, checks = "all", strict = FALSE)
cat("Validation:", ifelse(results$passed, "PASSED", "FAILED"), "\n")
Validation: FAILED 
Code
# orphan check: observations whose gis_key is not in transects
n_orphan <- dbGetQuery(con, "SELECT COUNT(*) FROM bird_mammal_observation o
  LEFT JOIN bird_mammal_transect t USING(gis_key) WHERE t.gis_key IS NULL")[[1]]
cat(glue("observation gis_key orphans (no transect): {n_orphan}"), "\n")
observation gis_key orphans (no transect): 0 
Code
cols <- dbGetQuery(con, "SELECT column_name FROM information_schema.columns
  WHERE table_name='bird_mammal_transect' AND data_type NOT LIKE 'GEOMETRY%'")$column_name
dbGetQuery(con, glue("SELECT {paste(cols, collapse=', ')} FROM bird_mammal_transect LIMIT 100")) |>
  datatable(caption = "bird_mammal_transect — first 100", rownames = FALSE, filter = "top")

10 Questions for Data Providers

Code
read_csv(here(glue("metadata/{provider}/{dataset}/questions.csv"))) |>
  arrange(factor(priority, c("blocker","high","normal")), id) |>
  select(priority, question, context, status) |>
  datatable(caption = "Questions (ranked)", options = list(dom="t", pageLength=10), rownames = FALSE)

11 Write Outputs + Upload

Code
dir_create(dir_parquet)
write_parquet_outputs(
  con = con, output_dir = dir_parquet,
  tables = c("bird_mammal_transect","bird_mammal_observation","bird_mammal_species","bird_mammal_behavior","dataset"),
  strip_provenance = FALSE)
# A tibble: 5 × 5
  table                    rows file_size path                       partitioned
  <chr>                   <dbl>     <dbl> <chr>                      <lgl>      
1 bird_mammal_transect    60715   3696385 /Users/bbest/Github/CalCO… FALSE      
2 bird_mammal_observation 82418    185994 /Users/bbest/Github/CalCO… FALSE      
3 bird_mammal_species       200      6297 /Users/bbest/Github/CalCO… FALSE      
4 bird_mammal_behavior        4       534 /Users/bbest/Github/CalCO… FALSE      
5 dataset                     7      6427 /Users/bbest/Github/CalCO… FALSE      
Code
d_tbls_rd <- read_csv(here("metadata/calcofi/bird_mammal_census/tbls_redefine.csv"))
d_flds_rd <- read_csv(here("metadata/calcofi/bird_mammal_census/flds_redefine.csv"))
build_metadata_json(
  con = con, d_tbls_rd = d_tbls_rd, d_flds_rd = d_flds_rd,
  metadata_derived_csv = here("metadata/calcofi/bird_mammal_census/metadata_derived.csv"),
  output_dir = dir_parquet, tables = c("bird_mammal_transect","bird_mammal_observation","bird_mammal_species","bird_mammal_behavior"),
  set_comments = TRUE, provider = provider, dataset = dataset,
  workflow_url = cc$workflow_url, tables_owned = tables_owned)
[1] "/Users/bbest/Github/CalCOFI/workflows/data/parquet/calcofi_bird_mammal_census/metadata.json"
Code
sync_to_gcs(local_dir = dir_parquet, gcs_prefix = glue("ingest/{dir_label}"), bucket = "calcofi-db")
# A tibble: 8 × 4
  file                            action      size reason  
  <chr>                           <chr>      <dbl> <chr>   
1 bird_mammal_behavior.parquet    uploaded     534 new file
2 bird_mammal_observation.parquet uploaded  185994 new file
3 bird_mammal_species.parquet     uploaded    6297 new file
4 bird_mammal_transect.parquet    uploaded 3696385 new file
5 dataset.parquet                 uploaded    6427 new file
6 manifest.json                   uploaded    2629 new file
7 metadata.json                   uploaded    8889 new file
8 relationships.json              uploaded     800 new file
Code
close_duckdb(con)
cat(glue("Parquet outputs written to: {dir_parquet}"), "\n")
Parquet outputs written to: /Users/bbest/Github/CalCOFI/workflows/data/parquet/calcofi_bird_mammal_census