Publish Larvae to OBIS

Published

2025-07-30

Code
librarian::shelf(
  calcofi4db, DBI, dm, dplyr, DT, glue, here,
  lubridate, purrr, tibble, tidyr,
  quiet = T)
options(readr.show_col_types = F)

# get database connection
con <- get_db_con(c("dev"))

1 Show Database Tables

Code
# exclude experimental tables
tbls <- dbListTables(con) |> sort() |> setdiff(c("grid", "site_seg"))

# learn relations from database and draw
dm <- dm_from_con(con, table_names = tbls, learn_keys  = T)
dm_draw(dm, view_type = "all")
Figure 1: Entity relationship diagram (ERD) of the CalCOFI database. Created using the data models dm R package function dm_draw().
erDiagram
    net {
        string net_uuid PK
        string tow_uuid FK
        string side
        float std_haul_factor
        float vol_sampled_m3
        float prop_sorted
        float smallplankton
        float totalplankton
    }
    
    tow {
        string tow_uuid PK
        string site_uuid FK
        string tow_type_key FK
        int tow_number
        datetime time_start
    }
    
    site {
        string site_uuid PK
        string cruise_uuid FK
        string orderocc
        float longitude
        float latitude
        string line
        string station
    }
    
    cruise {
        string cruise_uuid PK
        string ship_key FK
        string date_ym
    }
    
    ship {
        string ship_key PK
        string ship_name
        string ship_nodc
    }
    
    tow_type {
        string tow_type_key PK
        string description
    }
    
    species {
        string species_id PK
        string scientific_name
        string itis_id
        string worms_id
        string common_name
    }
    
    larvastage {
        string net_uuid FK
        string species_id FK
        string stage
        int tally
    }
    
    egg {
        string net_uuid FK
        string species_id FK
        int tally
    }
    
    eggstage {
        string net_uuid FK
        string species_id FK
        string stage
        int tally
    }
    
    larva {
        string net_uuid FK
        string species_id FK
        int tally
    }
    
    larvasize {
        string net_uuid FK
        string species_id FK
        float length_mm
        int tally
    }

    %% Relationships
    net ||--o{ larvastage : "has"
    net ||--o{ egg : "has"
    net ||--o{ eggstage : "has"
    net ||--o{ larva : "has"
    net ||--o{ larvasize : "has"
    
    species ||--o{ larvastage : "identified_as"
    species ||--o{ egg : "identified_as"
    species ||--o{ eggstage : "identified_as"
    species ||--o{ larva : "identified_as"
    species ||--o{ larvasize : "identified_as"
    
    tow ||--o{ net : "uses"
    site ||--o{ tow : "conducted_at"
    cruise ||--o{ site : "visits"
    ship ||--o{ cruise : "operates"
    tow_type ||--o{ tow : "defines"
Figure 2: Entity relationship diagram (ERD) of the CalCOFI database. Created from image above and prompt to Claude: ‘Generate an ERD mermaid diagram from the image.’

2 Prompt Claude to Generate DarwinCore

PROMPT: Using the ERD above, create a Quarto document with chunks of R code and tidyverse style use of the database connection object con to generate a DarwinCore package using the species.worms_id (aka WoRMS AphiaID) to populate the species identifiers. Combine biological observation where possible, otherwise compose as seperate datasets, given the different biological observations: egg, eggstage, larva, larvasize, larvastage. Generate DarwinCore with as much of the extended information (i.e., event, occurrence, and measurementOrFact ) as possible. Compose Ecological Metadata Language (EML) metadata for the DarwinCore package, including the dataset and creator elements. Include prompts where more information is needed.

Code
knitr::opts_chunk$set(eval = F)
Code
librarian::shelf(
  DBI, EML, here, lubridate, tidyverse, uuid)

# Assuming database connection is already established as 'con'
# con <- dbConnect(...) 

3 Data Extraction from Database

First, let’s extract all necessary tables from the database:

Code
# Extract all tables
ship <- tbl(con, "ship") %>% collect()
cruise <- tbl(con, "cruise") %>% collect()
site <- tbl(con, "site") %>% collect()
tow <- tbl(con, "tow") %>% collect()
tow_type <- tbl(con, "tow_type") %>% collect()
net <- tbl(con, "net") %>% collect()
species <- tbl(con, "species") %>% collect()

# Extract biological observation tables
egg <- tbl(con, "egg") %>% collect()
eggstage <- tbl(con, "eggstage") %>% collect()
larva <- tbl(con, "larva") %>% collect()
larvastage <- tbl(con, "larvastage") %>% collect()
larvasize <- tbl(con, "larvasize") %>% collect()

4 Create Event Hierarchy

DarwinCore uses an event-based model. We’ll create a hierarchical event structure:

Code
# Build complete event hierarchy
events <- cruise %>%
  left_join(ship, by = "ship_key") %>%
  left_join(site, by = "cruise_uuid") %>%
  left_join(tow, by = "site_uuid") %>%
  left_join(tow_type, by = "tow_type_key") %>%
  left_join(net, by = "tow_uuid")

# Create event core at different levels
# Cruise level events
cruise_events <- events %>%
  distinct(cruise_uuid, ship_name, ship_nodc, date_ym) %>%
  mutate(
    eventID = paste0("urn:uuid:", cruise_uuid),
    parentEventID = NA_character_,
    eventDate = ymd(paste0(date_ym, "-01")), # Assuming YYYY-MM format
    eventRemarks = paste("Cruise on", ship_name),
    samplingProtocol = "Marine plankton survey cruise",
    sampleSizeValue = NA_real_,
    sampleSizeUnit = NA_character_,
    habitat = "marine",
    eventType = "cruise"
  )

# Site level events
site_events <- events %>%
  distinct(site_uuid, cruise_uuid, longitude, latitude, line, station, orderocc) %>%
  mutate(
    eventID = paste0("urn:uuid:", site_uuid),
    parentEventID = paste0("urn:uuid:", cruise_uuid),
    decimalLatitude = latitude,
    decimalLongitude = longitude,
    eventRemarks = paste("Station", station, "on line", line),
    locationID = paste0(line, "_", station),
    eventType = "site",
    samplingProtocol = "Marine plankton survey station"
  )

# Tow level events
tow_events <- events %>%
  distinct(tow_uuid, site_uuid, tow_number, time_start, description) %>%
  mutate(
    eventID = paste0("urn:uuid:", tow_uuid),
    parentEventID = paste0("urn:uuid:", site_uuid),
    eventDate = as_date(time_start),
    eventTime = format(time_start, "%H:%M:%S"),
    eventRemarks = paste("Tow", tow_number, "-", description),
    eventType = "tow",
    samplingProtocol = description
  )

# Net level events (sampling events)
net_events <- events %>%
  distinct(net_uuid, tow_uuid, side, std_haul_factor, vol_sampled_m3, 
           prop_sorted, smallplankton, totalplankton) %>%
  mutate(
    eventID = paste0("urn:uuid:", net_uuid),
    parentEventID = paste0("urn:uuid:", tow_uuid),
    eventRemarks = paste("Net sample from", side, "side"),
    eventType = "net_sample",
    samplingProtocol = paste("Plankton net tow -", side, "side"),
    sampleSizeValue = vol_sampled_m3,
    sampleSizeUnit = "cubic meters"
  )

# Combine all event levels
event_core <- bind_rows(
  cruise_events %>% select(eventID, parentEventID, eventDate, eventType, 
                          samplingProtocol, eventRemarks, habitat),
  site_events %>% select(eventID, parentEventID, eventType, samplingProtocol, 
                        eventRemarks, decimalLatitude, decimalLongitude, locationID),
  tow_events %>% select(eventID, parentEventID, eventDate, eventTime, eventType, 
                       samplingProtocol, eventRemarks),
  net_events %>% select(eventID, parentEventID, eventType, samplingProtocol, 
                       eventRemarks, sampleSizeValue, sampleSizeUnit)
) %>%
  # Add required DwC fields
  mutate(
    basisOfRecord = "HumanObservation",
    geodeticDatum = "WGS84",
    coordinateUncertaintyInMeters = 1000, # Adjust based on GPS accuracy
    countryCode = "[NEED COUNTRY CODE]", # Prompt: Add appropriate country code
    waterBody = "[NEED WATER BODY NAME]", # Prompt: Add water body name
    datasetID = "[NEED DATASET UUID]" # Prompt: Generate or provide dataset UUID
  )

5 Create Occurrence Records

Now let’s combine all biological observations into occurrence records:

Code
# Function to standardize biological observations
create_occurrences <- function(obs_data, obs_type, has_stage = FALSE, has_size = FALSE) {
  base_occ <- obs_data %>%
    left_join(species, by = "species_id") %>%
    mutate(
      occurrenceID = paste0("urn:uuid:", UUIDgenerate(n = n())),
      eventID = paste0("urn:uuid:", net_uuid),
      scientificName = scientific_name,
      scientificNameID = paste0("urn:lsid:marinespecies.org:taxname:", worms_id),
      taxonID = paste0("urn:lsid:marinespecies.org:taxname:", worms_id),
      kingdom = "Animalia",
      occurrenceStatus = ifelse(tally > 0, "present", "absent"),
      organismQuantity = tally,
      organismQuantityType = "individuals",
      lifeStage = case_when(
        obs_type == "egg" ~ "egg",
        obs_type == "larva" ~ "larva",
        TRUE ~ NA_character_
      ),
      preparations = paste0(obs_type, " sample"),
      associatedTaxa = common_name
    )
  
  # Add stage information if present
  if (has_stage) {
    base_occ <- base_occ %>%
      mutate(
        lifeStage = ifelse(!is.na(stage), 
                          paste(obs_type, stage, sep = "_"), 
                          lifeStage),
        occurrenceRemarks = paste("Stage:", stage)
      )
  }
  
  # Add size class if larvasize
  if (has_size) {
    base_occ <- base_occ %>%
      mutate(
        occurrenceRemarks = paste("Size class:", length_mm, "mm")
      )
  }
  
  return(base_occ)
}

# Process each observation type
egg_occurrences <- create_occurrences(egg, "egg")
eggstage_occurrences <- create_occurrences(eggstage, "egg", has_stage = TRUE)
larva_occurrences <- create_occurrences(larva, "larva")
larvastage_occurrences <- create_occurrences(larvastage, "larva", has_stage = TRUE)
larvasize_occurrences <- create_occurrences(larvasize, "larva", has_size = TRUE)

# Combine all occurrences
occurrence_extension <- bind_rows(
  egg_occurrences,
  eggstage_occurrences,
  larva_occurrences,
  larvastage_occurrences,
  larvasize_occurrences
) %>%
  select(occurrenceID, eventID, scientificName, scientificNameID, taxonID,
         kingdom, occurrenceStatus, organismQuantity, organismQuantityType,
         lifeStage, preparations, occurrenceRemarks, associatedTaxa) %>%
  # Add additional required/recommended fields
  mutate(
    identifiedBy = "[NEED IDENTIFIER NAME]", # Prompt: Add taxonomist name
    dateIdentified = "[NEED IDENTIFICATION DATE]", # Prompt: Add identification date
    identificationReferences = "[NEED REFERENCES]", # Prompt: Add identification guides used
    modified = Sys.Date()
  )

6 Create MeasurementOrFact Extension

Extract all measurements and environmental data:

Code
# Environmental measurements from net samples
env_measurements <- net_events %>%
  select(eventID = eventID, net_uuid, std_haul_factor, vol_sampled_m3, 
         prop_sorted, smallplankton, totalplankton) %>%
  pivot_longer(cols = c(std_haul_factor, vol_sampled_m3, prop_sorted, 
                       smallplankton, totalplankton),
               names_to = "measurementType",
               values_to = "measurementValue") %>%
  filter(!is.na(measurementValue)) %>%
  mutate(
    measurementID = paste0("urn:uuid:", UUIDgenerate(n = n())),
    measurementUnit = case_when(
      measurementType == "vol_sampled_m3" ~ "cubic meters",
      measurementType == "std_haul_factor" ~ "dimensionless",
      measurementType == "prop_sorted" ~ "proportion",
      measurementType %in% c("smallplankton", "totalplankton") ~ "grams",
      TRUE ~ NA_character_
    ),
    measurementType = case_when(
      measurementType == "vol_sampled_m3" ~ "volume filtered",
      measurementType == "std_haul_factor" ~ "standardized haul factor",
      measurementType == "prop_sorted" ~ "proportion of sample sorted",
      measurementType == "smallplankton" ~ "small plankton biomass",
      measurementType == "totalplankton" ~ "total plankton biomass",
      TRUE ~ measurementType
    ),
    measurementMethod = "[NEED MEASUREMENT METHODS]", # Prompt: Add measurement methods
    measurementRemarks = NA_character_
  )

# Size measurements from larvasize
size_measurements <- larvasize_occurrences %>%
  filter(!is.na(larvasize$length_mm)) %>%
  mutate(
    measurementID = paste0("urn:uuid:", UUIDgenerate(n = n())),
    measurementType = "body length",
    measurementValue = larvasize$length_mm,
    measurementUnit = "millimeters",
    measurementMethod = "[NEED SIZE MEASUREMENT METHOD]", # Prompt: Add measurement method
    measurementRemarks = "Larval body length"
  ) %>%
  select(measurementID, occurrenceID, measurementType, measurementValue,
         measurementUnit, measurementMethod, measurementRemarks)

# Combine all measurements
measurementorfact_extension <- bind_rows(
  env_measurements %>% 
    mutate(occurrenceID = NA_character_) %>%
    select(measurementID, eventID, occurrenceID, measurementType, 
           measurementValue, measurementUnit, measurementMethod, measurementRemarks),
  size_measurements %>% 
    mutate(eventID = NA_character_) %>%
    select(measurementID, eventID, occurrenceID, measurementType, 
           measurementValue, measurementUnit, measurementMethod, measurementRemarks)
)

7 Write DarwinCore Archive Files

Code
# Create output directory
dwc_dir <- here("dwc_output")
dir.create(dwc_dir, showWarnings = FALSE)

# Write core and extension files
write_csv(event_core, file.path(dwc_dir, "event.csv"))
write_csv(occurrence_extension, file.path(dwc_dir, "occurrence.csv"))
write_csv(measurementorfact_extension, file.path(dwc_dir, "measurementorfact.csv"))

# Create meta.xml for DwC-A
meta_xml <- '<?xml version="1.0" encoding="UTF-8"?>
<archive xmlns="http://rs.tdwg.org/dwc/text/" 
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
         xsi:schemaLocation="http://rs.tdwg.org/dwc/text/ http://rs.tdwg.org/dwc/text/tdwg_dwc_text.xsd">
  <core encoding="UTF-8" fieldsTerminatedBy="," linesTerminatedBy="\\n" 
        fieldsEnclosedBy=\'"\' ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Event">
    <files>
      <location>event.csv</location>
    </files>
    <id index="0" />
  </core>
  <extension encoding="UTF-8" fieldsTerminatedBy="," linesTerminatedBy="\\n" 
             fieldsEnclosedBy=\'"\' ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
    <files>
      <location>occurrence.csv</location>
    </files>
    <coreid index="1" />
  </extension>
  <extension encoding="UTF-8" fieldsTerminatedBy="," linesTerminatedBy="\\n" 
             fieldsEnclosedBy=\'"\' ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/MeasurementOrFact">
    <files>
      <location>measurementorfact.csv</location>
    </files>
    <coreid index="1" />
  </extension>
</archive>'

writeLines(meta_xml, file.path(dwc_dir, "meta.xml"))

8 Create EML Metadata

Code
# Dataset metadata - PROMPTS FOR REQUIRED INFORMATION
dataset_title <- "[DATASET TITLE]" # Prompt: Enter dataset title
dataset_abstract <- "[DATASET ABSTRACT]" # Prompt: Enter dataset abstract (200-500 words)
dataset_keywords <- c("[KEYWORD1]", "[KEYWORD2]", "[KEYWORD3]") # Prompt: Add relevant keywords
dataset_uuid <- UUIDgenerate() # Or use existing dataset UUID

# Geographic coverage
geo_coverage <- site %>%
  summarise(
    westBoundingCoordinate = min(longitude, na.rm = TRUE),
    eastBoundingCoordinate = max(longitude, na.rm = TRUE),
    northBoundingCoordinate = max(latitude, na.rm = TRUE),
    southBoundingCoordinate = min(latitude, na.rm = TRUE)
  )

# Temporal coverage
temp_coverage <- tow %>%
  summarise(
    beginDate = min(time_start, na.rm = TRUE),
    endDate = max(time_start, na.rm = TRUE)
  )

# Create EML document
my_eml <- list(
  packageId = dataset_uuid,
  system = "uuid",
  dataset = list(
    title = dataset_title,
    creator = list(
      individualName = list(
        givenName = "[CREATOR GIVEN NAME]", # Prompt: Enter creator's given name
        surName = "[CREATOR SURNAME]" # Prompt: Enter creator's surname
      ),
      organizationName = "[ORGANIZATION NAME]", # Prompt: Enter organization name
      electronicMailAddress = "[EMAIL ADDRESS]", # Prompt: Enter email address
      userId = list(
        directory = "https://orcid.org/",
        userId = "[ORCID ID]" # Prompt: Enter ORCID ID (0000-0000-0000-0000)
      )
    ),
    abstract = dataset_abstract,
    keywordSet = list(
      keyword = dataset_keywords,
      keywordThesaurus = "GCMD Science Keywords"
    ),
    coverage = list(
      geographicCoverage = list(
        geographicDescription = "[GEOGRAPHIC DESCRIPTION]", # Prompt: Describe study area
        boundingCoordinates = geo_coverage
      ),
      temporalCoverage = list(
        rangeOfDates = list(
          beginDate = list(calendarDate = as.character(temp_coverage$beginDate)),
          endDate = list(calendarDate = as.character(temp_coverage$endDate))
        )
      ),
      taxonomicCoverage = list(
        generalTaxonomicCoverage = "Marine ichthyoplankton and fish eggs",
        taxonomicClassification = species %>%
          filter(!is.na(worms_id)) %>%
          select(scientificName = scientific_name) %>%
          as.list()
      )
    ),
    contact = list(
      individualName = list(
        givenName = "[CONTACT GIVEN NAME]", # Prompt: Enter contact's given name
        surName = "[CONTACT SURNAME]" # Prompt: Enter contact's surname
      ),
      electronicMailAddress = "[CONTACT EMAIL]" # Prompt: Enter contact email
    ),
    methods = list(
      methodStep = list(
        description = list(
          para = "[SAMPLING METHODS DESCRIPTION]" # Prompt: Describe sampling methods in detail
        )
      ),
      sampling = list(
        studyExtent = list(
          description = list(
            para = "[STUDY EXTENT DESCRIPTION]" # Prompt: Describe spatial and temporal extent
          )
        ),
        samplingDescription = list(
          para = "[SAMPLING DESCRIPTION]" # Prompt: Describe sampling design and protocols
        )
      )
    ),
    project = list(
      title = "[PROJECT TITLE]", # Prompt: Enter project title
      personnel = list(
        individualName = list(
          givenName = "[PI GIVEN NAME]", # Prompt: Enter PI's given name
          surName = "[PI SURNAME]" # Prompt: Enter PI's surname
        ),
        role = "Principal Investigator"
      ),
      funding = list(
        para = "[FUNDING INFORMATION]" # Prompt: Enter funding source and grant numbers
      )
    )
  )
)

# Write EML
write_eml(my_eml, file.path(dwc_dir, "eml.xml"))

9 Data Quality Checks

Code
# Check for missing WoRMS IDs
missing_worms <- species %>%
  filter(is.na(worms_id) | worms_id == "") %>%
  select(species_id, scientific_name)

if(nrow(missing_worms) > 0) {
  cat("WARNING: The following species are missing WoRMS IDs:\n")
  print(missing_worms)
  cat("\nPlease add WoRMS AphiaIDs for these species before finalizing the dataset.\n")
}

# Check for orphan records
orphan_occurrences <- occurrence_extension %>%
  anti_join(event_core, by = c("eventID" = "eventID"))

if(nrow(orphan_occurrences) > 0) {
  cat("\nWARNING: Found", nrow(orphan_occurrences), "occurrence records without matching events.\n")
}

# Summary statistics
cat("\n=== Dataset Summary ===\n")
cat("Total events:", nrow(event_core), "\n")
cat("  - Cruises:", sum(event_core$eventType == "cruise", na.rm = TRUE), "\n")
cat("  - Sites:", sum(event_core$eventType == "site", na.rm = TRUE), "\n")
cat("  - Tows:", sum(event_core$eventType == "tow", na.rm = TRUE), "\n")
cat("  - Net samples:", sum(event_core$eventType == "net_sample", na.rm = TRUE), "\n")
cat("\nTotal occurrences:", nrow(occurrence_extension), "\n")
cat("Total species:", n_distinct(occurrence_extension$scientificName), "\n")
cat("Total measurements:", nrow(measurementorfact_extension), "\n")

10 Package Creation

Code
# Create DwC-A zip file
zip_file <- file.path(dwc_dir, paste0("plankton_survey_dwca_", Sys.Date(), ".zip"))
zip(zip_file, 
    files = c("event.csv", "occurrence.csv", "measurementorfact.csv", "meta.xml", "eml.xml"),
    root = dwc_dir)

cat("\nDarwin Core Archive created:", zip_file, "\n")

11 Required Information Prompts

Please provide the following information to complete the DarwinCore package:

11.1 Dataset Information

11.2 Personnel Information

11.3 Project Information

11.4 Methodology

11.5 Geographic Information

Once this information is provided, update the placeholders marked with [NEED …] in the code above.