Code
::shelf(
librarian
calcofi4db, DBI, dm, dplyr, DT, glue, here,
lubridate, purrr, tibble, tidyr,quiet = T)
options(readr.show_col_types = F)
# get database connection
<- get_db_con(c("dev")) con
::shelf(
librarian
calcofi4db, DBI, dm, dplyr, DT, glue, here,
lubridate, purrr, tibble, tidyr,quiet = T)
options(readr.show_col_types = F)
# get database connection
<- get_db_con(c("dev")) con
# exclude experimental tables
<- dbListTables(con) |> sort() |> setdiff(c("grid", "site_seg"))
tbls
# learn relations from database and draw
<- dm_from_con(con, table_names = tbls, learn_keys = T)
dm dm_draw(dm, view_type = "all")
dm
R package function dm_draw()
.
erDiagram net { string net_uuid PK string tow_uuid FK string side float std_haul_factor float vol_sampled_m3 float prop_sorted float smallplankton float totalplankton } tow { string tow_uuid PK string site_uuid FK string tow_type_key FK int tow_number datetime time_start } site { string site_uuid PK string cruise_uuid FK string orderocc float longitude float latitude string line string station } cruise { string cruise_uuid PK string ship_key FK string date_ym } ship { string ship_key PK string ship_name string ship_nodc } tow_type { string tow_type_key PK string description } species { string species_id PK string scientific_name string itis_id string worms_id string common_name } larvastage { string net_uuid FK string species_id FK string stage int tally } egg { string net_uuid FK string species_id FK int tally } eggstage { string net_uuid FK string species_id FK string stage int tally } larva { string net_uuid FK string species_id FK int tally } larvasize { string net_uuid FK string species_id FK float length_mm int tally } %% Relationships net ||--o{ larvastage : "has" net ||--o{ egg : "has" net ||--o{ eggstage : "has" net ||--o{ larva : "has" net ||--o{ larvasize : "has" species ||--o{ larvastage : "identified_as" species ||--o{ egg : "identified_as" species ||--o{ eggstage : "identified_as" species ||--o{ larva : "identified_as" species ||--o{ larvasize : "identified_as" tow ||--o{ net : "uses" site ||--o{ tow : "conducted_at" cruise ||--o{ site : "visits" ship ||--o{ cruise : "operates" tow_type ||--o{ tow : "defines"
PROMPT: Using the ERD above, create a Quarto document with chunks of R code and tidyverse style use of the database connection object con
to generate a DarwinCore package using the species.worms_id
(aka WoRMS AphiaID) to populate the species identifiers. Combine biological observation where possible, otherwise compose as seperate datasets, given the different biological observations: egg, eggstage, larva, larvasize, larvastage. Generate DarwinCore with as much of the extended information (i.e., event
, occurrence
, and measurementOrFact
) as possible. Compose Ecological Metadata Language (EML) metadata for the DarwinCore package, including the dataset
and creator
elements. Include prompts where more information is needed.
::opts_chunk$set(eval = F) knitr
::shelf(
librarian
DBI, EML, here, lubridate, tidyverse, uuid)
# Assuming database connection is already established as 'con'
# con <- dbConnect(...)
First, let’s extract all necessary tables from the database:
# Extract all tables
<- tbl(con, "ship") %>% collect()
ship <- tbl(con, "cruise") %>% collect()
cruise <- tbl(con, "site") %>% collect()
site <- tbl(con, "tow") %>% collect()
tow <- tbl(con, "tow_type") %>% collect()
tow_type <- tbl(con, "net") %>% collect()
net <- tbl(con, "species") %>% collect()
species
# Extract biological observation tables
<- tbl(con, "egg") %>% collect()
egg <- tbl(con, "eggstage") %>% collect()
eggstage <- tbl(con, "larva") %>% collect()
larva <- tbl(con, "larvastage") %>% collect()
larvastage <- tbl(con, "larvasize") %>% collect() larvasize
DarwinCore uses an event-based model. We’ll create a hierarchical event structure:
# Build complete event hierarchy
<- cruise %>%
events left_join(ship, by = "ship_key") %>%
left_join(site, by = "cruise_uuid") %>%
left_join(tow, by = "site_uuid") %>%
left_join(tow_type, by = "tow_type_key") %>%
left_join(net, by = "tow_uuid")
# Create event core at different levels
# Cruise level events
<- events %>%
cruise_events distinct(cruise_uuid, ship_name, ship_nodc, date_ym) %>%
mutate(
eventID = paste0("urn:uuid:", cruise_uuid),
parentEventID = NA_character_,
eventDate = ymd(paste0(date_ym, "-01")), # Assuming YYYY-MM format
eventRemarks = paste("Cruise on", ship_name),
samplingProtocol = "Marine plankton survey cruise",
sampleSizeValue = NA_real_,
sampleSizeUnit = NA_character_,
habitat = "marine",
eventType = "cruise"
)
# Site level events
<- events %>%
site_events distinct(site_uuid, cruise_uuid, longitude, latitude, line, station, orderocc) %>%
mutate(
eventID = paste0("urn:uuid:", site_uuid),
parentEventID = paste0("urn:uuid:", cruise_uuid),
decimalLatitude = latitude,
decimalLongitude = longitude,
eventRemarks = paste("Station", station, "on line", line),
locationID = paste0(line, "_", station),
eventType = "site",
samplingProtocol = "Marine plankton survey station"
)
# Tow level events
<- events %>%
tow_events distinct(tow_uuid, site_uuid, tow_number, time_start, description) %>%
mutate(
eventID = paste0("urn:uuid:", tow_uuid),
parentEventID = paste0("urn:uuid:", site_uuid),
eventDate = as_date(time_start),
eventTime = format(time_start, "%H:%M:%S"),
eventRemarks = paste("Tow", tow_number, "-", description),
eventType = "tow",
samplingProtocol = description
)
# Net level events (sampling events)
<- events %>%
net_events distinct(net_uuid, tow_uuid, side, std_haul_factor, vol_sampled_m3,
%>%
prop_sorted, smallplankton, totalplankton) mutate(
eventID = paste0("urn:uuid:", net_uuid),
parentEventID = paste0("urn:uuid:", tow_uuid),
eventRemarks = paste("Net sample from", side, "side"),
eventType = "net_sample",
samplingProtocol = paste("Plankton net tow -", side, "side"),
sampleSizeValue = vol_sampled_m3,
sampleSizeUnit = "cubic meters"
)
# Combine all event levels
<- bind_rows(
event_core %>% select(eventID, parentEventID, eventDate, eventType,
cruise_events
samplingProtocol, eventRemarks, habitat),%>% select(eventID, parentEventID, eventType, samplingProtocol,
site_events
eventRemarks, decimalLatitude, decimalLongitude, locationID),%>% select(eventID, parentEventID, eventDate, eventTime, eventType,
tow_events
samplingProtocol, eventRemarks),%>% select(eventID, parentEventID, eventType, samplingProtocol,
net_events
eventRemarks, sampleSizeValue, sampleSizeUnit)%>%
) # Add required DwC fields
mutate(
basisOfRecord = "HumanObservation",
geodeticDatum = "WGS84",
coordinateUncertaintyInMeters = 1000, # Adjust based on GPS accuracy
countryCode = "[NEED COUNTRY CODE]", # Prompt: Add appropriate country code
waterBody = "[NEED WATER BODY NAME]", # Prompt: Add water body name
datasetID = "[NEED DATASET UUID]" # Prompt: Generate or provide dataset UUID
)
Now let’s combine all biological observations into occurrence records:
# Function to standardize biological observations
<- function(obs_data, obs_type, has_stage = FALSE, has_size = FALSE) {
create_occurrences <- obs_data %>%
base_occ left_join(species, by = "species_id") %>%
mutate(
occurrenceID = paste0("urn:uuid:", UUIDgenerate(n = n())),
eventID = paste0("urn:uuid:", net_uuid),
scientificName = scientific_name,
scientificNameID = paste0("urn:lsid:marinespecies.org:taxname:", worms_id),
taxonID = paste0("urn:lsid:marinespecies.org:taxname:", worms_id),
kingdom = "Animalia",
occurrenceStatus = ifelse(tally > 0, "present", "absent"),
organismQuantity = tally,
organismQuantityType = "individuals",
lifeStage = case_when(
== "egg" ~ "egg",
obs_type == "larva" ~ "larva",
obs_type TRUE ~ NA_character_
),preparations = paste0(obs_type, " sample"),
associatedTaxa = common_name
)
# Add stage information if present
if (has_stage) {
<- base_occ %>%
base_occ mutate(
lifeStage = ifelse(!is.na(stage),
paste(obs_type, stage, sep = "_"),
lifeStage),occurrenceRemarks = paste("Stage:", stage)
)
}
# Add size class if larvasize
if (has_size) {
<- base_occ %>%
base_occ mutate(
occurrenceRemarks = paste("Size class:", length_mm, "mm")
)
}
return(base_occ)
}
# Process each observation type
<- create_occurrences(egg, "egg")
egg_occurrences <- create_occurrences(eggstage, "egg", has_stage = TRUE)
eggstage_occurrences <- create_occurrences(larva, "larva")
larva_occurrences <- create_occurrences(larvastage, "larva", has_stage = TRUE)
larvastage_occurrences <- create_occurrences(larvasize, "larva", has_size = TRUE)
larvasize_occurrences
# Combine all occurrences
<- bind_rows(
occurrence_extension
egg_occurrences,
eggstage_occurrences,
larva_occurrences,
larvastage_occurrences,
larvasize_occurrences%>%
) select(occurrenceID, eventID, scientificName, scientificNameID, taxonID,
kingdom, occurrenceStatus, organismQuantity, organismQuantityType,%>%
lifeStage, preparations, occurrenceRemarks, associatedTaxa) # Add additional required/recommended fields
mutate(
identifiedBy = "[NEED IDENTIFIER NAME]", # Prompt: Add taxonomist name
dateIdentified = "[NEED IDENTIFICATION DATE]", # Prompt: Add identification date
identificationReferences = "[NEED REFERENCES]", # Prompt: Add identification guides used
modified = Sys.Date()
)
Extract all measurements and environmental data:
# Environmental measurements from net samples
<- net_events %>%
env_measurements select(eventID = eventID, net_uuid, std_haul_factor, vol_sampled_m3,
%>%
prop_sorted, smallplankton, totalplankton) pivot_longer(cols = c(std_haul_factor, vol_sampled_m3, prop_sorted,
smallplankton, totalplankton),names_to = "measurementType",
values_to = "measurementValue") %>%
filter(!is.na(measurementValue)) %>%
mutate(
measurementID = paste0("urn:uuid:", UUIDgenerate(n = n())),
measurementUnit = case_when(
== "vol_sampled_m3" ~ "cubic meters",
measurementType == "std_haul_factor" ~ "dimensionless",
measurementType == "prop_sorted" ~ "proportion",
measurementType %in% c("smallplankton", "totalplankton") ~ "grams",
measurementType TRUE ~ NA_character_
),measurementType = case_when(
== "vol_sampled_m3" ~ "volume filtered",
measurementType == "std_haul_factor" ~ "standardized haul factor",
measurementType == "prop_sorted" ~ "proportion of sample sorted",
measurementType == "smallplankton" ~ "small plankton biomass",
measurementType == "totalplankton" ~ "total plankton biomass",
measurementType TRUE ~ measurementType
),measurementMethod = "[NEED MEASUREMENT METHODS]", # Prompt: Add measurement methods
measurementRemarks = NA_character_
)
# Size measurements from larvasize
<- larvasize_occurrences %>%
size_measurements filter(!is.na(larvasize$length_mm)) %>%
mutate(
measurementID = paste0("urn:uuid:", UUIDgenerate(n = n())),
measurementType = "body length",
measurementValue = larvasize$length_mm,
measurementUnit = "millimeters",
measurementMethod = "[NEED SIZE MEASUREMENT METHOD]", # Prompt: Add measurement method
measurementRemarks = "Larval body length"
%>%
) select(measurementID, occurrenceID, measurementType, measurementValue,
measurementUnit, measurementMethod, measurementRemarks)
# Combine all measurements
<- bind_rows(
measurementorfact_extension %>%
env_measurements mutate(occurrenceID = NA_character_) %>%
select(measurementID, eventID, occurrenceID, measurementType,
measurementValue, measurementUnit, measurementMethod, measurementRemarks),%>%
size_measurements mutate(eventID = NA_character_) %>%
select(measurementID, eventID, occurrenceID, measurementType,
measurementValue, measurementUnit, measurementMethod, measurementRemarks) )
# Create output directory
<- here("dwc_output")
dwc_dir dir.create(dwc_dir, showWarnings = FALSE)
# Write core and extension files
write_csv(event_core, file.path(dwc_dir, "event.csv"))
write_csv(occurrence_extension, file.path(dwc_dir, "occurrence.csv"))
write_csv(measurementorfact_extension, file.path(dwc_dir, "measurementorfact.csv"))
# Create meta.xml for DwC-A
<- '<?xml version="1.0" encoding="UTF-8"?>
meta_xml <archive xmlns="http://rs.tdwg.org/dwc/text/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://rs.tdwg.org/dwc/text/ http://rs.tdwg.org/dwc/text/tdwg_dwc_text.xsd">
<core encoding="UTF-8" fieldsTerminatedBy="," linesTerminatedBy="\\n"
fieldsEnclosedBy=\'"\' ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Event">
<files>
<location>event.csv</location>
</files>
<id index="0" />
</core>
<extension encoding="UTF-8" fieldsTerminatedBy="," linesTerminatedBy="\\n"
fieldsEnclosedBy=\'"\' ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
<files>
<location>occurrence.csv</location>
</files>
<coreid index="1" />
</extension>
<extension encoding="UTF-8" fieldsTerminatedBy="," linesTerminatedBy="\\n"
fieldsEnclosedBy=\'"\' ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/MeasurementOrFact">
<files>
<location>measurementorfact.csv</location>
</files>
<coreid index="1" />
</extension>
</archive>'
writeLines(meta_xml, file.path(dwc_dir, "meta.xml"))
# Dataset metadata - PROMPTS FOR REQUIRED INFORMATION
<- "[DATASET TITLE]" # Prompt: Enter dataset title
dataset_title <- "[DATASET ABSTRACT]" # Prompt: Enter dataset abstract (200-500 words)
dataset_abstract <- c("[KEYWORD1]", "[KEYWORD2]", "[KEYWORD3]") # Prompt: Add relevant keywords
dataset_keywords <- UUIDgenerate() # Or use existing dataset UUID
dataset_uuid
# Geographic coverage
<- site %>%
geo_coverage summarise(
westBoundingCoordinate = min(longitude, na.rm = TRUE),
eastBoundingCoordinate = max(longitude, na.rm = TRUE),
northBoundingCoordinate = max(latitude, na.rm = TRUE),
southBoundingCoordinate = min(latitude, na.rm = TRUE)
)
# Temporal coverage
<- tow %>%
temp_coverage summarise(
beginDate = min(time_start, na.rm = TRUE),
endDate = max(time_start, na.rm = TRUE)
)
# Create EML document
<- list(
my_eml packageId = dataset_uuid,
system = "uuid",
dataset = list(
title = dataset_title,
creator = list(
individualName = list(
givenName = "[CREATOR GIVEN NAME]", # Prompt: Enter creator's given name
surName = "[CREATOR SURNAME]" # Prompt: Enter creator's surname
),organizationName = "[ORGANIZATION NAME]", # Prompt: Enter organization name
electronicMailAddress = "[EMAIL ADDRESS]", # Prompt: Enter email address
userId = list(
directory = "https://orcid.org/",
userId = "[ORCID ID]" # Prompt: Enter ORCID ID (0000-0000-0000-0000)
)
),abstract = dataset_abstract,
keywordSet = list(
keyword = dataset_keywords,
keywordThesaurus = "GCMD Science Keywords"
),coverage = list(
geographicCoverage = list(
geographicDescription = "[GEOGRAPHIC DESCRIPTION]", # Prompt: Describe study area
boundingCoordinates = geo_coverage
),temporalCoverage = list(
rangeOfDates = list(
beginDate = list(calendarDate = as.character(temp_coverage$beginDate)),
endDate = list(calendarDate = as.character(temp_coverage$endDate))
)
),taxonomicCoverage = list(
generalTaxonomicCoverage = "Marine ichthyoplankton and fish eggs",
taxonomicClassification = species %>%
filter(!is.na(worms_id)) %>%
select(scientificName = scientific_name) %>%
as.list()
)
),contact = list(
individualName = list(
givenName = "[CONTACT GIVEN NAME]", # Prompt: Enter contact's given name
surName = "[CONTACT SURNAME]" # Prompt: Enter contact's surname
),electronicMailAddress = "[CONTACT EMAIL]" # Prompt: Enter contact email
),methods = list(
methodStep = list(
description = list(
para = "[SAMPLING METHODS DESCRIPTION]" # Prompt: Describe sampling methods in detail
)
),sampling = list(
studyExtent = list(
description = list(
para = "[STUDY EXTENT DESCRIPTION]" # Prompt: Describe spatial and temporal extent
)
),samplingDescription = list(
para = "[SAMPLING DESCRIPTION]" # Prompt: Describe sampling design and protocols
)
)
),project = list(
title = "[PROJECT TITLE]", # Prompt: Enter project title
personnel = list(
individualName = list(
givenName = "[PI GIVEN NAME]", # Prompt: Enter PI's given name
surName = "[PI SURNAME]" # Prompt: Enter PI's surname
),role = "Principal Investigator"
),funding = list(
para = "[FUNDING INFORMATION]" # Prompt: Enter funding source and grant numbers
)
)
)
)
# Write EML
write_eml(my_eml, file.path(dwc_dir, "eml.xml"))
# Check for missing WoRMS IDs
<- species %>%
missing_worms filter(is.na(worms_id) | worms_id == "") %>%
select(species_id, scientific_name)
if(nrow(missing_worms) > 0) {
cat("WARNING: The following species are missing WoRMS IDs:\n")
print(missing_worms)
cat("\nPlease add WoRMS AphiaIDs for these species before finalizing the dataset.\n")
}
# Check for orphan records
<- occurrence_extension %>%
orphan_occurrences anti_join(event_core, by = c("eventID" = "eventID"))
if(nrow(orphan_occurrences) > 0) {
cat("\nWARNING: Found", nrow(orphan_occurrences), "occurrence records without matching events.\n")
}
# Summary statistics
cat("\n=== Dataset Summary ===\n")
cat("Total events:", nrow(event_core), "\n")
cat(" - Cruises:", sum(event_core$eventType == "cruise", na.rm = TRUE), "\n")
cat(" - Sites:", sum(event_core$eventType == "site", na.rm = TRUE), "\n")
cat(" - Tows:", sum(event_core$eventType == "tow", na.rm = TRUE), "\n")
cat(" - Net samples:", sum(event_core$eventType == "net_sample", na.rm = TRUE), "\n")
cat("\nTotal occurrences:", nrow(occurrence_extension), "\n")
cat("Total species:", n_distinct(occurrence_extension$scientificName), "\n")
cat("Total measurements:", nrow(measurementorfact_extension), "\n")
# Create DwC-A zip file
<- file.path(dwc_dir, paste0("plankton_survey_dwca_", Sys.Date(), ".zip"))
zip_file zip(zip_file,
files = c("event.csv", "occurrence.csv", "measurementorfact.csv", "meta.xml", "eml.xml"),
root = dwc_dir)
cat("\nDarwin Core Archive created:", zip_file, "\n")
Please provide the following information to complete the DarwinCore package:
Once this information is provided, update the placeholders marked with [NEED …] in the code above.