Generate a datasets sitemap.xml for ODIS crawling

Published

2025-07-02

Goal: Generate datasets/sitemap.xml listing the datasets that are already in authoritative repositories serving JSON-LD content, so that ODIS can crawl them.

Tracking in Github issue(s):
- register datasets with ODIS (using JSON-LD) · Issue #24 · CalCOFI/workflows
Techniques:
- Use RESTful APIs directly where possible. Avoid web scraping, since it is the most brittle to website changes. Avoid custom R packages (e.g., rerddap or rdataone for EDI), since they bring more R package dependencies, may be out of date, and add unnecessary complexity.
- Store CSV tables of the repositories and of the datasets gleaned from them as a snapshot in GitHub, along with this report rendered as HTML.

1 Read GoogleSheet of repos

Columns from original GoogleSheet:

repo: repository name
link: url to repository

Extra columns added by this script:

to_ds: custom function to fetch datasets from repository
status: whether the repository link is accessible (OK) or not (Not Found); only computed when ck_status is set to TRUE in the code

Code

# libraries ----
librarian::shelf(
  curl, dplyr, DT, here, httr2, glue, googlesheets4, janitor, knitr, purrr, 
  readr, stringr, tidyr)
options(readr.show_col_types = F)

# variables ----
# GoogleSheet that is read with the "CalCOFI data repositories" tab
d_gs <- "https://docs.google.com/spreadsheets/d/1uhviF2ecfOqGaSbC_JE8B5jPRqqjMaFc9TNCK_m297c/edit?gid=1271784325#gid=1271784325"

# files read and written by this report, all under datasets/
d_csv   <- here("datasets/repo_links.csv")             # repositories
ds_csv  <- here("datasets/repo_datasets.csv")          # datasets per repository
dsi_csv <- here("datasets/repo_datasets_info.csv")     # JSON-LD (as YAML) per dataset url
dss_csv <- here("datasets/repo_datasets_summary.csv")  # dataset counts per repository
sm_xml  <- here("datasets/sitemap.xml")                # sitemap

# when TRUE, each repository link is requested and its HTTP status recorded
ck_status <- FALSE

# helper functions ----
erddap_ds <- function(link){
  # List the public datasets behind an ERDDAP listing page.
  #
  # link: url of the listing; its "index.html" is swapped for "index.csv" so
  #       the listing can be read as a table
  # returns: data frame with columns title and url, carrying the full
  #          filtered table as attribute "datasets"
  #
  # e.g. link = d$link[3]

  csv_url <- str_replace(link, fixed("index.html"), fixed("index.csv"))

  # keep only rows flagged as public
  listing <- filter(read_csv(csv_url), Accessible == "public")

  # the info page url is derived from the tabledap url when present,
  # otherwise from the griddap url
  listing <- mutate(
    listing,
    pfx = dplyr::if_else(
      !is.na(tabledap),
      str_replace(tabledap, "tabledap", "info"),
      str_replace(griddap,  "griddap",  "info")),
    url = glue("{pfx}/index.html"))

  out <- select(listing, title = Title, url)
  attr(out, "datasets") <- listing
  out
}

edi_ds <- function(link){
  # List the datasets returned by an EDI portal search link.
  #
  # link: url of a "simpleSearch" page; the path is switched to
  #       "downloadSearch" and the result is read as CSV
  # returns: data frame with columns title and url, carrying the full
  #          downloaded table as attribute "datasets"
  #
  # e.g. link = d$link[2]

  parts <- url_parse(link)
  parts$path <- str_replace(parts$path, "simpleSearch", "downloadSearch")

  # collapse the original query parameters into one "k=v&k=v" string and
  # pass it on as the single parameter q
  q_txt <- paste(names(parts$query), parts$query, sep = "=", collapse = "&")
  parts$query <- list(q = q_txt)

  results <- mutate(
    read_csv(url_build(parts)),
    url = glue("https://portal.edirepository.org/nis/mapbrowse?packageid={packageid}"))

  out <- select(results, title, url)
  attr(out, "datasets") <- results
  out
}

# individual dataset functions (not yet used) ----
erddap_ds_indiv <- function(link){
  # Look up the title of a single ERDDAP dataset.
  #
  # link: url of either the dataset's tabledap page or its info page, e.g.
  #   "https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFIzoovol.html"
  #   "https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFIzoovol/index.html"
  # returns: tibble with columns title and url (url is link, unchanged)

  # both link forms are rewritten to the dataset's info/<id>/index.csv
  info_csv <- str_replace(link, "tabledap/([^.]+)\\.html", "info/\\1/index.csv")
  info_csv <- str_replace(info_csv, "info/([^/]+)/index\\.html", "info/\\1/index.csv")

  # the title is the Value of the row(s) whose Attribute Name is "title"
  info     <- read_csv(info_csv)
  ds_title <- pull(filter(info, `Attribute Name` == "title"), Value)

  tibble(
    title = ds_title,
    url   = link)
}

edi_ds_indiv <- function(link){
  # Look up title and package id of a single EDI data package.
  #
  # link: EDI portal url identifying the package either by packageid or by
  #       scope + identifier, e.g.
  #   "https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.188.4"
  #   "https://portal.edirepository.org/nis/mapbrowse?scope=knb-lter-cce&identifier=313"
  #   "https://portal.edirepository.org/nis/mapbrowse?scope=knb-lter-cce&identifier=104&revision=12"
  # returns: tibble with columns title and url
  # errors: when the link has neither form, or the search does not return
  #         exactly one row

  qry <- url_parse(link)$query

  # build "<scope>.<identifier>" without a revision
  # (original note: trim revision to return latest packageid)
  if ("packageid" %in% names(qry)){
    # take the packageid element itself rather than the whole query list
    id <- str_replace(qry$packageid, "\\.[0-9]+$", "")
  } else if (all(c("scope", "identifier") %in% names(qry))){
    id <- paste(qry$scope, qry$identifier, sep = ".")
  } else {
    stop("Unsupported link format for EDI repository")
  }

  # NOTE(review): search_data_packages() is not defined in this file and is
  # not provided by any package attached in the libraries chunk; presumably
  # EDIutils::search_data_packages() -- confirm and load it before using this
  # (currently unused) function
  res <- search_data_packages(query = glue("q=id:{id}&fl=packageid,title"))
  stopifnot(nrow(res) == 1)

  tibble(
    title = res$title,
    url   = glue("https://portal.edirepository.org/nis/mapbrowse?packageid={res$packageid}"))
}

# read googlesheet repos ----
dir.create(dirname(d_csv), showWarnings = FALSE)

# snapshot the "CalCOFI data repositories" tab to CSV
gs4_deauth()
gs_repos <- read_sheet(d_gs, "CalCOFI data repositories")
write_csv(gs_repos, d_csv)

# re-read the snapshot and label each repository with the call that lists
# its datasets (NA when no helper matches the link)
d <- read_csv(d_csv) |>
  clean_names() |> # for now, simply translates to lower case
  mutate(
    to_ds = case_when(
      str_detect(link, "erddap")        ~ "erddap_ds(link)",
      str_detect(link, "edirepository") ~ "edi_ds(link)",
      .default = NA_character_)) |>
  relocate(to_ds, .after = repo)

# Optionally add a `status` column holding the HTTP status description
# returned for each repository link.
if (ck_status){
  d <- d |>
    mutate(
      status = map_chr(
        link,
        \(x){
          request(x) |>
            # httr2 turns 4xx/5xx responses into R errors by default, which
            # would abort the whole check at the first unreachable repository;
            # keep the response so its status (e.g. "Not Found") is recorded
            req_error(is_error = \(resp) FALSE) |>
            req_perform() |>
            resp_status_desc() } ) )
}

# write repos to csv ----
# overwrite the snapshot with only the repo, to_ds and link columns
write_csv(
  select(d, repo, to_ds, link),
  d_csv)

# show repos ----
# turn each link into a clickable anchor (hence escape = FALSE below)
d_links <- mutate(
  d,
  link = glue("<a href='{link}' target='_blank'>{link}</a>"))

# filter box + table only, all rows on one page
dt_repos <- datatable(
  d_links,
  escape  = FALSE,
  options = list(
    dom        = "ft",
    pageLength = nrow(d)))

# to_ds holds code, so show it in a monospace font
formatStyle(dt_repos, "to_ds", `font-family` = "monospace")

source: Living program document: CalCOFI Data Inventory_updated Oct2024 - Google Sheets

2 Issues

GoogleSheet typo under repo: “ERRDAP” -> “ERDDAP”
Repositories require login:
- ZooDB
- ZooScan
TODO:
- Add datasets from other repositories
- Extract JSON-LD from dataset links (see CalCOFI/workflows#24)
- Update lastmod to dataset’s last modified date
- Create an ODISCat entry
  
  (and point to the sitemap there). See steps at book.odis.org/gettingStarted.html. We’ll be driving the connection through that entry (to find your sitemap etc).
  – @jmckenna, per iodepo/odis-arch#461

3 Fetch datasets per repo

Always return a data frame with columns:

title: title of dataset
url: url to dataset

And attach the original data frame as an attribute datasets.

Code

# fetch datasets per repo ----
# Each `to_ds` label is looked up in this table instead of being parsed and
# evaluated as code, so an unexpected label fails with a clear message rather
# than executing arbitrary text.
ds_fetchers <- list(
  "erddap_ds(link)" = erddap_ds,
  "edi_ds(link)"    = edi_ds)

# one row per dataset: repositories without a helper are dropped, the helper
# is called on the repository link, and its title/url rows are unnested
datasets <- d |>
  filter(!is.na(to_ds)) |> # View()
  mutate(
    ds = map2(
      to_ds, link,
      \(to_ds, link){
        fetch <- ds_fetchers[[to_ds]]
        if (is.null(fetch)) {
          stop("No dataset fetcher registered for: ", to_ds, call. = FALSE)
        }
        fetch(link) } ) ) |>
  unnest(ds)

# write datasets to csv ----
write_csv(datasets, ds_csv)

# show datasets ----
# one clickable title per dataset, next to its repository
ds_links <- datasets |>
  mutate(
    dataset = glue("<a href='{url}' target='_blank'>{title}</a>")) |>
  select(repo, dataset)

datatable(ds_links, escape = FALSE)

3.1 Summary of datasets per repo

Code

# tabulate datasets per repo ----
# number of dataset rows per repository, saved alongside the other snapshots
dss <- count(datasets, repo, name = "n_datasets")
write_csv(dss, dss_csv)

# table only (no search box or paging controls)
datatable(
  dss,
  options = list(
    dom        = "t",
    pageLength = nrow(d)))

4 Write sitemap.xml

Build and Submit a Sitemap | Google Search Documentation
Protocol | sitemaps.org
- loc: dataset link
- lastmod: today’s date
  TODO: change to dataset’s last modified date
- changefreq: weekly; valid values: always, hourly, daily, weekly, monthly, yearly, never
- priority: SKIP; valid values: 0 to 1, e.g. 0.8

Code

# write sitemap.xml ----
datasets <- read_csv(ds_csv)

# Entity-escape the characters the sitemap protocol does not allow verbatim
# inside <loc> (a url containing e.g. "&" would otherwise make the file
# invalid XML). "&" is handled first so the "&" introduced by the other
# entities is not escaped a second time.
sitemap_escape <- function(x){
  x <- gsub("&",  "&amp;",  x, fixed = TRUE)
  x <- gsub("<",  "&lt;",   x, fixed = TRUE)
  x <- gsub(">",  "&gt;",   x, fixed = TRUE)
  x <- gsub("\"", "&quot;", x, fixed = TRUE)
  gsub("'", "&apos;", x, fixed = TRUE)
}

# one <url> element per dataset; lastmod is today's date
sm_body <- datasets |>
  mutate(
    loc = sitemap_escape(url)) |>
  glue_data(
    "<url>
      <loc>{loc}</loc>
      <lastmod>{Sys.Date()}</lastmod>
      <changefreq>weekly</changefreq>
    </url>") |>
  paste(collapse = "\n")

# the output file is passed by position: the `path` argument name is
# deprecated in current readr in favour of `file`
write_lines(
  c(
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    sm_body,
    '</urlset>'),
  sm_xml)

# copy entire datasets folder to _output
dir_from <- here("datasets")
dir_to   <- here("_output/datasets")
dir.create(dir_to, recursive = TRUE, showWarnings = FALSE)
# file.copy() places a copied directory *inside* an existing `to` directory,
# so the target is the parent of dir_to: the files then land in
# _output/datasets/ rather than _output/datasets/datasets/
file.copy(
  from      = dir_from,
  to        = dirname(dir_to),
  recursive = TRUE,
  overwrite = TRUE)

[1] TRUE

Datasets sitemap.xml:

calcofi.io/workflows/datasets/sitemap.xml

Contents of sitemap.xml:

<?xml version="1.0" encoding="UTF-8"?>
  <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
  <loc>https://oceanview.pfeg.noaa.gov/erddap/info/CAC_FI_SBAS_obs/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://oceanview.pfeg.noaa.gov/erddap/info/CAC_FI_SBAS_tr/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://oceanview.pfeg.noaa.gov/erddap/info/cciea_B_AS_DENS/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.17.4</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.78.5</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=edi.109.4</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.183.2</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.155.3</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.172.3</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.184.2</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.262.2</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.113.4</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.170.2</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.279.3</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.171.4</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.54.10</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.281.3</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.313.1</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.162.3</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.164.2</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.21.3</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.57.3</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.255.3</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.119.6</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.194.7</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.188.4</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.152.2</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.319.1</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.253.3</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.159.7</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.58.3</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.316.1</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.284.2</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.254.4</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.249.3</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.314.1</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.312.2</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.176.6</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.180.4</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.283.1</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.76.4</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.71.6</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.55.4</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.72.4</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.104.13</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.277.1</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.311.2</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.179.4</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.20.6</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.292.1</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-hfr.170.12</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.62.7</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=edi.1769.1</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-sbc.172.3</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-sbc.162.1</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-sbc.56.4</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFINOAAhydros/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFIcufes/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFIcruises/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFIeggcnt/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFIeggstg/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFIlrvcnt/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFIlrvsiz/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFIlrvstg/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFItowtyp/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFItows/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFIinvsiz/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFIinvcnt/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFIstns/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/erdCalCOFIzoovol/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/siocalcofiHydroBottle/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/siocalcofiHydroCast/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/FED_Rockfish_Catch/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/FRDCPSTrawlLHLengthFrequency/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/FRDCPSNearshoreSetCatch/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/FRDCPSTrawlLHSpecimen/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/FRDCPSTrawlLHHaulCatch/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/info/FRD_CPS_SDMs/index.html</loc>
  <lastmod>2025-07-02</lastmod>
  <changefreq>weekly</changefreq>
</url>
</urlset>

5 Create an ODISCat entry

5.1 NEW: Spatial Coverage using `ctd_casts.geom`

We’ll get the spatial extent for the entirety of the CalCOFI CTD casts going back to 1949.

-- earliest date, latest date and overall extent of geom across ctd_casts
select
  min(date)       as date_min,
  max(date)       as date_max,
  st_extent(geom) as bbox
from ctd_casts

date_min	date_max	bbox
1949-02-28	2020-01-26	BOX(-164.083333 18.416666,-105.966666 47.916666)

Looking at the JSON-LD in the page source of California Cooperative Oceanic Fisheries Investigations (CalCOFI) Database | InPort, the box is expected in minY minX maxY maxX order, so for spatial coverage we use:

"box": "18.4 -164.1 47.9 -106.0"

5.2 OLD: Spatial Coverage using `calcofi4r`

Per example under Essential Ocean Variables — The Ocean InfoHub Project and the development of the ODIS-architecture:

"spatialCoverage": {
        "@type": "Place",
        "geo": {
            "@type": "GeoShape",
            "description": "schema.org expects lat long (Y X) coordinate order",
            "polygon": "10.161667 142.014,18.033833 142.014,18.033833 147.997833,10.161667 147.997833,10.161667 142.014"
        },
        "additionalProperty": {
            "@type": "PropertyValue",
            "propertyID": "https://dbpedia.org/page/Spatial_reference_system",
            "value": "https://www.w3.org/2003/01/geo/wgs84_pos"
        }
    }

5.2.1 CalCOFI zones

Let’s use the CalCOFI zones described in the calcofi4r R package to construct the study envelope for the spatialCoverage term.

Code

librarian::shelf(
  calcofi/calcofi4r, dplyr, glue, leaflet, mapview, rmapshaper, sf)

# overlay cc_grid_ctrs on cc_grid_zones (zones colored by zone_key)
map_zones <- mapview(cc_grid_zones, zcol = "zone_key")
map_ctrs  <- mapview(cc_grid_ctrs, cex = 1)
map_zones + map_ctrs

5.2.2 Dissolve zones

Code

# dissolve zones into a single polygon
cc_ply <- st_union(cc_grid_zones)

# mapview() not working on cc_ply, so switching to leaflet()
# lmap(): draw `ply` as polygons over the Esri.OceanBasemap provider tiles
lmap <- function(ply){
  base_map <- addProviderTiles(leaflet(ply), providers$Esri.OceanBasemap)
  addPolygons(base_map)
}
lmap(cc_ply)

5.2.3 Remove holes

Code

# remove holes and cast to simple polygon
# x[1] keeps only the first element of each geometry in cc_ply -- presumably
# the exterior ring, which is what drops the holes; TODO confirm against the
# geometry type that st_union() returned. The result is rebuilt as a
# MULTIPOLYGON, given CRS 4326, then cast to POLYGON.
cc_ply <- st_multipolygon(lapply(cc_ply, function(x) x[1])) |> 
  st_sfc(crs = 4326) |> 
  st_cast("POLYGON")
# redraw to check the result visually
lmap(cc_ply)

Number of characters: 67442

5.2.4 Simplify

To reduce number of characters in text string.

Code

# simplify with a tolerance of 10*1000 (original note: simplify by 10 km),
# then keep only the first polygon
cc_ply <- st_simplify(cc_ply, preserveTopology = TRUE, dTolerance = 10*1000)
cc_ply <- st_cast(cc_ply, "POLYGON")
cc_ply <- slice(st_as_sf(cc_ply), 1)

# well-known text of the geometry, written with digits = 4
cc_txt <- st_as_text(st_geometry(cc_ply), digits = 4)

lmap(cc_ply)

Number of characters: 2543

5.2.5 Transform text

Transform to “schema.org expects lat long (Y X) coordinate order”, versus the default “X Y” order for well-known text (WKT) (and every other geospatial standard, including GeoJSON).

Original:

Code

# print the well-known text string as produced by st_as_text(), before the
# coordinate order is swapped
cc_txt

[1] "POLYGON ((-122.8 38.09, -122.9 38.14, -123 38.23, -123.7 38.9, -124 39.96, -124.4 40.44, -124.3 40.68, -124.2 40.85, -124.1 41.71, -124.5 42.85, -124.4 43.33, -124.3 43.39, -124.2 43.68, -124.1 43.71, -123.9 45.49, -123.9 45.74, -123.9 46.18, -123.6 46.21, -123.9 46.27, -124.1 46.27, -124 46.38, -123.9 46.55, -123.9 46.66, -123.8 46.67, -124 46.74, -124 46.83, -123.9 46.94, -124 47.01, -124.2 47.07, -124.7 48.39, -124.3 48.26, -124.3 48.48, -125.1 48.74, -125 48.91, -125.3 49.04, -125.4 48.94, -125.9 49.14, -133.6 46.14, -135.2 45.47, -133.6 43.74, -135.2 43.07, -133.7 41.34, -135.2 40.68, -130.7 35.48, -134.9 33.48, -127.9 24.82, -126.6 25.49, -125.4 23.75, -124.1 24.42, -122.9 22.69, -121.7 23.36, -120.5 21.62, -119.2 22.29, -118.1 20.56, -116.9 21.23, -115.8 19.49, -114.5 20.16, -113.5 18.43, -105.8 22.6, -106.9 23.8, -107.5 24.4, -107.7 24.49, -108 24.66, -108.1 24.77, -108.1 24.97, -108 25.03, -108.2 25.11, -108.3 25.11, -108.4 25.15, -108.4 25.25, -108.6 25.34, -108.7 25.36, -108.8 25.56, -109 25.47, -109.1 25.55, -109.3 25.66, -109.4 25.77, -109.3 26.13, -109.2 26.26, -109.1 26.22, -109.2 26.38, -109.4 26.64, -109.6 26.7, -109.8 26.77, -109.9 26.95, -109.9 27.06, -110.4 27.25, -110.5 27.3, -110.6 27.66, -110.6 27.73, -110.6 27.89, -110.9 27.9, -111.4 28.23, -112.2 29.07, -112.3 29.3, -112.8 30.06, -113.1 30.91, -113.1 31.2, -113.6 31.38, -114 31.65, -114.2 31.49, -114.4 31.64, -114.9 31.5, -114.8 30.98, -114.5 29.95, -113.6 29.09, -113.5 28.9, -113.4 28.93, -113.1 28.64, -112.9 28.4, -112.6 27.68, -112.3 27.33, -112 27.07, -111.9 26.81, -111.7 26.82, -111.4 26.21, -111.3 25.91, -111.3 25.76, -111.1 25.53, -111 25.52, -110.7 24.69, -110.6 24.22, -110.5 24.18, -110.3 24.18, -110.2 24.34, -110 24.04, -109.8 24.05, -109.7 23.64, -109.4 23.36, -109.8 22.92, -110.1 23.01, -110.4 23.61, -111.4 24.31, -111.5 24.37, -111.6 24.55, -111.8 24.55, -111.9 24.73, -112 24.76, -112.1 24.9, -112.1 25.06, -112.1 25.54, -112.1 25.72, -112.6 26.3, -113.2 26.76, -113.7 
26.77, -114.5 27.24, -114.8 27.62, -114.9 27.69, -115 27.86, -114.3 27.82, -114.1 27.71, -114.1 27.92, -114.3 27.92, -114.2 28, -114.1 28.12, -114.1 28.52, -115.3 29.53, -115.7 29.85, -115.9 30.44, -116 30.36, -116 30.48, -116.1 30.82, -116.3 31.09, -116.7 31.74, -116.6 31.84, -117.1 32.47, -117.1 32.57, -117.1 32.67, -117.3 32.76, -117.5 33.34, -118.2 33.73, -118.4 33.75, -118.5 34.04, -119.2 34.18, -119.7 34.4, -120.6 34.55, -120.7 35.15, -120.9 35.33, -121.9 36.4, -121.9 36.51, -122 36.61, -121.8 36.84, -122.4 37.22, -122.5 37.86, -122.8 38.09))"

Converted:

Code

# swap the two numbers of every coordinate pair ("a b" becomes "b a")
cc_txt <- gsub("([-0-9.]+)\\s+([-0-9.]+)", "\\2 \\1", cc_txt)
# drop the whitespace character that follows each comma
cc_txt <- gsub(",\\s", ",", cc_txt)
# strip the enclosing POLYGON (( ... )) wrapper, keeping only the coordinates
cc_txt <- gsub("POLYGON \\(\\((.*)\\)\\)", "\\1", cc_txt)
# nchar(cc_txt) # 2,476
cc_txt

[1] "38.09 -122.8,38.14 -122.9,38.23 -123,38.9 -123.7,39.96 -124,40.44 -124.4,40.68 -124.3,40.85 -124.2,41.71 -124.1,42.85 -124.5,43.33 -124.4,43.39 -124.3,43.68 -124.2,43.71 -124.1,45.49 -123.9,45.74 -123.9,46.18 -123.9,46.21 -123.6,46.27 -123.9,46.27 -124.1,46.38 -124,46.55 -123.9,46.66 -123.9,46.67 -123.8,46.74 -124,46.83 -124,46.94 -123.9,47.01 -124,47.07 -124.2,48.39 -124.7,48.26 -124.3,48.48 -124.3,48.74 -125.1,48.91 -125,49.04 -125.3,48.94 -125.4,49.14 -125.9,46.14 -133.6,45.47 -135.2,43.74 -133.6,43.07 -135.2,41.34 -133.7,40.68 -135.2,35.48 -130.7,33.48 -134.9,24.82 -127.9,25.49 -126.6,23.75 -125.4,24.42 -124.1,22.69 -122.9,23.36 -121.7,21.62 -120.5,22.29 -119.2,20.56 -118.1,21.23 -116.9,19.49 -115.8,20.16 -114.5,18.43 -113.5,22.6 -105.8,23.8 -106.9,24.4 -107.5,24.49 -107.7,24.66 -108,24.77 -108.1,24.97 -108.1,25.03 -108,25.11 -108.2,25.11 -108.3,25.15 -108.4,25.25 -108.4,25.34 -108.6,25.36 -108.7,25.56 -108.8,25.47 -109,25.55 -109.1,25.66 -109.3,25.77 -109.4,26.13 -109.3,26.26 -109.2,26.22 -109.1,26.38 -109.2,26.64 -109.4,26.7 -109.6,26.77 -109.8,26.95 -109.9,27.06 -109.9,27.25 -110.4,27.3 -110.5,27.66 -110.6,27.73 -110.6,27.89 -110.6,27.9 -110.9,28.23 -111.4,29.07 -112.2,29.3 -112.3,30.06 -112.8,30.91 -113.1,31.2 -113.1,31.38 -113.6,31.65 -114,31.49 -114.2,31.64 -114.4,31.5 -114.9,30.98 -114.8,29.95 -114.5,29.09 -113.6,28.9 -113.5,28.93 -113.4,28.64 -113.1,28.4 -112.9,27.68 -112.6,27.33 -112.3,27.07 -112,26.81 -111.9,26.82 -111.7,26.21 -111.4,25.91 -111.3,25.76 -111.3,25.53 -111.1,25.52 -111,24.69 -110.7,24.22 -110.6,24.18 -110.5,24.18 -110.3,24.34 -110.2,24.04 -110,24.05 -109.8,23.64 -109.7,23.36 -109.4,22.92 -109.8,23.01 -110.1,23.61 -110.4,24.31 -111.4,24.37 -111.5,24.55 -111.6,24.55 -111.8,24.73 -111.9,24.76 -112,24.9 -112.1,25.06 -112.1,25.54 -112.1,25.72 -112.1,26.3 -112.6,26.76 -113.2,26.77 -113.7,27.24 -114.5,27.62 -114.8,27.69 -114.9,27.86 -115,27.82 -114.3,27.71 -114.1,27.92 -114.1,27.92 -114.3,28 -114.2,28.12 -114.1,28.52 -114.1,29.53 
-115.3,29.85 -115.7,30.44 -115.9,30.36 -116,30.48 -116,30.82 -116.1,31.09 -116.3,31.74 -116.7,31.84 -116.6,32.47 -117.1,32.57 -117.1,32.67 -117.1,32.76 -117.3,33.34 -117.5,33.73 -118.2,33.75 -118.4,34.04 -118.5,34.18 -119.2,34.4 -119.7,34.55 -120.6,35.15 -120.7,35.33 -120.9,36.4 -121.9,36.51 -121.9,36.61 -122,36.84 -121.8,37.22 -122.4,37.86 -122.5,38.09 -122.8"

5.2.6 Enter `polygon` into `spatialCoverage`

Code

# Build the spatialCoverage JSON object with cc_txt as the polygon string.
# "{{" / "}}" are used as glue delimiters so the single braces of the JSON are
# kept literally. The template closes every object it opens, including the
# outermost one, so the result is a complete JSON object.
sc <- glue(
  '{
      "@type": "Place",
      "geo": {
          "@type": "GeoShape",
          "description": "schema.org expects lat long (Y X) coordinate order",
          "polygon": "{{cc_txt}}"
      },
      "additionalProperty": {
          "@type": "PropertyValue",
          "propertyID": "https://dbpedia.org/page/Spatial_reference_system",
          "value": "https://www.w3.org/2003/01/geo/wgs84_pos"
      }
  }',
  .open  = "{{",
  .close = "}}")
sc

{
   "@type": "Place",
   "geo": {
       "@type": "GeoShape",
       "description": "schema.org expects lat long (Y X) coordinate order",
       "polygon": "38.09 -122.8,38.14 -122.9,38.23 -123,38.9 -123.7,39.96 -124,40.44 -124.4,40.68 -124.3,40.85 -124.2,41.71 -124.1,42.85 -124.5,43.33 -124.4,43.39 -124.3,43.68 -124.2,43.71 -124.1,45.49 -123.9,45.74 -123.9,46.18 -123.9,46.21 -123.6,46.27 -123.9,46.27 -124.1,46.38 -124,46.55 -123.9,46.66 -123.9,46.67 -123.8,46.74 -124,46.83 -124,46.94 -123.9,47.01 -124,47.07 -124.2,48.39 -124.7,48.26 -124.3,48.48 -124.3,48.74 -125.1,48.91 -125,49.04 -125.3,48.94 -125.4,49.14 -125.9,46.14 -133.6,45.47 -135.2,43.74 -133.6,43.07 -135.2,41.34 -133.7,40.68 -135.2,35.48 -130.7,33.48 -134.9,24.82 -127.9,25.49 -126.6,23.75 -125.4,24.42 -124.1,22.69 -122.9,23.36 -121.7,21.62 -120.5,22.29 -119.2,20.56 -118.1,21.23 -116.9,19.49 -115.8,20.16 -114.5,18.43 -113.5,22.6 -105.8,23.8 -106.9,24.4 -107.5,24.49 -107.7,24.66 -108,24.77 -108.1,24.97 -108.1,25.03 -108,25.11 -108.2,25.11 -108.3,25.15 -108.4,25.25 -108.4,25.34 -108.6,25.36 -108.7,25.56 -108.8,25.47 -109,25.55 -109.1,25.66 -109.3,25.77 -109.4,26.13 -109.3,26.26 -109.2,26.22 -109.1,26.38 -109.2,26.64 -109.4,26.7 -109.6,26.77 -109.8,26.95 -109.9,27.06 -109.9,27.25 -110.4,27.3 -110.5,27.66 -110.6,27.73 -110.6,27.89 -110.6,27.9 -110.9,28.23 -111.4,29.07 -112.2,29.3 -112.3,30.06 -112.8,30.91 -113.1,31.2 -113.1,31.38 -113.6,31.65 -114,31.49 -114.2,31.64 -114.4,31.5 -114.9,30.98 -114.8,29.95 -114.5,29.09 -113.6,28.9 -113.5,28.93 -113.4,28.64 -113.1,28.4 -112.9,27.68 -112.6,27.33 -112.3,27.07 -112,26.81 -111.9,26.82 -111.7,26.21 -111.4,25.91 -111.3,25.76 -111.3,25.53 -111.1,25.52 -111,24.69 -110.7,24.22 -110.6,24.18 -110.5,24.18 -110.3,24.34 -110.2,24.04 -110,24.05 -109.8,23.64 -109.7,23.36 -109.4,22.92 -109.8,23.01 -110.1,23.61 -110.4,24.31 -111.4,24.37 -111.5,24.55 -111.6,24.55 -111.8,24.73 -111.9,24.76 -112,24.9 -112.1,25.06 -112.1,25.54 -112.1,25.72 -112.1,26.3 -112.6,26.76 -113.2,26.77 -113.7,27.24 -114.5,27.62 -114.8,27.69 -114.9,27.86 -115,27.82 -114.3,27.71 -114.1,27.92 -114.1,27.92 -114.3,28 -114.2,28.12 -114.1,28.52 
-114.1,29.53 -115.3,29.85 -115.7,30.44 -115.9,30.36 -116,30.48 -116,30.82 -116.1,31.09 -116.3,31.74 -116.7,31.84 -116.6,32.47 -117.1,32.57 -117.1,32.67 -117.1,32.76 -117.3,33.34 -117.5,33.73 -118.2,33.75 -118.4,34.04 -118.5,34.18 -119.2,34.4 -119.7,34.55 -120.6,35.15 -120.7,35.33 -120.9,36.4 -121.9,36.51 -121.9,36.61 -122,36.84 -121.8,37.22 -122.4,37.86 -122.5,38.09 -122.8"
   },
   "additionalProperty": {
       "@type": "PropertyValue",
       "propertyID": "https://dbpedia.org/page/Spatial_reference_system",
       "value": "https://www.w3.org/2003/01/geo/wgs84_pos"
}

5.3 Add ODIS record `3318`

Added record here:

catalogue.odis.org/view/3318
- print: CalCOFI on ODIS record 3318.pdf

screenshot…

6 Parse JSON-LD from dataset links

Code

# Load required libraries
librarian::shelf(
  dplyr, httr2, jsonlite, listviewer, readr, rvest, dplyr, purrr, tibble, tidyr, yaml)
# set to TRUE to rebuild dsi_csv even when the file already exists
redo_dsi <- FALSE

# Fetch a web page and return its first JSON-LD block as a YAML string.
#
# url: address of the page to fetch
# returns: a single character string (YAML); NA_character_, with a warning,
#          when the page does not answer with status 200 or has no
#          <script type="application/ld+json"> element. A length-one
#          character is always returned so the function can be used with
#          map_chr().
extract_jsonld <- function(url) {
  # url <- d$url[1]

  # Fetch the web page using httr2; HTTP error statuses are kept as ordinary
  # responses (httr2 would otherwise raise an R error) so that they reach the
  # status check below
  response <- request(url) |>
    req_error(is_error = \(resp) FALSE) |>
    req_perform()

  # Check if the request was successful
  if (resp_status(response) != 200) {
    warning(paste("Failed to retrieve the web page:", url))
    return(NA_character_)
  }

  # Parse the HTML content and find the script tag with type="application/ld+json"
  script_node <- response |>
    resp_body_string() |>
    read_html() |>
    html_node(
      xpath = "//script[@type='application/ld+json']")

  if (is.na(script_node)) {
    warning(paste("No JSON-LD found in the page:", url))
    return(NA_character_)
  }

  # Extract the JSON-LD content and serialise it as YAML
  html_text(script_node) |>
    fromJSON() |>
    as.yaml()
}

# (re)build the per-dataset JSON-LD table when requested, or when the CSV
# does not exist yet
if (redo_dsi || !file.exists(dsi_csv)){
  d <- select(read_csv(ds_csv), url)
  d <- mutate(d, jsonld_yaml = map_chr(url, extract_jsonld))

  write_csv(d, dsi_csv)
}

# show outputs
# parse each YAML string back into an R object, then browse the result as a
# list built from the url and jsonld columns
dsi <- mutate(
  read_csv(dsi_csv),
  jsonld = map(jsonld_yaml, yaml::yaml.load))

jsonedit(
  deframe(
    select(dsi, -jsonld_yaml)))

1 Read GoogleSheet of repos

2 Issues

3 Fetch datasets per repo

3.1 Summary of datasets per repo

4 Write sitemap.xml

5 Create an ODISCat entry

5.1 NEW: Spatial Coverage using ctd_casts.geom

5.2 OLD: Spatial Coverage using calcofi4r

5.2.1 CalCOFI zones

5.2.2 Dissolve zones

5.2.3 Remove holes

5.2.4 Simplify

5.2.5 Transform text

5.2.6 Enter polygon into spatialCoverage

5.3 Add ODIS record 3318

6 Parse JSON-LD from dataset links

5.1 NEW: Spatial Coverage using `ctd_casts.geom`

5.2 OLD: Spatial Coverage using `calcofi4r`

5.2.6 Enter `polygon` into `spatialCoverage`

5.3 Add ODIS record `3318`