Generate a datasets sitemap.xml for ODIS crawling

Goal: Generate datasets/sitemap.xml listing the datasets that are already held in authoritative repositories serving JSON-LD content, so that ODIS can crawl them.

1 Read GoogleSheet of repos

Columns from original GoogleSheet:

  • repo: repository name
  • link: url to repository

Extra columns added by this script:

  • to_ds: custom function to fetch datasets from repository
  • status: HTTP status of the repository link, i.e. whether it is accessible (OK) or not (Not Found); only added when the check is enabled in code with ck_status <- TRUE
Code
# libraries ----
librarian::shelf(
  curl, dplyr, DT, here, httr2, glue, googlesheets4, janitor, knitr, purrr, 
  readr, stringr, tidyr)
# silence readr's column specification message on every read_csv() call
options(readr.show_col_types = FALSE)

# variables ----
# Google Sheet listing the repositories
d_gs      <- "https://docs.google.com/spreadsheets/d/1uhviF2ecfOqGaSbC_JE8B5jPRqqjMaFc9TNCK_m297c/edit?gid=1271784325#gid=1271784325"
# repositories: first a copy of the sheet, then overwritten with repo, to_ds, link
d_csv     <- here("datasets/repo_links.csv")
# datasets fetched from every repository
ds_csv    <- here("datasets/repo_datasets.csv")
# number of datasets per repository
dss_csv   <- here("datasets/repo_datasets_summary.csv")
# sitemap generated from the fetched datasets
sm_xml    <- here("datasets/sitemap.xml")
# set to TRUE to request every repository link and record its HTTP status
ck_status <- FALSE

# helper functions ----
# List the public datasets behind an ERDDAP listing page.
#
# link: URL containing "index.html"; the same listing is requested with
#   "index.csv" substituted so it can be read as a table.
# Returns a data frame with columns `title` and `url`, one row per dataset
#   whose `Accessible` value is "public". The full listing, including the
#   derived `pfx` and `url` columns, is attached as attribute "datasets".
erddap_ds <- function(link) {
  # e.g. link = d$link[3]
  csv_url <- str_replace(link, fixed("index.html"), "index.csv")

  listing <- read_csv(csv_url) |>
    filter(Accessible == "public") |>
    mutate(
      # use the tabledap URL when present, otherwise the griddap URL
      pfx = coalesce(tabledap, griddap),
      url = glue("{pfx}.html"))

  structure(
    select(listing, title = Title, url),
    datasets = listing)
}

# List the data packages returned by an EDI repository portal search.
#
# link: URL of a portal search whose path contains "simpleSearch".
# Returns a data frame with columns `title` and `url`. The full search
#   result, including the derived `url` column, is attached as attribute
#   "datasets".
edi_ds <- function(link) {
  # e.g. link = d$link[2]
  parts <- url_parse(link)

  # same search, but through the endpoint that is read as csv below
  parts$path <- str_replace(parts$path, "simpleSearch", "downloadSearch")

  # fold the original query ("k1=v1&k2=v2...") into one `q` parameter;
  # url_build() is relied on to encode it
  query_string <- paste(
    names(parts$query), parts$query, sep = "=", collapse = "&")
  parts$query <- list(q = query_string)

  packages <- read_csv(url_build(parts)) |>
    mutate(
      url = glue("https://portal.edirepository.org/nis/mapbrowse?packageid={packageid}"))

  structure(
    select(packages, title, url),
    datasets = packages)
}

# read googlesheet repos ----
# make sure the output folder exists (stay quiet when it already does)
dir.create(dirname(d_csv), showWarnings = FALSE)

# the sheet is read without authentication and cached as csv
gs4_deauth()
write_csv(read_sheet(d_gs), d_csv)

# choose the fetch helper for each repo from its link; repos matching neither
# pattern get NA
d <- read_csv(d_csv) |>
  clean_names() |> # for now, simply translates to lower case
  mutate(
    to_ds = case_when(
      str_detect(link, "erddap")        ~ "erddap_ds(link)",
      str_detect(link, "edirepository") ~ "edi_ds(link)",
      .default = NA)) |>
  relocate(to_ds, .after = repo)

# optionally record the HTTP status description of each repository link
if (ck_status) {
  d <- d |>
    mutate(
      status = map_chr(
        link,
        \(x) {
          request(x) |>
            # httr2 turns 4xx/5xx responses into R errors by default, which
            # would abort the script instead of recording e.g. "Not Found"
            req_error(is_error = \(resp) FALSE) |>
            req_perform() |>
            resp_status_desc()
        }))
}

# write repos to csv ----
# only these three columns are persisted
write_csv(select(d, repo, to_ds, link), d_csv)

# show repos ----
# links are rendered as clickable anchors, hence escape = FALSE
d |>
  mutate(link = glue("<a href='{link}' target='_blank'>{link}</a>")) |>
  datatable(
    escape  = FALSE,
    options = list(dom = "ft", pageLength = nrow(d))) |>
  formatStyle("to_ds", `font-family` = "monospace")

2 Issues

  • GoogleSheet typo under repo: “ERRDAP” -> “ERDDAP”
  • Repositories require login:
  • TODO:

3 Fetch datasets per repo

Always return a data frame with columns:

  • title: title of dataset
  • url: url to dataset

The original data frame returned by the repository is also attached as an attribute named datasets.

Code
# fetch datasets per repo ----
# run each repo's fetch helper on its link and stack the results;
# repos without a helper (to_ds is NA) are left out
datasets <- d |>
  filter(!is.na(to_ds)) |>
  mutate(
    ds = map2(
      to_ds, link,
      # to_ds holds a call such as "erddap_ds(link)"; evaluating it here
      # resolves `link` to this function's own argument
      function(to_ds, link) eval(parse(text = to_ds)))) |>
  unnest(ds)

# write datasets to csv ----
write_csv(datasets, ds_csv)

# show datasets ----
# titles are rendered as clickable anchors, hence escape = FALSE
datasets |>
  mutate(dataset = glue("<a href='{url}' target='_blank'>{title}</a>")) |>
  select(repo, dataset) |>
  datatable(escape = FALSE)

3.1 Summary of datasets per repo

Code
# tabulate datasets per repo ----
# one row per repo with its number of datasets
dss <- count(datasets, repo, name = "n_datasets")

dss |>
  write_csv(dss_csv)

# plain table without search box or paging controls
datatable(
  dss,
  options = list(dom = "t", pageLength = nrow(d)))

4 Write sitemap.xml

Code
# write sitemap.xml ----
# start from the csv so this step can be run on its own
datasets <- read_csv(ds_csv)

# Replace the five XML special characters by their entities, as the sitemap
# protocol requires for all values. "&" goes first so the entities produced
# by the later replacements are not escaped a second time.
escape_xml <- function(x) {
  x <- gsub("&",  "&amp;",  x, fixed = TRUE)
  x <- gsub("<",  "&lt;",   x, fixed = TRUE)
  x <- gsub(">",  "&gt;",   x, fixed = TRUE)
  x <- gsub("\"", "&quot;", x, fixed = TRUE)
  gsub("'", "&apos;", x, fixed = TRUE)
}

# one <url> entry per dataset, all stamped with today's date
sm_body <- datasets |>
  mutate(url = escape_xml(url)) |>
  glue_data(
    "<url>
      <loc>{url}</loc>
      <lastmod>{Sys.Date()}</lastmod>
    </url>") |> 
  paste(collapse = "\n")

# `file` replaces the `path` argument, which readr has deprecated
write_lines(
  c(
    glue('
      <?xml version="1.0" encoding="UTF-8"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'), 
    sm_body, 
    "</urlset>"), 
  file = sm_xml)

Datasets sitemap.xml:

Contents of sitemap.xml:

<?xml version="1.0" encoding="UTF-8"?>
  <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
  <loc>https://oceanview.pfeg.noaa.gov/erddap/tabledap/CAC_FI_SBAS_obs.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://oceanview.pfeg.noaa.gov/erddap/tabledap/CAC_FI_SBAS_tr.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://oceanview.pfeg.noaa.gov/erddap/tabledap/cciea_B_AS_DENS.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.17.4</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.78.5</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=edi.109.4</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.183.2</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.155.3</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.172.3</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.184.2</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.262.2</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.113.4</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.170.2</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.279.3</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.171.4</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.54.7</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.281.3</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.313.1</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.162.3</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.164.2</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.21.3</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.57.3</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.119.5</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.194.6</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.255.3</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.159.6</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.188.4</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.253.2</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.152.2</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.319.1</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.58.3</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.316.1</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.284.2</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.254.4</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.249.3</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.314.1</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.312.2</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.176.6</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.180.4</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.283.1</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.76.4</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.71.6</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.55.4</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.72.4</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.104.12</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.277.1</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.311.2</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.179.4</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.20.6</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.292.1</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-hfr.170.12</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cce.62.7</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=edi.1769.1</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-sbc.172.2</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-sbc.162.1</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-sbc.56.4</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFINOAAhydros.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFIcufes.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFIcruises.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFIeggcnt.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFIeggstg.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFIlrvcnt.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFIlrvsiz.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFIlrvstg.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFItowtyp.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFItows.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFIinvsiz.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFIinvcnt.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFIstns.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/erdCalCOFIzoovol.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/siocalcofiHydroBottle.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/siocalcofiHydroCast.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/FED_Rockfish_Catch.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/FRDCPSTrawlLHLengthFrequency.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/FRDCPSTrawlLHSpecimen.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/tabledap/FRDCPSTrawlLHHaulCatch.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
<url>
  <loc>https://coastwatch.pfeg.noaa.gov/erddap/griddap/FRD_CPS_SDMs.html</loc>
  <lastmod>2024-10-22</lastmod>
</url>
</urlset>