Author

Ben Best

1 CalCOFI Larvae

Constraints described by Andrew Thompson in CalCOFI Analysis for CINMS/MBNMS infographics (calcofi.io/larvae-cinms):

  • select only starboard samples from the bongo nets
    net2cruise.netside = 'S' (Port is usually ethanol sample vs formalin)
  • select a minimum date
    net2cruise.cruise_ymd >= '19780101'
    diff’t gear type, like oblique tow. internal table for

Questions:

  • Should we limit (or have option) to filter by linestation_core?

  • Should we limit (or have option) to limit by towtype?

    tbl(con, "net2cruise") |> pull(towtype) |> table()
         C1     CB     CV     MT     PV 
     28,640 18,819  6,402 12,757 12,065

    See tow_types table:

    • CB: standard bongo tow starting 1978-01-01
    • C1: previously used ringnet

1.1 Species Groups

Code
librarian::shelf(
  calcofi/calcofi4r,
  dplyr, DT, dygraphs, glue, here, mapview, purrr, readr, sf, tidyr,
  quiet = T)

source(here("../apps_dev/libs/db.R")) # con: database connection 

tbl(con, "species_groups") |> 
  group_by(spp_group) |> 
  summarize(n_spp = n()) |> 
  collect() |> 
  datatable()

1.2 Anchovy by Year-Month

Code
spp_group <- "Anchovy"
ym        <- "2005-04"

monthly_csv <- here(glue("data/larvae_{spp_group}-ym.csv"))
ym_geo      <- here(glue("data/larvae_{spp_group}-{ym}.geojson"))

if (any(!file.exists(c(monthly_csv, ym_geo)))){
  d <- tbl(con, "net2cruise") |> # colnames() |> paste(collapse=", ") 
    # netid, cruise_id, cruise_ymd, ship, line, stationid, station, latitude, longitude, orderocc, gebco_depth, netside, townumber, towtype, volsampled, shf, propsorted, starttime, geom
    filter(
      netside    == "S",
      cruise_ymd >= "19780101") |> 
    left_join(
      tbl(con, "larvae_species"),
      # netid, spccode, scientific_name, common_name, larvaecount, itis_tsn
      by="netid") |>
    left_join(
      tbl(con, "species_groups") |> 
        # spp_group, spccode
        filter(spp_group == !!spp_group), 
      by="spccode") |> 
    select(
      netid, longitude, latitude, starttime, volsampled,             # net2cruise
      spccode, scientific_name, common_name, larvaecount, itis_tsn,  # larvae_species
      spp_group) |>                                                  # species_groups
    arrange(starttime) |> 
    collect() |> # 27,721 × 6 <-  145,724 × 11 
    group_by(
      netid, longitude, latitude, starttime, volsampled) |>
    nest() |> 
    mutate(
      larvaecount = map_int(data, \(x){
        x |> 
          filter(
            !is.na(spp_group)) |> 
          summarize(
            n = sum(larvaecount, na.rm = T)) |> 
          pull(n) } ) )

  d_monthly <- d |> 
    select(-data) |> 
    mutate(
      ym_chr = format(starttime, "%Y-%m"),
      ym     = glue("{ym_chr}-01") |> as.Date()) |>
    group_by(ym) |> 
    summarize(
      vol = first(volsampled),
      n   = sum(larvaecount)) |> 
    mutate(
      n_log10         = log10(n + 0.001),
      n_per_vol       = n / vol,
      n_per_vol_log10 = log10(n_per_vol + 0.0001))
  write_csv(d_monthly, monthly_csv)
  
  pts_ym <- d |>  # 99 × 7 <- 27,721 × 7
    filter(
      format(starttime, "%Y-%m") == "2005-04") |> 
    select(-data) |>
    rename(
      n   = larvaecount,
      vol = volsampled) |> 
    mutate(
      n_log10         = log10(n + 0.001),
      n_per_vol       = n / vol,
      n_per_vol_log10 = log10(n_per_vol + 0.0001)) |>
    sf::st_as_sf(
      coords = c("longitude", "latitude"), 
      crs    = 4326,
      remove = F)
  write_sf(pts_ym, ym_geo, delete_dsn = T)
}

d_monthly <- read_csv(monthly_csv, show_col_types = F)

d_monthly |> 
  select(ym, n_per_vol) |> 
  arrange(ym) |> 
  dygraph(
    x = "ym", 
    y = "n_per_vol",
    main = glue("Larvae Counts by Month for {spp_group}"))

1.3 Map of Anchovy Larvae on 2005-04

Code
pts_ym <- read_sf(ym_geo)

mapView(
  pts_ym, 
  zcol        = "n_per_vol_log10", 
  col.regions = rev(RColorBrewer::brewer.pal(11,"Spectral")))

2 Copernicus Marine

The GLORYS12V1 product is the CMEMS global ocean eddy-resolving (1/12° horizontal resolution, 50 vertical levels) reanalysis covering the altimetry (1993 onward)… This product includes daily and monthly mean files for temperature, salinity, currents, sea level, mixed layer depth and ice parameters from the top to the bottom. The global ocean output files are displayed on a standard regular grid at 1/12° (approximatively 8 km) and on 50 standard levels.

2.1 Visualizing Copernicus in 4D (lon, lat, depth, time)

The Web Map Tile Service (WMTS) allows for interactive map rendering by variable, time and depth. It does not include the raw data values, so is ony for visualization.

The WMTS GetCapabilities gets parsed to collect available variables, depths (elevation) and dates (time) for the given product_id/dataset_id/variable.

See:

2.2 Subset raster of surface temperature for 2005-04

Using the CLI.

Code
librarian::shelf(
  glue, httr2, leaflet, lubridate, xml2)

# Global Ocean Physics Reanalysis
#   1993-01-01 to 2021-06-01
product_id = "GLOBAL_MULTIYEAR_PHY_001_030"
# dataset_id = "cmems_mod_glo_phy_my_0.083deg_P1D-m" # P1D: daily
dataset_id = "cmems_mod_glo_phy_my_0.083deg_P1M-m" # P1M: monthly

# Global Ocean Physics Analysis and Forecast
#   1 Nov 2020 to 31 May 2024
# product_id = "GLOBAL_ANALYSISFORECAST_PHY_001_024"
# dataset_id = "cmems_mod_glo_phy-thetao_anfc_0.083deg_P1M-m"

# my:    MY (Multi-Year/Reprocessed)
# myint: MY interim (MYINT) 
# nrt:   NRT (Near Real Time)

variable = "thetao"
date     = as.Date("2005-04-01") |> as.POSIXct()
  • product_id: GLOBAL_MULTIYEAR_PHY_001_030
  • dataset_id: cmems_mod_glo_phy_my_0.083deg_P1M-m
  • variable: thetao (temperature)
Code
librarian::shelf(
  calcofi/calcofi4r, 
  dplyr, glue, here, jsonlite, leaflet, lubridate, mapview, purrr, sf, terra,
  tibble, tidyr)

cm   = "/opt/homebrew/Caskroom/mambaforge/base/envs/copernicusmarine/bin/copernicusmarine"
user = "bbest1"
# pass = readLines("~/My Drive/private/data.marine.copernicus.eu_bbest1-password.txt")
pass = "temporary"
verbose = F

# CalCOFI bounding box
b <- st_union(cc_grid_zones) |> st_bbox()
# b
#       xmin       ymin       xmax       ymax 
# -135.23008   18.42757 -105.77692   49.23891

m_json <- here(glue("data/copernicusmarine/{dataset_id}.json"))
cmd <- glue("{cm} describe --include-datasets -c {dataset_id}")
if (!file.exists(m_json))
  system(cmd, intern = TRUE) |> writeLines(m_json)
m_o <- read_json(m_json)
# listviewer::jsonedit(m_o)

get_coord = function(o, v){
  if (!v %in% names(o))
    return(NA)
  if (length(o[[v]]) == 0)
    return(NA)
  o[[v]] |> 
    as.character()
}

coord_date <- function(df, col){
  
  df |> 
    filter(coord_id == "time") |> 
    pull(!!col) |> 
    (\(x) as.double(x) / 1000)() |>  
    as_datetime(origin = "1970-01-01", tz = "UTC") |> 
    as_date()
}

m_vars <- m_o |> 
  pluck("products", 1, "datasets", 1, "versions", 1, "parts", 1, "services") |> 
  keep(\(x) x$service_type$service_name == "arco-geo-series") |> 
  pluck(1, "variables")

d_vars_coords <- m_vars %>%
  tibble(
    short_name    = map_chr(., "short_name"),
    standard_name = map_chr(., "standard_name"),
    units         = map_chr(., "units"),
    coord         = map(., "coordinates") ) |> 
  select(-`.`) |> 
  unnest(coord) |> 
  mutate(
    coord_id            = map_chr(coord, "coordinates_id"),
    coord_units         = map_chr(coord, "units"),
    coord_min_val       = map_chr(coord, get_coord, "minimum_value"),
    coord_max_val       = map_chr(coord, get_coord, "maximum_value"),
    coord_step          = map_chr(coord, get_coord, "step"),
    coord_chunk_length  = map_chr(coord, get_coord, "chunking_length"),
    coord_chunk_type    = map_chr(coord, get_coord, "chunk_type"),
    coord_chunk_ref     = map_chr(coord, get_coord, "chunk_reference_coordinate"),
    coord_chunk_geo     = map_chr(coord, get_coord, "chunk_geometric_factor"),
    coord_values        = map(coord, "values"),
    coord_values_n      = map_int(coord_values, length))
#
# d_vars_coords |> select(short_name, coord_id) |> table()
#           coord_id
# short_name depth latitude longitude time
#    bottomT     1        1         1    1
#    mlotst      1        1         1    1
#    siconc      1        1         1    1
#    sithick     1        1         1    1
#    so          1        1         1    1
#    thetao      1        1         1    1
#    uo          1        1         1    1
#    usi         1        1         1    1
#    vo          1        1         1    1
#    vsi         1        1         1    1
#    zos         1        1         1    1
#
# d_vars_coords |> select(coord_id, coord_values_n) |> table()
#            coord_values_n
# coord_id     0 50
#   depth      0 11
#   latitude  11  0
#   longitude 11  0
#   time      11  0

d_vars <- d_vars_coords |> 
  nest(.by = c(short_name, standard_name, units)) |> 
  mutate(
    date_min = map_dbl(data, coord_date, "coord_min_val") |> 
      as.Date(),
    date_max = map_dbl(data, coord_date, "coord_max_val") |> 
      as.Date())

depths <- d_vars |> 
  filter(
    short_name == "thetao") |> 
  pull(data) |> 
  (\(x) x[[1]])() |> 
  filter(
    coord_id == "depth") |> 
  pull(coord_values) |>
  unlist()

# depths <- obj$products[[1]]$datasets[[1]]$versions[[1]]$parts[[1]]$services[[2]]$variables[[1]]$coordinates[[4]][["values"]] |> 
#   unlist()
depth <- depths[length(depths)] * -1

# output filename
nc <- here(glue("data/copernicusmarine/{dataset_id}__{variable}__{date}.nc"))

# https://help.marine.copernicus.eu/en/articles/7970514-copernicus-marine-toolbox-installation
# mamba update --name copernicusmarine copernicusmarine --yes
# copernicusmarine, version 1.2.2
cmd <- glue("
  {cm} subset -i {dataset_id} \\
  -x {b$xmin} -X {b$xmax}     \\
  -y {b$ymin} -Y {b$ymax}     \\
  -t {date}   -T {date + months(1) - days(1)}       \\
  -z {depth - 0.01}  -Z {depth +  - 0.01}      \\
  -v {variable}               \\
  -o {dirname(nc)} -f {basename(nc)} \\
  --username {user} --password '{pass}' \\
  --force-download")
if (verbose)
  print(cmd)
if (!file.exists(nc))
  system(cmd, intern = TRUE)

r <- rast(nc)
# dimensions  : 369, 353, 1  (nrow, ncol, nlyr)
# resolution  : 0.08333334, 0.08333334  (x, y)
# extent      : -135.2083, -105.7917, 18.45833, 49.20833  (xmin, xmax, ymin, ymax)
r_3857 <- leaflet::projectRasterForLeaflet(r, method = "bilinear")
# dimensions  : 403, 313, 1  (nrow, ncol, nlyr)
# resolution  : 10458.57, 10458.57  (x, y)

plet(r_3857, tiles = "Esri.WorldImagery")

3 Interpolate larvae on Copernicus grid

3.1 IDW(x,y)

Code
# basename(nc) # cmems_mod_glo_phy_my_0.083deg_P1M-m__thetao__2022-04-01.nc"

r_cm <- rast(nc)

r_idw <- terra::interpIDW(
  x = r_cm, 
  y = pts_ym |> terra::vect(),
  "n_per_vol_log10",
  # borrow from defaults in [calcofi4r:: pts_to_rast_idw()](https://github.com/CalCOFI/calcofi4r/blob/5e9508565a78785f8b374c3b8b5e2c4cdf559ed0/R/analyze.R#L236C5-L236C100)
  radius = 1.5, power = 1.3, smooth = 0.2,
  maxPoints = Inf, minPoints = 1, near = F, fill = NA) |> 
  mask(r_cm)
names(r_idw)     <- "n_per_vol_log10_idw"
# time(r_idw) # "2005-04-01 UTC"
varnames(r_idw)  <- ""
longnames(r_idw) <- ""
# units(r_idw) ""
# plot(r_idw)
  
pts_ymi <- r_idw |> 
  terra::extract(pts_ym, method = "simple", bind = T) |> 
  st_as_sf() |> 
  mutate(
    dif_idw = n_per_vol_log10 - n_per_vol_log10_idw)

# pts_ymi$dif_idw |> summary()
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
# -2.24324 -0.28024  0.00000  0.01276  0.38674  1.76034        6 

d_mi <- pts_ymi |> 
  st_drop_geometry() |>
  filter(!is.na(n_per_vol_log10_idw))

rmse_idw <- with(
  d_mi,
  sqrt( 
    sum((n_per_vol_log10_idw - n_per_vol_log10)^2) /
      nrow(d_mi) ) )
# [1] 0.7824212

mapView(
  r_idw, 
  zcol        = "n_per_vol_log10_idw", 
  col.regions = rev(RColorBrewer::brewer.pal(11,"Spectral"))) + 
mapView(
  pts_ymi, 
  zcol        = "n_per_vol_log10", 
  col.regions = rev(RColorBrewer::brewer.pal(11,"Spectral"))) +
mapView(
  pts_ymi, 
  zcol        = "dif_idw", 
  col.regions = rev(RColorBrewer::brewer.pal(11,"Spectral"))) 

Root Mean Square Error: 1.0986527

3.2 GAM(x, y)

Code
librarian::shelf(mgcv)

# fit GAM
mdl_xy <- mgcv::gam(
  n_per_vol_log10 ~ s(longitude, latitude, k = 60),
  data   = pts_ymi |> st_drop_geometry() |> 
    select(n_per_vol_log10, longitude, latitude), 
  method = "GCV.Cp", # method = "REML",
  family = gaussian())
# summary(mdl_xy)
# Family: gaussian 
# Link function: identity 
# 
# Formula:
# n_per_vol_log10 ~ s(longitude, latitude, k = 60)
# 
# Parametric coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  0.25360    0.06438   3.939 0.000117 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Approximate significance of smooth terms:
#                         edf Ref.df     F p-value    
# s(longitude,latitude) 12.91  18.26 11.79  <2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# R-sq.(adj) =  0.526   Deviance explained = 55.7%
# GCV = 0.86605  Scale est. = 0.80397   n = 194

p_idw <- r_idw |> 
  as.points() |> 
  st_as_sf() |> 
  mutate(
    longitude = sf::st_coordinates(geometry)[,1],
    latitude  = sf::st_coordinates(geometry)[,2])

# range(p_idw$n_per_vol_log10_idw)
# [1] -1.000000  2.413467

# predict with GAM
p_idw$n_per_vol_log10_xy <- predict.gam(
  mdl_xy, 
  newdata = p_idw |> 
    st_drop_geometry() |> 
    select(longitude, latitude), 
  type = "response") # link") # "response")
# range(p_idw$n_per_vol_log10_xy) # -4.334638  0.183662

r_xy <- rasterize(
  p_idw |> 
    st_drop_geometry() |> 
    select(longitude, latitude) |>
    as.matrix(),
  r_idw, 
  values = p_idw$n_per_vol_log10_xy)
# range(values(r_xy, na.rm=T)) # -4.334638  0.183662
# plot(r_xy)
names(r_xy)     <- "n_per_vol_log10_xy"

pts_ymix <- r_xy |> 
  terra::extract(pts_ymi, method = "simple", bind = T) |> 
  st_as_sf() |> 
  mutate(
    dif_xy = n_per_vol_log10 - n_per_vol_log10_xy)

# pts_ymix$dif_xy |> summary()
#      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
# -2.259856 -0.326382 -0.000841  0.024003  0.401710  2.446836         6 

d_mix <- pts_ymix |> 
  st_drop_geometry() |>
  filter(!is.na(n_per_vol_log10_idw))

rmse_xy <- with(
  d_mix,
  sqrt( 
    sum((n_per_vol_log10_xy - n_per_vol_log10)^2) /
      nrow(d_mix) ) )
# [1] 0.8445961

mapView(
  r_xy, 
  zcol        = "n_per_vol_log10_xy", 
  col.regions = rev(RColorBrewer::brewer.pal(11,"Spectral"))) + 
mapView(
  pts_ymix, 
  zcol        = "n_per_vol_log10", 
  col.regions = rev(RColorBrewer::brewer.pal(11,"Spectral"))) +
mapView(
  pts_ymix, 
  zcol        = "dif_xy", 
  col.regions = rev(RColorBrewer::brewer.pal(11,"Spectral"))) 

Root Mean Square Error: 1.1494105

3.3 GAM(x, y, temperature)

Code
librarian::shelf(mgcv)

names(r_cm) <- "thetao"

pts_ymixt <- r_cm |> 
  terra::extract(pts_ymix, method = "simple", bind = T) |> 
  st_as_sf()

# fit GAM
mdl_xyt <- mgcv::gam(
  n_per_vol_log10 ~ s(longitude, latitude, k = 60) + s(thetao),
  data   = pts_ymixt |> st_drop_geometry() |> 
    select(n_per_vol_log10, longitude, latitude, thetao), 
  method = "GCV.Cp", # method = "REML",
  family = gaussian())
# summary(mdl_xyt)
# Family: gaussian 
# Link function: identity 
# 
# Formula:
# n_per_vol_log10 ~ s(longitude, latitude, k = 60) + s(thetao)
# 
# Parametric coefficients:
#             Estimate Std. Error t value Pr(>|t|)    
# (Intercept)  0.25181    0.06219   4.049 7.84e-05 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Approximate significance of smooth terms:
#                          edf Ref.df      F p-value    
# s(longitude,latitude) 12.767 18.199 12.505  <2e-16 ***
# s(thetao)              6.151  7.214  1.556   0.146    
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# R-sq.(adj) =  0.576   Deviance explained = 61.9%
# GCV = 0.81327  Scale est. = 0.7271    n = 188

p_cm <- r_cm |> 
  mask(r_idw) |>
  trim() |> 
  as.points() |> 
  st_as_sf() |> 
  mutate(
    longitude = sf::st_coordinates(geometry)[,1],
    latitude  = sf::st_coordinates(geometry)[,2])

# range(p_idw$n_per_vol_log10_idw)
# [1] -1.000000  2.413467

# predict with GAM
p_cm$n_per_vol_log10_xyt <- predict.gam(
  mdl_xyt, 
  newdata = p_cm |> 
    st_drop_geometry() |> 
    select(longitude, latitude, thetao), 
  type = "response") # link") # "response")
# range(p_cm$n_per_vol_log10_xyt) # -5.369945  1.165597

r_xyt <- rasterize(
  p_cm |> 
    st_drop_geometry() |> 
    select(longitude, latitude) |>
    as.matrix(),
  r_idw, 
  values = p_cm$n_per_vol_log10_xyt)
# range(values(r_xyt, na.rm=T)) # -5.369945  1.165597
# plot(r_xy)
names(r_xyt)     <- "n_per_vol_log10_xyt"

pts_ymixt <- r_xyt |> 
  terra::extract(pts_ymixt, method = "simple", bind = T) |> 
  st_as_sf() |> 
  mutate(
    dif_xyt = n_per_vol_log10 - n_per_vol_log10_xyt)

# pts_ymixt$dif_xyt |> summary()
#      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
# -3.596383 -0.429587 -0.003567  0.002183  0.587119  2.562442         6
     
     
d_mixt <- pts_ymixt |> 
  st_drop_geometry() |>
  filter(!is.na(n_per_vol_log10_idw))

rmse_xyt <- with(
  d_mixt,
  sqrt( 
    sum((n_per_vol_log10_xyt - n_per_vol_log10)^2) /
      nrow(d_mixt) ) )
# rmse_xyt
# [1] 0.8054934

mapView(
  r_xyt, 
  zcol        = "n_per_vol_log10_xyt", 
  col.regions = rev(RColorBrewer::brewer.pal(11,"Spectral"))) + 
mapView(
  pts_ymixt, 
  zcol        = "n_per_vol_log10", 
  col.regions = rev(RColorBrewer::brewer.pal(11,"Spectral"))) +
mapView(
  pts_ymixt, 
  zcol        = "dif_xyt", 
  col.regions = rev(RColorBrewer::brewer.pal(11,"Spectral"))) 

Root Mean Square Error: 1.103813