Packages & setup

# packages
if (!require("librarian")){
  install.packages("librarian")
  library(librarian)
}
librarian::shelf(
  dplyr, DT, dygraphs, glue,googlesheets4, gstat, here, lubridate, mapview, purrr, readr, 
  raster, rmapshaper, sf, skimr, stars, stringr, tidyr)
select <- dplyr::select
mapviewOptions(fgb = FALSE)
# paths
dir_data <- switch(
  Sys.info()["nodename"],
  `ben-mbpro` = "/Users/bbest/My Drive (ben@ecoquants.com)/projects/calcofi/data",
  `Cristinas-MacBook-Pro.local` = "/Volumes/GoogleDrive/.shortcut-targets-by-id/13pWB5x59WSBR0mr9jJjkx7rri9hlUsMv/calcofi/data",
  `Bens-MacBook-Air.local` = "/Volumes/GoogleDrive/My Drive/projects/calcofi/data")
  # TODO: get Erin's Google Drive path and "nodename")

Read dataset keys from gdrive

bottle_key_csv <- file.path(dir_data, "/dataset_keys/bottle_field_descriptions.csv")
cast_key_csv   <- file.path(dir_data, "/dataset_keys/cast_field_descriptions.csv")

key_cast   <- read_csv(cast_key_csv)

key_bottle <- googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/18c6eSGRf0bSdraDocjn3-j1rUIxN2WKqxsEnPQCR6rA/edit#gid=2046976359") %>% 
  na_if("n.a.") %>% 
  mutate(
    dataset    = "bottle_cast",
    source_url = source_url[1])


key_DIC <- googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1SGfGMJUhiYZKIh746p5pGn7cD0cOcTA3g_imAnvyz88/edit#gid=0") %>% 
  na_if("n.a.") %>% 
  mutate(
    dataset    = "DIC_cast",
    source_url = source_url[1])

data_key <- bind_rows(key_bottle, key_DIC) %>% 
  select(dataset, field_name, title, description_abbv, everything())


# convert to var_lookup
var_lookup_key_tbl <- data_key %>% 
  filter(!is.na(description_abbv)) 
var_lookup_key <- var_lookup_key_tbl %>% 
  split(seq(nrow(.))) %>% 
  lapply(as.list)
names(var_lookup_key) <- var_lookup_key_tbl$field_name

var_lookup_tbl <- googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1ghM30pIdcsun7XWzLRKh4EilUplN60YdHayfiilPFwE/edit#gid=0") 
var_lookup <- var_lookup_tbl %>%
  split(seq(nrow(.))) %>% 
  lapply(as.list)
names(var_lookup) <- var_lookup_tbl$var

Read Oceano Bottle data from gdrive

# get data file paths from gdrive
bottle_csv <- file.path(dir_data, "/oceanographic-data/bottle-database/CalCOFI_Database_194903-202001_csv_22Sep2021/194903-202001_Bottle.csv")
cast_csv <- file.path(dir_data, "/oceanographic-data/bottle-database/CalCOFI_Database_194903-202001_csv_22Sep2021/194903-202001_Cast.csv")
bottle_cast_rds <- file.path(dir_data, "/oceanographic-data/bottle-database/bottle_cast.rds")
DIC_csv  <- file.path(dir_data, "/DIC/CalCOFI_DICs_200901-201507_28June2018.csv")

calcofi_geo           <- here("data/calcofi_oceano-bottle-stations_convex-hull.geojson")
calcofi_offshore_geo  <- here("data/calcofi_oceano-bottle-stations_convex-hull_offshore.geojson")
calcofi_nearshore_geo <- here("data/calcofi_oceano-bottle-stations_convex-hull_nearshore.geojson")

# check paths
stopifnot(dir.exists(dir_data))
stopifnot(any(file.exists(bottle_csv,cast_csv)))

# read data
d_cast   <- read_csv(cast_csv) %>% 
  separate(
    Sta_ID, c("Sta_ID_Line", "Sta_ID_Station"), 
    sep=" ", remove=F) %>% 
  mutate(
    Sta_ID_Line    = as.double(Sta_ID_Line),
    Sta_ID_Station = as.double(Sta_ID_Station))

d_bottle <- read_csv(bottle_csv, skip=1, col_names = F, guess_max = 1000000)
#d_bottle_problems() <- problems()
names(d_bottle) <- str_split(
  readLines(bottle_csv, n=1), ",")[[1]] %>% 
  str_replace("\xb5", "µ")

table(is.na(d_bottle$pH1))
## 
##  FALSE   TRUE 
##     84 889416
# FALSE   TRUE 
#    84 889416 
table(is.na(d_bottle$pH2))
## 
##  FALSE   TRUE 
##     10 889490
# FALSE   TRUE 
#    10 889490
table(is.na(d_bottle$O2ml_L))
## 
##  FALSE   TRUE 
## 719993 169507
#  FALSE   TRUE 
# 719993 169507
table(is.na(d_bottle$O2Sat))
## 
##  FALSE   TRUE 
## 685072 204428
# FALSE   TRUE 
# 685072 204428 
table(is.na(d_bottle[,'Oxy_µmol/Kg']))
## 
##  FALSE   TRUE 
## 685061 204439
#  FALSE   TRUE 
# 685061 204439

d_DIC <- read_csv(DIC_csv, skip=1, col_names = F, guess_max = 1000000)
names(d_DIC) <- str_split(
  readLines(DIC_csv, n=1), ",")[[1]] %>% 
  str_replace("\xb5", "µ")
# %>% 
#   separate(
#     `Line Sta_ID`, c("Sta_ID_Line", "Sta_ID_Station"), 
#     sep=" ", remove=F) %>% 
#   mutate(
#     Sta_ID_Line    = as.double(Sta_ID_Line),
#     Sta_ID_Station = as.double(Sta_ID_Station),
#     offshore       = ifelse(Sta_ID_Station > 60, T, F)) 

# Join data

# for now... ideally would join with all the data but this causes some issues with shared variables that have different values
bottle_cast <- d_cast %>% 
  left_join(
    d_bottle %>% select(-Sta_ID),
    by = "Cst_Cnt") %>% 
  mutate(Date = lubridate::as_date(Date, format = "%m/%d/%Y")) 

DIC_cast <- d_cast %>% 
  left_join(
    d_DIC %>% 
      select(-`Line Sta_ID`) %>% 
      rename(Depthm = `Depth(m)`),
    by = c("Cst_Cnt" = "ID")) %>% 
  mutate(Date = lubridate::as_date(Date, format = "%m/%d/%Y"))
saveRDS(bottle_cast, bottle_cast_rds)

Read whales, seabirds, turtles data from gdrive

Bird Mammal Census

bird_mamm_csv <- file.path(dir_data, "/whales-seabirds-turtles/bird-mammal-census/CalCOFI_bird-mammal-census_observations.csv")
transects_csv <- file.path(dir_data, "/whales-seabirds-turtles/bird-mammal-census/CalCOFI_bird-mammal-census_transects.csv")
key_bird_mamm_beh_csv <- file.path(dir_data, "/whales-seabirds-turtles/bird-mammal-census/CalCOFI_bird-mammal-census_behaviorcodes.csv")
key_bird_mamm_sp_csv <- file.path(dir_data, "/whales-seabirds-turtles/bird-mammal-census/CalCOFI_bird-mammal-census_allspecieslist.csv")

key_bird_mamm_beh <- read_csv(key_bird_mamm_beh_csv) 
key_bird_mamm_sp  <- read_csv(key_bird_mamm_sp_csv) 


# bird & mammal counts from CalCOFI cruises
d_bird_mamm <- read_csv(bird_mamm_csv, guess_max = 1000000)

# log of transect datetimes & locations from CalCOFI cruises
d_transects <- read_csv(transects_csv, guess_max = 1000000)

bird_mamm_transects <- d_bird_mamm %>% 
  left_join(
    d_transects, 
    by = "GIS key") %>% 
  left_join(
    key_bird_mamm_beh %>% 
      select(Behavior, Behavior_Description = Description), 
    by = "Behavior") %>%
  left_join(
    key_bird_mamm_sp %>% 
      mutate(across(
        c("Bird", "LargeBird", "Fish", "Mammal", "Include", "Unidentified"), 
        as.logical)), 
    by = "Species")

Exploratory summary of bottle data

# d_cast
skim(d_cast)
# d_bottle
skim(d_bottle)
# d_DIC
skim(d_DIC)
Data summary
Name d_DIC
Number of rows 2084
Number of columns 23
_______________________
Column type frequency:
character 3
numeric 20
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Line Sta_ID 0 1.00 11 11 0 33 0
Depth_ID 0 1.00 38 38 0 2084 0
DIC Quality Comment 2029 0.03 28 115 0 36 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ID 0 1.00 1058.07 613.45 1.00 521.75 1061.50 1585.25 2130.00 ▇▇▇▇▇
Cast_Index 0 1.00 33165.93 596.82 31785.00 32780.00 33312.00 33646.00 33973.00 ▂▂▅▆▇
Bottle_Index 0 1.00 833038.33 15412.45 796974.00 823082.75 836834.00 845385.50 853785.00 ▂▂▅▅▇
Cruise 0 1.00 201258.07 196.70 200808.00 201108.00 201304.00 201407.00 201507.00 ▂▁▆▃▇
Depth(m) 0 1.00 186.48 326.67 1.00 30.00 100.00 231.00 3542.00 ▇▁▁▁▁
DIC1 85 0.96 2153.24 113.00 1948.85 2028.33 2170.64 2253.81 2367.80 ▇▃▅▇▃
DIC2 1860 0.11 2168.15 154.85 1969.44 2008.98 2265.89 2315.52 2364.42 ▇▁▁▁▇
TA1 0 1.00 2256.06 34.84 2181.57 2230.32 2244.32 2278.50 2434.90 ▅▇▃▁▁
TA2 1850 0.11 2278.86 58.50 2198.15 2229.06 2247.50 2316.45 2437.00 ▇▁▇▁▁
pH1 2000 0.04 7.91 0.08 7.62 7.90 7.93 7.96 8.05 ▁▁▁▇▂
pH2 2074 0.00 7.95 0.02 7.92 7.93 7.95 7.96 7.99 ▇▂▃▃▂
Salinity1 0 1.00 33.76 0.40 32.84 33.42 33.73 34.15 34.68 ▂▇▃▇▁
Salinity2 1849 0.11 33.83 0.52 32.94 33.35 33.66 34.29 34.82 ▅▆▁▇▁
Temperature_degC 0 1.00 11.05 3.80 1.52 8.25 10.00 14.01 22.75 ▁▇▅▃▁
Bottle Salinity 0 1.00 33.77 0.40 32.84 33.42 33.74 34.15 34.68 ▂▇▃▇▁
Bottle O2(ml_L) 0 1.00 3.40 2.14 0.00 1.36 3.22 5.64 7.81 ▆▅▂▇▁
Bottle O2 (µmol/Kg) 0 1.00 148.28 93.39 0.00 59.31 140.12 245.95 340.42 ▆▅▂▇▁
Sigma-theta 0 1.00 25.74 0.98 22.98 24.91 25.96 26.58 27.78 ▁▅▅▇▂
DIC Bottle_ID1 0 1.00 4269.62 3603.41 1.00 390.75 2952.50 7503.25 10004.00 ▇▂▂▃▅
DIC Bottle_ID2 1852 0.11 3519.40 3441.65 2.00 331.25 2247.00 6907.50 9959.00 ▇▂▂▂▃

Get CINMS AOI

# get example AOI (Channel Islands NMS)
sanctuaries_geo <- "https://github.com/noaa-onms/onmsR/raw/12a87dfd4b90f2e3009ccb4913315fb2df7afddc/data-raw/sanctuaries.geojson"

cinms_ply <- sf::st_read(sanctuaries_geo) %>%
  dplyr::filter(nms == "CINMS")
## Reading layer `sanctuaries' from data source 
##   `https://github.com/noaa-onms/onmsR/raw/12a87dfd4b90f2e3009ccb4913315fb2df7afddc/data-raw/sanctuaries.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 16 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -15.38615 xmax: 180 ymax: 48.50589
## Geodetic CRS:  WGS 84
# get AOI geom points as WKT for later use with API
cinms_txt <- sf::st_as_text(cinms_ply$geometry)
#cinms_txt

mapview(cinms_ply) 
# get aoi
aoi = cinms_ply

Functions

Get Station IDs as points

# for summary, want to group by Sta_Code because each data point has a diff Sta_ID
get_pts <- function(data) {
  data %>% 
    filter(
    !is.na(Lat_Dec),
    !is.na(Lon_Dec)) %>% 
  group_by(
    Sta_ID) %>%
  summarize(
    lon            = mean(Lon_Dec),
    lat            = mean(Lat_Dec),
    Sta_ID_Line    = mean(Sta_ID_Line),
    Sta_ID_Station = mean(Sta_ID_Station)) %>%
  st_as_sf(
    coords = c("lon", "lat"), crs=4326, remove = F) %>% 
  mutate(
    offshore = ifelse(Sta_ID_Station > 60, T, F))
}


get_pts(bottle_cast) %>% mapview(zcol="offshore")
get_pts(DIC_cast) %>% mapview(zcol="offshore")