Goal: Systematically evaluate and clean-up the database, including renaming tables and columns, and documenting relationships, per naming conventions being established in CalCOFI/docs: Database.

Code
source(here::here("../apps_dev/libs/db.R")) # con: database connection 

librarian::shelf(
  calcofi/calcofi4r,
  dm, DT, glue, purrr, stringr, tidyr)

1 Transitionary Tables

Let’s evaluate tables that are in transition, i.e. containing in the table name: “old”, “new”, “backup”, “temp”, “temp”, or “test”.

Code
d_tbls <- tibble(
  table = dbListTables(con)) |> 
  mutate(
    subset = case_when(
      str_detect(table, "old")      ~ "old",
      str_detect(table, "new")      ~ "new",
      str_detect(table, "backup")   ~ "backup",
      str_detect(table, "temp|tmp") ~ "temp",
      str_detect(table, "test")     ~ "test",
      TRUE ~ "base")) |> 
  arrange(subset, table)
table(d_tbls$subset)

backup   base    new    old   temp   test 
     3     68      5      4      3      1 
Code
d_tbls |> 
  filter(subset != "base")
# A tibble: 16 × 2
   table                subset
   <chr>                <chr> 
 1 egg_counts_backup    backup
 2 larvae_counts_backup backup
 3 nets_backup          backup
 4 newnet2ctdcast       new   
 5 scrippscast_new      new   
 6 species_codes_new    new   
 7 species_new          new   
 8 tow_types_new        new   
 9 egg_counts_old       old   
10 larvae_counts_old    old   
11 stations_old         old   
12 tows_old             old   
13 net_uuids_tmp        temp  
14 tempnet2ctd          temp  
15 tmp_out              temp  
16 test_rast            test  
Code
tbls_other <- d_tbls |> 
  filter(subset != "base") |> 
  pull(table)
cc_db_catalog(tbls_other)
  • TODO: Evaluate these tables and decide what to do with them, i.e. rename or drop.

2 Base Tables

Code
tbls_base <- d_tbls |> 
  filter(subset == "base") |> 
  pull(table) %>% 
  c(., "species_codes_new") |> 
  sort()
tibble(
  table = tbls_base)
# A tibble: 69 × 1
   table              
   <chr>              
 1 aoi_fed_sanctuaries
 2 core_stations      
 3 cruises            
 4 cruises_uuids      
 5 ctd_bottles        
 6 ctd_cast_bottles   
 7 ctd_casts          
 8 ctd_dic            
 9 ctd_profiles       
10 dataset_keywords   
# ℹ 59 more rows
Code
cc_db_catalog(tables = tbls_base)

2.1 taxonomic

Let’s evaluate tables that are taxonomic in nature, including “species”, “taxa”.

Code
d_tbls <- tibble(
  table = dbListTables(con)) |> 
  mutate(
    subset = case_when(
      str_detect(table, "species") ~ "species",
      str_detect(table, "taxa")    ~ "taxa")) |>
  filter(
    !is.na(subset)) |> 
  arrange(subset, table) |> 
  relocate(subset)
datatable(d_tbls)
Code
cc_db_catalog(d_tbls$table)

2.2 *_uuids

Let’s look at redundancy of tables labeled with “uuids”.

Code
tbls_uuids <- str_subset(dbListTables(con), "uuids")
tbls_pfx <- str_replace(tbls_uuids, "(.*)_uuids.*", "\\1")

d_tbls <- tibble(
  table = dbListTables(con)) |> 
  mutate(
    subset = case_when(
      table %in% tbls_uuids ~ "uuids",
      table %in% tbls_pfx   ~ "prefix")) |>
  filter(
    !is.na(subset)) |> 
  arrange(table, subset)
datatable(d_tbls)
Code
cc_db_catalog(d_tbls$table)

2.3 Proposed renames

  • Generalize from “species” to “taxa” (or “taxon” singular), since not all species_codes are at taxonomic level of species. OR operate from understanding that species could refer to other taxon ranks.

  • Follow conventions at calcofi.io/docs: Database.

  • See CalCOFI/workflows: modify_db.R for previous SQL commands, including renames, and dates.

  • TODO: Consider other changes and how to document. For instance, perhaps track changes in the pg_dump of only data definition language (DDL) for the whole database (i.e., CREATE TABLE SQL statements), and see git differences over time.

    pg_dump -U user_name -h host database -s -t table_or_view_names -f table_or_view_names.sql
Code
# TODO: species_codes -> taxa
q("ALTER TABLE species_codes RENAME TO taxa")
# TODO: species.spccode -> sp_key
q("ALTER TABLE taxa RENAME COLUMN spccode TO taxa_key")

3 Relationships (dm R package)

3.1 Relationships, "keys_only"

Code
con_dm <- dm_from_con(
  con, 
  table_names = tbls_base,
  learn_keys  = T)
con_dm
── Table source ────────────────────────────────────────────────────────────────
src:  postgres  [admin@localhost:5432/gis]
── Metadata ────────────────────────────────────────────────────────────────────
Tables: `aoi_fed_sanctuaries`, `core_stations`, `cruises`, `cruises_uuids`, `ctd_bottles`, … (69 total)
Columns: 642
Primary keys: 14
Foreign keys: 5
Code
dm_draw(con_dm, view_type = "keys_only")
%0 aoi_fed_sanctuaries aoi_fed_sanctuaries core_stations core_stations cruises cruises cruise_id cruises_uuids cruises_uuids ctd_bottles ctd_bottles btl_cnt ctd_cast_bottles ctd_cast_bottles ctd_casts ctd_casts cast_count ctd_dic ctd_dic ctd_profiles ctd_profiles dataset_keywords dataset_keywords dataset_methods dataset_methods dataset_persons dataset_persons dataset_tables dataset_tables datasets datasets effort_areas effort_areas effort_ctrs effort_ctrs effort_grid effort_grid effort_zones effort_zones egg_counts egg_counts netid nets nets towid netid egg_counts:netid->nets:netid egg_species egg_species egg_species_uuids egg_species_uuids egg_uuids egg_uuids field_labels field_labels geography_columns geography_columns geometry_columns geometry_columns grd_a grd_a rid grd_gcs grd_gcs rid grd_mer grd_mer rid idw_stats idw_stats juvenile_organisms juvenile_organisms keywords keywords krill_abundances krill_abundances larvae_counts larvae_counts netid larvae_counts:netid->nets:netid larvae_species larvae_species larvae_species_uuids larvae_species_uuids larvae_uuids larvae_uuids mature_organisms mature_organisms methods methods net_uuids net_uuids net2cruise net2cruise net2ctdcast net2ctdcast netcastdict netcastdict tows tows towid stationid nets:towid->tows:towid persons persons places places r_mer100km r_mer100km rid r_sta_cnt r_sta_cnt rid raster_columns raster_columns raster_overviews raster_overviews samples samples scrippscast scrippscast ships ships spatial_ref_sys spatial_ref_sys srid species_codes_new species_codes_new species_groups species_groups station_uuids station_uuids stations stations stationid cruise_id stations:cruise_id->cruises:cruise_id stations_hull stations_hull stations_order stations_order taxa taxa taxa_hierarchy taxa_hierarchy tow_types tow_types tow_uuids tow_uuids tows:stationid->stations:stationid tsn_list tsn_list users users email uunet2cruise uunet2cruise uunet2ctd uunet2ctd z_idw z_idw rid

3.2 Relationships, "all"

This is too much.

Code
# TODO: make SVG zoomable
dm_draw(con_dm, view_type = "all")
%0 aoi_fed_sanctuaries aoi_fed_sanctuaries sanctuary nms geom core_stations core_stations sta_id cruises cruises cruise_id cruise_ymd ship cruises_uuids cruises_uuids cruise_id cruise_ymd ship ctd_bottles ctd_bottles cast_count btl_cnt sta_id depth_id depth_m t_degc salinity o2ml_l stheta o2sat oxy_umolperkg btlnum recind t_prec t_qual s_prec s_qual p_qual o_qual sthtaq o2satq chlora chlqua phaeop phaqua po4um po4q sio3um sio3qu no2um no2q no3um no3q nh3um nh3q c14as1 c14a1p c14a1q c14as2 c14a2p c14a2q darkas darkap darkaq meanas meanap meanaq inctim lightp r_depth r_temp r_sal r_dynht r_nuts r_oxy_umolkg dic1 dic2 ta1 ta2 ph1 ph2 dic_q_comment ctd_cast_bottles ctd_cast_bottles cast_count t_degc salinity depth_m date geom ctd_casts ctd_casts cast_count cruiseid cruise cruz_sta dbsta_id castid sta_id quarter sta_code distance date year month juliandate julianday time latitude latdeg latmin lathem longitude londeg lonmin lonhem rptline stline acline rptsta ststa acsta bottomdepth secchi foreiu shipname shipcode datatype orderocc eventnum cruzleg origstaid dataor cruznum intchl intc14 incstr incend pstlan civilt timezone wavedir waveht waveprd winddir windspd barometer dryt wett wea cloudtype cloudamount visibility geom datetime geom_3857 ctd_dic ctd_dic id cast_index bottle_index cruise depth_m line_sta_id depth_id dic1 dic2 ta1 ta2 ph1 ph2 salinity1 salinity2 temp_degc btlsalinity btlo2_ml_per_l btlo2_umol_per_kg sigma_theta dic_btl_id1 dic_btl_id2 dic_quality_comment ctd_profiles ctd_profiles project study ord_occ event_num cast_id date_time_utc date_time_pst lat_dec lon_dec sta_id line sta depth pressure prq temp1 temp1q temp2 temp2q tempave salt1 salt1q salt1_corr salt2 salt2q salt2_corr saltave_corr ox1 ox1q ox1_cruisecorr ox1_stacorr ox2 ox2q ox2_cruisecorr ox2_stacorr oxave_stacorr ox1um ox1um_cruisecorr ox1um_stacorr ox2um ox2um_cruisecorr ox2um_stacorr oxaveum_stacorr fluorv fluorq estchl_cruisecorr estchl_stacorr isusv isusq estno3_cruisecorr estno3_stacorr sigthetats1 sigthetats1q sigthetats2 sigthetats2q bat xmiss transq ph phq spar sparq par parq pot1 pot2 dynht sva oxsat1 oxsat2 btl_depth btl_temp saltb oxb oxbum chl_a phaeo no3 no2 nh4 po4 sil cast_count cast_cnt dataset_keywords dataset_keywords id dataset_id keyword_id dataset_methods dataset_methods id dataset_id method_id dataset_persons dataset_persons id dataset_id person_id role description dataset_tables dataset_tables id dataset_id table_name table_description datasets datasets id label title abstract startdate enddate effort_areas effort_areas area_key sta_pattern sta_shore sta_dpos sta_lin_min sta_lin_max sta_pos_min sta_pos_max geom effort_ctrs effort_ctrs sta_key sta_lin sta_pos sta_dpos sta_shore sta_pattern geom zone_key effort_grid effort_grid sta_key sta_lin sta_pos sta_dpos sta_shore sta_pattern geom zone_key effort_zones effort_zones zone_key sta_pattern sta_shore sta_dpos sta_lin_min sta_lin_max sta_pos_min sta_pos_max geom egg_counts egg_counts netid spccode tally nets nets towid netid netside shf volsampled propsorted egg_counts:netid->nets:netid egg_species egg_species netid spccode scientific_name common_name eggcount itis_tsn egg_species_uuids egg_species_uuids netid sppcode scientific_name common_name eggcount itis_tsn egg_uuids egg_uuids netid sppcode tally field_labels field_labels active category table_field plot_title plot_label plot_color color_palette geography_columns geography_columns f_table_catalog f_table_schema f_table_name f_geography_column coord_dimension srid type geometry_columns geometry_columns f_table_catalog f_table_schema f_table_name f_geometry_column coord_dimension srid type grd_a grd_a rid rast grd_gcs grd_gcs rid name rast grd_mer grd_mer rid rast idw_stats idw_stats hash ply_geo rast_tif args dtime_created dtime_last_viewed dtime_last_downloaded n_views_ply n_views_rast n_downloads_ply n_downloads_rast juvenile_organisms juvenile_organisms sampleid speciescode count keywords keywords id keyword krill_abundances krill_abundances cruise ship region station line maxdepth latitude longitude towbegin towend daytow phase stage abundance_m2 spccode larvae_counts larvae_counts netid spccode tally larvae_counts:netid->nets:netid larvae_species larvae_species netid spccode scientific_name common_name larvaecount itis_tsn larvae_species_uuids larvae_species_uuids netid sppcode scientific_name common_name larvaecount itis_tsn larvae_uuids larvae_uuids netid sppcode tally mature_organisms mature_organisms sampleid speciescode count methods methods id title description instrumentation1 instrumentation2 instrumentation3 net_uuids net_uuids netid towid netside shf volsampled propsorted smallplankton totalplankton net2cruise net2cruise netid cruise_id cruise_ymd ship line stationid station latitude longitude orderocc gebco_depth netside townumber towtype volsampled shf propsorted starttime geom net2ctdcast net2ctdcast netid cast_count cruiseid ncruise ctdcruise nline nstation stline ststa acline acsta rptline rptsta ctdlat codlon nlat nlon bottomdepth gebco_depth date time nettime netcastdict netcastdict netid cast_count tows tows towid stationid towtype townumber starttime nets:towid->tows:towid persons persons id prefix first middleinit last organization email orcid type places places category key name geom r_mer100km r_mer100km rid rast r_sta_cnt r_sta_cnt rid band_names r_class r_proj4 rast raster_columns raster_columns r_table_catalog r_table_schema r_table_name r_raster_column srid scale_x scale_y blocksize_x blocksize_y same_alignment regular_blocking num_bands pixel_types nodata_values out_db extent spatial_index raster_overviews raster_overviews o_table_catalog o_table_schema o_table_name o_raster_column r_table_catalog r_table_schema r_table_name r_raster_column overview_factor samples samples sampleid location date scrippscast scrippscast cruise ship orderocc castcnt ships ships shipcode shipname shipnodc spatial_ref_sys spatial_ref_sys srid auth_name auth_srid srtext proj4text species_codes_new species_codes_new id spccode scientific_name itis_tsn common_name taxon_rank species_groups species_groups spp_group spccode station_uuids station_uuids stationid cruise_id orderocc latitude longitude line station gebco_depth geom stations stations stationid cruise_id orderocc latitude longitude line station gebco_depth geom stations:cruise_id->cruises:cruise_id stations_hull stations_hull geom stations_order stations_order ORDER OCC LINE STA LAT (DEC) LAT (DEG) LAT (MIN) LAT (DEG MIN) LON (DEC) LON (DEG) LON (MIN) LON (DEG MIN) EST DEPTH STA TYPE taxa taxa id taxa_key itis_tsn common_name scientific_name taxon_rank taxa_hierarchy taxa_hierarchy id parent tsn scientific_name taxon_rank lsid path tow_types tow_types code description tow_uuids tow_uuids towid stationid towtype townumber starttime tows:stationid->stations:stationid tsn_list tsn_list id tsn rank name users users email uunet2cruise uunet2cruise netid cruise_id cruise_ymd ship line stationid station latitude longitude orderocc gebco_depth netside townumber towtype volsampled shf propsorted starttime geom uunet2ctd uunet2ctd netid cast_count cruiseid ncruise ctdcruise nline nstation stline ststa acline acsta rptline rptsta ctdlat ctdlon nlat nlon bottomdepth gebco_depth date time nettime ctdtime z_idw z_idw rid args_hash args_json rast

3.3 Show existing unique keys across tables

Code
dm_get_all_uks(con_dm)
# A tibble: 14 × 3
   table           uk_col     kind 
   <chr>           <keys>     <chr>
 1 cruises         cruise_id  PK   
 2 ctd_bottles     btl_cnt    PK   
 3 ctd_casts       cast_count PK   
 4 grd_a           rid        PK   
 5 grd_gcs         rid        PK   
 6 grd_mer         rid        PK   
 7 nets            netid      PK   
 8 r_mer100km      rid        PK   
 9 r_sta_cnt       rid        PK   
10 spatial_ref_sys srid       PK   
11 stations        stationid  PK   
12 tows            towid      PK   
13 users           email      PK   
14 z_idw           rid        PK   

3.4 Identify candidate keys

Code
# candidate primary keys
dm_enum_pk_candidates(con_dm, "species_codes_new")
# A tibble: 6 × 3
  columns         candidate why                                                 
  <keys>          <lgl>     <chr>                                               
1 id              TRUE      ""                                                  
2 spccode         TRUE      ""                                                  
3 scientific_name TRUE      ""                                                  
4 itis_tsn        TRUE      ""                                                  
5 common_name     FALSE     "has duplicate values: NULL (366), Dreamers        …
6 taxon_rank      FALSE     "has 34 missing values, and duplicate values: Speci…
Code
# candidate foreign keys
try(
  dm_enum_fk_candidates(con_dm, "species_codes_new", "species_groups"))
Error in abort_ref_tbl_has_no_pk(ref_table_name) : 
  ref_table `species_groups` needs a primary key first. Use `dm_enum_pk_candidates()` to find appropriate columns and `dm_add_pk()` to define a primary key.

3.5 Run Shiny app

Code
# install.packages("DiagrammeRsvg")
dm_gui(dm = con_dm)

The dm::dm_gui() Shiny app after clicking on two tables. Haven’t tried editing relationships yet.