Clean-up Database

Goal: Systematically evaluate and clean-up the database, including renaming tables and columns, and documenting relationships, per naming conventions being established in CalCOFI/docs: Database.

Code

source(here::here("../apps_dev/libs/db.R")) # con: database connection 

librarian::shelf(
  calcofi/calcofi4r,
  dm, DT, glue, purrr, stringr, tidyr)

1 Transitionary Tables

Let’s evaluate tables that are in transition, i.e. containing in the table name: “old”, “new”, “backup”, “temp”, “temp”, or “test”.

Code

d_tbls <- tibble(
  table = dbListTables(con)) |> 
  mutate(
    subset = case_when(
      str_detect(table, "old")      ~ "old",
      str_detect(table, "new")      ~ "new",
      str_detect(table, "backup")   ~ "backup",
      str_detect(table, "temp|tmp") ~ "temp",
      str_detect(table, "test")     ~ "test",
      TRUE ~ "base")) |> 
  arrange(subset, table)
table(d_tbls$subset)


backup   base    new    old   temp   test 
     3     68      5      4      3      1

Code

d_tbls |> 
  filter(subset != "base")

# A tibble: 16 × 2
   table                subset
   <chr>                <chr> 
 1 egg_counts_backup    backup
 2 larvae_counts_backup backup
 3 nets_backup          backup
 4 newnet2ctdcast       new   
 5 scrippscast_new      new   
 6 species_codes_new    new   
 7 species_new          new   
 8 tow_types_new        new   
 9 egg_counts_old       old   
10 larvae_counts_old    old   
11 stations_old         old   
12 tows_old             old   
13 net_uuids_tmp        temp  
14 tempnet2ctd          temp  
15 tmp_out              temp  
16 test_rast            test

Code

tbls_other <- d_tbls |> 
  filter(subset != "base") |> 
  pull(table)
cc_db_catalog(tbls_other)

TODO: Evaluate these tables and decide what to do with them, i.e. rename or drop.

2 Base Tables

Code

tbls_base <- d_tbls |> 
  filter(subset == "base") |> 
  pull(table) %>% 
  c(., "species_codes_new") |> 
  sort()
tibble(
  table = tbls_base)

# A tibble: 69 × 1
   table              
   <chr>              
 1 aoi_fed_sanctuaries
 2 core_stations      
 3 cruises            
 4 cruises_uuids      
 5 ctd_bottles        
 6 ctd_cast_bottles   
 7 ctd_casts          
 8 ctd_dic            
 9 ctd_profiles       
10 dataset_keywords   
# ℹ 59 more rows

Code

cc_db_catalog(tables = tbls_base)

2.1 taxonomic

Let’s evaluate tables that are taxonomic in nature, including “species”, “taxa”.

Code

d_tbls <- tibble(
  table = dbListTables(con)) |> 
  mutate(
    subset = case_when(
      str_detect(table, "species") ~ "species",
      str_detect(table, "taxa")    ~ "taxa")) |>
  filter(
    !is.na(subset)) |> 
  arrange(subset, table) |> 
  relocate(subset)
datatable(d_tbls)

Code

cc_db_catalog(d_tbls$table)

2.2 *_uuids

Let’s look at redundancy of tables labeled with “uuids”.

Code

tbls_uuids <- str_subset(dbListTables(con), "uuids")
tbls_pfx <- str_replace(tbls_uuids, "(.*)_uuids.*", "\\1")

d_tbls <- tibble(
  table = dbListTables(con)) |> 
  mutate(
    subset = case_when(
      table %in% tbls_uuids ~ "uuids",
      table %in% tbls_pfx   ~ "prefix")) |>
  filter(
    !is.na(subset)) |> 
  arrange(table, subset)
datatable(d_tbls)

Code

cc_db_catalog(d_tbls$table)

2.3 Proposed renames

Generalize from “species” to “taxa” (or “taxon” singular), since not all species_codes are at taxonomic level of species. OR operate from understanding that species could refer to other taxon ranks.
Follow conventions at calcofi.io/docs: Database.
See CalCOFI/workflows: modify_db.R for previous SQL commands, including renames, and dates.
TODO: Consider other changes and how to document. For instance, perhaps track changes in the pg_dump of only data definition language (DDL) for the whole database (i.e., CREATE TABLE SQL statements), and see git differences over time.
```
pg_dump -U user_name -h host database -s -t table_or_view_names -f table_or_view_names.sql
```

Code

# TODO: species_codes -> taxa
q("ALTER TABLE species_codes RENAME TO taxa")
# TODO: species.spccode -> sp_key
q("ALTER TABLE taxa RENAME COLUMN spccode TO taxa_key")

3 Relationships (`dm` R package)

3.1 Relationships, `"keys_only"`

Code

con_dm <- dm_from_con(
  con, 
  table_names = tbls_base,
  learn_keys  = T)
con_dm

── Table source ────────────────────────────────────────────────────────────────
src:  postgres  [admin@localhost:5432/gis]
── Metadata ────────────────────────────────────────────────────────────────────
Tables: `aoi_fed_sanctuaries`, `core_stations`, `cruises`, `cruises_uuids`, `ctd_bottles`, … (69 total)
Columns: 642
Primary keys: 14
Foreign keys: 5

Code

dm_draw(con_dm, view_type = "keys_only")

3.2 Relationships, `"all"`

This is too much.

Code

# TODO: make SVG zoomable
dm_draw(con_dm, view_type = "all")

3.3 Show existing unique keys across tables

Code

dm_get_all_uks(con_dm)

# A tibble: 14 × 3
   table           uk_col     kind 
   <chr>           <keys>     <chr>
 1 cruises         cruise_id  PK   
 2 ctd_bottles     btl_cnt    PK   
 3 ctd_casts       cast_count PK   
 4 grd_a           rid        PK   
 5 grd_gcs         rid        PK   
 6 grd_mer         rid        PK   
 7 nets            netid      PK   
 8 r_mer100km      rid        PK   
 9 r_sta_cnt       rid        PK   
10 spatial_ref_sys srid       PK   
11 stations        stationid  PK   
12 tows            towid      PK   
13 users           email      PK   
14 z_idw           rid        PK

3.4 Identify candidate keys

Code

# candidate primary keys
dm_enum_pk_candidates(con_dm, "species_codes_new")

# A tibble: 6 × 3
  columns         candidate why                                                 
  <keys>          <lgl>     <chr>                                               
1 id              TRUE      ""                                                  
2 spccode         TRUE      ""                                                  
3 scientific_name TRUE      ""                                                  
4 itis_tsn        TRUE      ""                                                  
5 common_name     FALSE     "has duplicate values: NULL (366), Dreamers        …
6 taxon_rank      FALSE     "has 34 missing values, and duplicate values: Speci…

Code

# candidate foreign keys
try(
  dm_enum_fk_candidates(con_dm, "species_codes_new", "species_groups"))

Error in abort_ref_tbl_has_no_pk(ref_table_name) : 
  ref_table `species_groups` needs a primary key first. Use `dm_enum_pk_candidates()` to find appropriate columns and `dm_add_pk()` to define a primary key.

3.5 Run Shiny app

Code

# install.packages("DiagrammeRsvg")
dm_gui(dm = con_dm)

The `dm::dm_gui()` Shiny app after clicking on two tables. Haven’t tried editing relationships yet.

--- title: "Clean-up Database" editor_options: chunk_output_type: console --- **Goal**: Systematically evaluate and clean-up the database, including renaming tables and columns, and documenting relationships, per naming conventions being established in [CalCOFI/docs: Database]( https://calcofi.io/docs/db.html). ```{r} #| label: "db con" source(here::here("../apps_dev/libs/db.R")) # con: database connection librarian::shelf( calcofi/calcofi4r, dm, DT, glue, purrr, stringr, tidyr) ``` ## Transitionary Tables Let's evaluate tables that are in transition, i.e. containing in the table name: "old", "new", "backup", "temp", "temp", or "test". ```{r} d_tbls <- tibble( table = dbListTables(con)) |> mutate( subset = case_when( str_detect(table, "old") ~ "old", str_detect(table, "new") ~ "new", str_detect(table, "backup") ~ "backup", str_detect(table, "temp|tmp") ~ "temp", str_detect(table, "test") ~ "test", TRUE ~ "base")) |> arrange(subset, table) table(d_tbls$subset) d_tbls |> filter(subset != "base") tbls_other <- d_tbls |> filter(subset != "base") |> pull(table) cc_db_catalog(tbls_other) ``` - `TODO:` Evaluate these tables and decide what to do with them, i.e. rename or drop. ## Base Tables ```{r} tbls_base <- d_tbls |> filter(subset == "base") |> pull(table) %>% c(., "species_codes_new") |> sort() tibble( table = tbls_base) cc_db_catalog(tables = tbls_base) ``` ### taxonomic Let's evaluate tables that are taxonomic in nature, including "species", "taxa". ```{r} #| label: tbls_taxonomic d_tbls <- tibble( table = dbListTables(con)) |> mutate( subset = case_when( str_detect(table, "species") ~ "species", str_detect(table, "taxa") ~ "taxa")) |> filter( !is.na(subset)) |> arrange(subset, table) |> relocate(subset) datatable(d_tbls) cc_db_catalog(d_tbls$table) ``` ### \*\_uuids Let's look at redundancy of tables labeled with "uuids". ```{r} #| label: tbls_uuid tbls_uuids <- str_subset(dbListTables(con), "uuids") tbls_pfx <- str_replace(tbls_uuids, "(.*)_uuids.*", "\\1") d_tbls <- tibble( table = dbListTables(con)) |> mutate( subset = case_when( table %in% tbls_uuids ~ "uuids", table %in% tbls_pfx ~ "prefix")) |> filter( !is.na(subset)) |> arrange(table, subset) datatable(d_tbls) cc_db_catalog(d_tbls$table) ``` ### Proposed renames - Generalize from "species" to "taxa" (or "taxon" singular), since not all species_codes are at taxonomic level of species. OR operate from understanding that species could refer to other taxon ranks. - Follow conventions at [calcofi.io/docs: Database](https://calcofi.io/docs/db.html). - See [CalCOFI/workflows: `modify_db.R`](https://github.com/CalCOFI/workflows/blob/main/modify_db.R) for previous SQL commands, including renames, and dates. - `TODO:` Consider other changes and how to document. For instance, perhaps track changes in the `pg_dump` of only data definition language (DDL) for the whole database (i.e., `CREATE TABLE` SQL statements), and see `git diff`erences over time. ```bash pg_dump -U user_name -h host database -s -t table_or_view_names -f table_or_view_names.sql ``` ```{r} #| label: "TODO renames" #| eval: false # TODO: species_codes -> taxa q("ALTER TABLE species_codes RENAME TO taxa") # TODO: species.spccode -> sp_key q("ALTER TABLE taxa RENAME COLUMN spccode TO taxa_key") ``` ## Relationships (`dm` R package) - [document database: entity-relationship diagram, comments, etc. · Issue #2 · CalCOFI/docs](https://github.com/CalCOFI/docs/issues/2) - [Relational Data Models • dm](https://dm.cynkra.com/) - [Cheat sheet • dm](https://dm.cynkra.com/articles/cheatsheet.html) - [Introduction to relational data models • dm](https://krlmlr.github.io/dm/articles/howto-dm-theory.html) ### Relationships, `"keys_only"` ```{r} con_dm <- dm_from_con( con, table_names = tbls_base, learn_keys = T) con_dm dm_draw(con_dm, view_type = "keys_only") ``` ### Relationships, `"all"` This is too much. ```{r} # TODO: make SVG zoomable dm_draw(con_dm, view_type = "all") ``` ### Show existing unique keys across tables ```{r} dm_get_all_uks(con_dm) ``` ### Identify candidate keys ```{r} # candidate primary keys dm_enum_pk_candidates(con_dm, "species_codes_new") # candidate foreign keys try( dm_enum_fk_candidates(con_dm, "species_codes_new", "species_groups")) ``` ### Run Shiny app ```{r} #| eval: false # install.packages("DiagrammeRsvg") dm_gui(dm = con_dm) ``` ![The `dm::dm_gui()` Shiny app after clicking on two tables. Haven't tried editing relationships yet.](figures/update-db_dm-shiny-app.png) ```{r} #| label: "TODO relationships" #| eval: false #| echo: false # Add primary keys: dm_add_pk(con_dm, table = "species_codes", columns = "spccode") ``` ```{r} #| label: "alternative reactable()" #| eval: false #| echo: false htmltools::browsable( tagList( div( div(tags$label("Filter Type", `for` = "tbl-filter")), tags$select( id = "tbl-filter", onchange = "Reactable.setFilter('dbcatalog', 'table_type', this.value)", tags$option("All", value = ""), lapply(unique(d_t$table_type), tags$option) ) ), tags$hr("aria-hidden" = "true"), reactable( d_t, elementId = "dbcatalog", # groupBy = c("table_type"), searchable = T, columns = list( table = colDef( html = TRUE, # Show table description under table name cell = function(value, index) { descrip <- d_t$table_description[index] # browser() ifelse(!is.na(descrip), div( div(style = "font-weight: 600", value), div(style = "font-size: 0.75rem", HTML(descrip))) |> as.character(), div(style = "font-weight: 600", value) |> as.character() ) } ), table_description = colDef(show = FALSE), table_type = colDef(show = FALSE) ), details = function(index) { d_c_t <- d_c[d_c$table == d_t$table[index], ] htmltools::div( style = "padding: 1rem", reactable( d_c_t, columns = list( table = colDef(show = FALSE), column_description = colDef(show = FALSE), column = colDef( html = TRUE, cell = function(value, index) { descrip <- d_c_t$column_description[index] # browser() ifelse( !is.na(descrip), div( div(style = "font-weight: 600", value), div(style = "font-size: 0.75rem", HTML(descrip))) |> as.character(), div(style = "font-weight: 600", value) |> as.character() ) } ) ), outlined = TRUE ) ) } ) ) ) ```

1 Transitionary Tables

2 Base Tables

2.1 taxonomic

2.2 *_uuids

2.3 Proposed renames

3 Relationships (dm R package)

3.1 Relationships, "keys_only"

3.2 Relationships, "all"

3.3 Show existing unique keys across tables

3.4 Identify candidate keys

3.5 Run Shiny app

3 Relationships (`dm` R package)

3.1 Relationships, `"keys_only"`

3.2 Relationships, `"all"`