# Profile the columns of every file so that expected types can later be
# derived from the most common type across all files
d_csv <- d_csv |>
  mutate(
    # Names of the columns holding nothing but NA in each data frame
    col_empties = map(data, \(df) {
      is_all_na <- map_int(df, \(v) sum(is.na(v))) == nrow(df)
      names(df)[is_all_na]
    }),
    # First class of each column, leaving out the all-NA columns found above
    col_types = map2(data, col_empties, \(df, empties) {
      keep <- !names(df) %in% empties
      tibble(
        col_name = names(df)[keep],
        col_type = map_chr(df, \(v) class(v)[1])[keep]
      )
    })
  )
# Expected type per column: the class reported by the largest number of files
d_types <- d_csv |>
  select(path, col_types) |>
  unnest(col_types) |>
  # One row per (column, type) pair with the number of files reporting it
  count(col_name, col_type) |>
  group_by(col_name) |>
  # which.max() returns the first maximum, so a tie between types resolves to
  # the first row of the group, i.e. exactly one type is kept per column
  summarize(
    expected_type = col_type[which.max(n)],
    .groups = "drop"
  )
# Per-file columns whose type disagrees with the expected (most common) type
d_mismatches <- d_csv |>
  select(cruise_id, path, col_types) |>
  # One row per (file, column) with the type found in that file
  unnest(col_types) |>
  # Attach the expected type of each column
  left_join(d_types, by = "col_name") |>
  # Keep only the rows where the file's type differs from the expected one
  filter(!(col_type == expected_type)) |>
  arrange(col_name, path)

# Announce the upcoming conversions only when at least one mismatch was found
if (nrow(d_mismatches) != 0) {
  message("Type mismatches detected - converting columns...")
}
# Bind data, converting mismatched columns to expected type.
#
# For every file, each column listed for that file in d_mismatches is coerced
# to its expected type. The number of NAs introduced by each coercion is
# returned from the per-file step and then joined onto d_mismatches as
# `nas_generated`. (Previously it was written with `<<-` from inside the mapped
# function, a hidden side effect that only works while d_mismatches lives in an
# enclosing environment of the call.)
#
# Outputs: d_bind (all files row-bound) and d_mismatches$nas_generated.
# NOTE(review): converting numeric -> integer truncates fractional values
# without producing NAs, so such changes are not counted here - confirm that
# this cannot happen for the inputs in use.
d_csv_converted <- d_csv |>
  mutate(
    .conversion = map2(data, path, \(x, p) {
      # Columns that need type conversion for this file
      x_mismatches <- d_mismatches |>
        filter(path == p)
      nas_generated <- integer(nrow(x_mismatches))

      # seq_len() yields an empty loop when the file has no mismatches
      for (i in seq_len(nrow(x_mismatches))) {
        col <- x_mismatches$col_name[i]
        expected <- x_mismatches$expected_type[i]

        # Surface the case the switch() below leaves untouched instead of
        # silently reporting it as a clean conversion
        if (!expected %in% c("numeric", "integer", "logical", "character")) {
          message(glue(
            " {basename(p)}: {col} left as {x_mismatches$col_type[i]} (no conversion rule for expected type {expected})"
          ))
        }

        # Count NAs before conversion
        na_before <- sum(is.na(x[[col]]))

        # Convert to expected type. Coercion warnings are suppressed on
        # purpose: the NAs they announce are counted and reported below.
        suppressWarnings({
          x[[col]] <- switch(
            expected,
            "numeric" = as.numeric(x[[col]]),
            "integer" = as.integer(x[[col]]),
            "logical" = as.logical(x[[col]]),
            "character" = as.character(x[[col]]),
            x[[col]]
          )
        })

        # NAs introduced by the conversion itself
        na_generated <- sum(is.na(x[[col]])) - na_before
        if (na_generated > 0) {
          message(glue(
            " {basename(p)}: {col} ({x_mismatches$col_type[i]} → {expected}) generated {na_generated} NAs"
          ))
        }
        nas_generated[i] <- na_generated
      }

      # Return the converted data together with its NA log
      list(
        data = x,
        na_log = tibble(
          path = x_mismatches$path,
          col_name = x_mismatches$col_name,
          nas_generated = nas_generated
        )
      )
    })
  )

# One row per converted (file, column). The zero-row prototype keeps the
# columns defined even when no file needed conversion; distinct() guarantees
# the join below cannot multiply rows of d_mismatches.
d_mismatch_nas <- bind_rows(
  tibble(
    path = d_mismatches$path[0],
    col_name = d_mismatches$col_name[0],
    nas_generated = integer()
  ),
  map(d_csv_converted$.conversion, "na_log")
) |>
  distinct(path, col_name, .keep_all = TRUE)

# Store NA counts in d_mismatches for reporting; dropping a previous
# nas_generated column first keeps this step safe to re-run
d_mismatches <- d_mismatches |>
  select(-any_of("nas_generated")) |>
  left_join(d_mismatch_nas, by = c("path", "col_name"))

# Swap in the converted data frames and stack all files into one table
d_bind <- d_csv_converted |>
  mutate(data = map(.conversion, "data")) |>
  select(-.conversion) |>
  unnest(data)

# Remove the intermediates so only d_bind and d_mismatches are left behind
rm(d_csv_converted, d_mismatch_nas)
# Report on type mismatches with NA generation.
#
# Each table lives in its own top-level `if`. A braced block only returns
# (and auto-prints) the value of its last expression, so with both tables in a
# single block the by-column table was built and then silently discarded.
#
# dt() is defined outside this section - presumably a DT::datatable() wrapper
# taking a caption and an export file name; confirm against its definition.
# formatCurrency() with currency = "" and digits = 0 is used to show the count
# columns with a thousands separator and no decimals.

# Summary by column: one row per (column, expected type, found type)
if (nrow(d_mismatches) > 0) {
  d_mismatches |>
    group_by(col_name, expected_type, col_type) |>
    summarize(
      n_files = n(),
      total_nas = sum(nas_generated, na.rm = TRUE),
      files = paste(basename(path), collapse = "; "),
      .groups = "drop"
    ) |>
    # Columns that lost the most values to conversion come first
    arrange(desc(total_nas)) |>
    dt(
      caption = "Type mismatches by column",
      fname = "ctd_type_mismatches_by_column"
    ) |>
    formatCurrency(
      c("n_files", "total_nas"),
      currency = "",
      digits = 0,
      mark = ","
    )
}

# Summary by file: one row per file listing every converted column
if (nrow(d_mismatches) > 0) {
  d_mismatches |>
    mutate(
      path_csv = basename(path),
      # HTML fragment per column; rendered as markup because escape = FALSE
      # is passed to dt() below
      col_expr = glue(
        "{col_name}: {col_type} → {expected_type} ({nas_generated} NAs)<br>"
      )
    ) |>
    arrange(path_csv, col_name) |>
    group_by(path_csv) |>
    summarize(
      n_columns = n(),
      total_nas = sum(nas_generated, na.rm = TRUE),
      columns = paste(col_expr, collapse = "\n"),
      .groups = "drop"
    ) |>
    # Files that lost the most values come first, ties in file-name order
    arrange(desc(total_nas), path_csv) |>
    dt(
      caption = "Type mismatches by file",
      fname = "ctd_type_mismatches_by_file",
      escape = FALSE
    ) |>
    formatCurrency(
      c("n_columns", "total_nas"),
      currency = "",
      digits = 0,
      mark = ","
    )
}