Clinical data from Electronic Health Records (EHR) often contains
inconsistencies, missing values, outliers, and formatting issues that
must be addressed before analysis. This vignette demonstrates a
structured cleaning workflow using clinpubr:
Why this order? Format cleaning and unit standardization must precede outlier detection — mixed units (e.g., mg/dL vs mmol/L) can cause false outlier flags.
Let’s create a sample laboratory dataset mimicking real-world EHR data with common quality issues. This dataset will be used throughout the vignette.
set.seed(123)
n <- 100
# Patient IDs
patient_ids <- paste0("P", sprintf("%03d", 1:n))
# Glucose: mix of mg/dL and mmol/L (1 mmol/L = 18 mg/dL)
# Also include some outliers and missing values
glucose_vals <- c(
rnorm(35, mean = 100, sd = 15), # mg/dL normal range
rnorm(30, mean = 5.5, sd = 0.8), # mmol/L normal range
rnorm(10, mean = 200, sd = 20), # mg/dL high (outliers)
rnorm(10, mean = 15, sd = 2), # mmol/L high (outliers)
rep(NA, 10), # missing values
999, 888, 0, -50, 500 # erroneous values
)
glucose_vals <- round(sample(glucose_vals, n), 1)
glucose_units <- sample(c("mg/dL", "mmol/L", NA), n, replace = TRUE, prob = c(0.5, 0.4, 0.1))
# Creatinine: mix of mg/dL and umol/L (1 mg/dL = 88.4 umol/L)
creatinine_vals <- c(
rnorm(35, mean = 1.0, sd = 0.2), # mg/dL normal range
rnorm(30, mean = 88, sd = 15), # umol/L normal range
rnorm(10, mean = 3.0, sd = 0.5), # mg/dL high (outliers)
rnorm(10, mean = 265, sd = 44), # umol/L high (outliers)
rep(NA, 10), # missing values
999, 0, -10, 1000, 0.001 # erroneous values
)
creatinine_vals <- round(sample(creatinine_vals, n), 2)
creatinine_units <- sample(c("mg/dL", "umol/L", NA), n, replace = TRUE, prob = c(0.5, 0.4, 0.1))
# Cholesterol: mix of mg/dL and mmol/L (1 mmol/L = 38.67 mg/dL)
cholesterol_vals <- c(
rnorm(35, mean = 180, sd = 30), # mg/dL normal range
rnorm(30, mean = 4.5, sd = 0.8), # mmol/L normal range
rnorm(10, mean = 350, sd = 40), # mg/dL high (outliers)
rnorm(10, mean = 9, sd = 1), # mmol/L high (outliers)
rep(NA, 10), # missing values
888, 999, 50, 600, 0 # erroneous values
)
cholesterol_vals <- round(sample(cholesterol_vals, n), 2)
cholesterol_units <- sample(c("mg/dL", "mmol/L", NA), n, replace = TRUE, prob = c(0.5, 0.4, 0.1))
# Test dates with various formats and impossible dates
test_date_vals <- sample(c(
format(Sys.Date() - sample(1:365, 60, replace = TRUE), "%Y-%m-%d"), # ISO format
format(Sys.Date() - sample(1:365, 30, replace = TRUE), "%Y/%m/%d"), # Slash format
"1900-01-01", "2030-12-31", "N/A", "", "unknown", "pending", # Invalid dates
rep(NA, 10)
), n * 3, replace = TRUE)
# Create long-format lab data with messy values
lab_data <- data.frame(
patient_id = rep(patient_ids, 3),
test = rep(c("Glucose", "Creatinine", "Cholesterol"), each = n),
value = c(glucose_vals, creatinine_vals, cholesterol_vals),
unit = c(glucose_units, creatinine_units, cholesterol_units),
test_date = test_date_vals
)
# Add messy string values that need cleaning
# Glucose with text annotations and European decimal commas
messy_glucose_idx <- sample(which(lab_data$test == "Glucose"), 15)
lab_data$value[messy_glucose_idx] <- sample(c(
"<40", ">500", "5.2", "6.8", "120 mg/dL", "89 mmol/L",
"5.2 (fasting)", "180 (post-meal)", "N/A", "pending",
"6..5", "7..2", "8.5.1", "normal range", "see comment"
), length(messy_glucose_idx), replace = TRUE)
# Creatinine with text annotations
messy_creatinine_idx <- sample(which(lab_data$test == "Creatinine"), 12)
lab_data$value[messy_creatinine_idx] <- sample(c(
"<0.5", ">5.0", "1.1", "0.9", "1.2 mg/dL", "110 umol/L",
"1.5 (dialysis)", "2.8 (critical)", "N/A", " hemolyzed",
"1.3 ", " 1.4", "(1.2)", "1.0*", "see note"
), length(messy_creatinine_idx), replace = TRUE)
# Cholesterol with text annotations
messy_cholesterol_idx <- sample(which(lab_data$test == "Cholesterol"), 10)
lab_data$value[messy_cholesterol_idx] <- sample(c(
"<100", ">400", "4.5", "5.2", "200 mg/dL", "5.5 mmol/L",
"180 (fasting)", "250 (borderline)", "N/A", "lipemic"
), length(messy_cholesterol_idx), replace = TRUE)
knitr::kable(head(lab_data[sample(nrow(lab_data)), ], 10),
caption = "Original Messy Lab Data (10 random rows)"
)| patient_id | test | value | unit | test_date | |
|---|---|---|---|---|---|
| 267 | P067 | Cholesterol | 4.56 | mg/dL | 2026-04-29 |
| 81 | P081 | Glucose | 5.4 | mg/dL | 2025-10-25 |
| 260 | P060 | Cholesterol | 227.31 | mmol/L | 2026/03/08 |
| 76 | P076 | Glucose | 218.4 | mg/dL | 2025-10-29 |
| 184 | P084 | Creatinine | 263.9 | mg/dL | 2026-02-13 |
| 14 | P014 | Glucose | 190.2 | mg/dL | 2025/12/01 |
| 51 | P051 | Glucose | NA | mg/dL | 2025/07/15 |
| 155 | P055 | Creatinine | 1.09 | umol/L | 2026-04-20 |
| 75 | P075 | Glucose | 5.7 | NA | 2025-11-14 |
| 236 | P036 | Cholesterol | 8.39 | mg/dL | 2025/12/30 |
Data used: lab_data (the messy lab data
created above)
Use data_overview() to get a comprehensive diagnostic
report of data quality issues:
overview <- data_overview(lab_data)
#> === Data Overview Summary ===
#> Dataset: 300 rows, 5 columns
#>
#> Variable Types:
#> character : 5 variables
#>
#> Found 4 potential quality issues:
#> numeric_as_character : 1 cases
#> missing_values : 3 cases
#>
#> Recommendations:
#> - Consider converting these character variables to numeric: value
#> - Variables with < 50 % missing values: unit, value, test_date - consider imputation
print(overview$variable_types)
#> $character
#> [1] "patient_id" "test" "value" "unit" "test_date"
print(overview$summary_stats)
#> $character
#> variable n missing missing_pct unique top_value top_freq
#> patient_id patient_id 300 0 0.00 100 P001 3
#> test test 300 0 0.00 3 Cholesterol 100
#> value value 275 25 8.33 236 0 4
#> unit unit 268 32 10.67 3 mg/dL 159
#> test_date test_date 279 21 7.00 86 2025-08-31 9
#> top_pct
#> patient_id 1.00
#> test 33.33
#> value 1.45
#> unit 59.33
#> test_date 3.23
print(overview$quality_issues$missing_values)
#> unit value test_date
#> 10.67 8.33 7.00Check for unit conflicts across different test types:
knitr::kable(unit_view(lab_data, subject_col = "test", value_col = "value", unit_col = "unit"),
caption = "Unit Conflicts by Test Type"
)| subject | unit | label | count | nvalid | mean | sd | median | q_0.025 | q_0.975 |
|---|---|---|---|---|---|---|---|---|---|
| Cholesterol | mg/dL | NA | 50 | 41 | 119.34902 | 175.46788 | 9.70 | 3.70000 | 342.5300 |
| Cholesterol | mmol/L | NA | 40 | 33 | 185.58394 | 197.11032 | 183.62 | 3.73600 | 657.6000 |
| Cholesterol | NA | NA | 10 | 8 | 102.32375 | 84.31264 | 141.32 | 0.56000 | 189.1785 |
| Creatinine | mg/dL | NA | 54 | 46 | 94.63500 | 212.69011 | 2.87 | 0.70000 | 915.3400 |
| Creatinine | umol/L | NA | 37 | 32 | 57.27156 | 82.24823 | 2.72 | -2.25000 | 292.5532 |
| Creatinine | NA | NA | 9 | 8 | 74.75500 | 109.80029 | 41.49 | 0.14175 | 283.1100 |
| Glucose | mg/dL | NA | 55 | 41 | 88.75610 | 159.26087 | 16.30 | 0.00000 | 500.0000 |
| Glucose | mmol/L | NA | 32 | 26 | 71.28846 | 63.38110 | 84.30 | 4.56250 | 202.9750 |
| Glucose | NA | NA | 13 | 9 | 178.11111 | 314.49209 | 82.90 | 5.74000 | 841.0000 |
Data used: lab_data
Format cleaning is essential before any numerical analysis. Lab data often contains mixed formats, text annotations, and inconsistent date representations.
First, identify which entries cannot be converted to numeric values:
nonnum_df <- df_view_nonnum(lab_data)
knitr::kable(head(nonnum_df, 15), caption = "Non-numeric Entries by Variable")| patient_id | test | test_date | unit | value |
|---|---|---|---|---|
| P001 | Glucose | 2026/05/21 | mg/dL | N/A |
| P002 | Creatinine | pending | mmol/L | 7..2 |
| P003 | Cholesterol | 2025/07/05 | umol/L | 180 (post-meal) |
| P004 | NA | 2025-12-06 | NA | >500 |
| P005 | NA | 1900-01-01 | NA | <40 |
| P006 | NA | 2026-04-19 | NA | 8.5.1 |
| P007 | NA | 2026/04/10 | NA | 6..5 |
| P008 | NA | 2025/12/30 | NA | see comment |
| P009 | NA | 2026-04-29 | NA | normal range |
| P010 | NA | 2026-04-20 | NA | 1.2 mg/dL |
| P011 | NA | 2025-08-31 | NA | 110 umol/L |
| P012 | NA | 2025/12/01 | NA | >5.0 |
| P013 | NA | 2026/01/25 | NA | (1.2) |
| P014 | NA | 2025-12-23 | NA | >400 |
| P015 | NA | 2025/10/19 | NA | 250 (borderline) |
Use extract_num() to clean messy numeric strings. This
function handles: - Text annotations (e.g., “120 mg/dL” → 120) - Range
indicators (e.g., “<40”, “>500”) - Extra decimal points (e.g.,
“5..2” → 5.2) - Extra whitespace and punctuation
# Create a copy for cleaning
lab_data_cleaned <- lab_data
# Extract numeric values from the messy value column
lab_data_cleaned$value_numeric <- extract_num(lab_data$value)
# Show before/after comparison
comparison <- data.frame(
test = lab_data$test,
original = lab_data$value,
cleaned = lab_data_cleaned$value_numeric,
unit = lab_data$unit
) %>%
dplyr::filter(original != cleaned | is.na(cleaned)) %>%
head(15)
knitr::kable(comparison, caption = "Value Cleaning: Before vs After")| test | original | cleaned | unit |
|---|---|---|---|
| Glucose | N/A | NA | mg/dL |
| Glucose | 7..2 | 7.2 | mg/dL |
| Glucose | NA | NA | mmol/L |
| Glucose | NA | NA | mg/dL |
| Glucose | NA | NA | NA |
| Glucose | N/A | NA | mg/dL |
| Glucose | NA | NA | mg/dL |
| Glucose | 180 (post-meal) | 180.0 | mmol/L |
| Glucose | NA | NA | mmol/L |
| Glucose | >500 | 500.0 | mmol/L |
| Glucose | NA | NA | mg/dL |
| Glucose | <40 | 40.0 | mg/dL |
| Glucose | 8.5.1 | 8.5 | mg/dL |
| Glucose | 6..5 | 6.5 | mg/dL |
| Glucose | <40 | 40.0 | NA |
Lab data often contains test dates in various formats. Use
to_date() to standardize them:
# Convert various date formats to standard Date objects
lab_data_cleaned$test_date_clean <- to_date(lab_data$test_date)
# Show date conversion results
date_comparison <- data.frame(
original = lab_data$test_date,
cleaned = lab_data_cleaned$test_date_clean
) %>%
dplyr::filter(!is.na(original)) %>%
head(15)
knitr::kable(date_comparison, caption = "Date Conversion: Before vs After")| original | cleaned |
|---|---|
| 2026/05/21 | 2026-05-21 |
| pending | NA |
| 2025/07/05 | 2025-07-05 |
| 2025-12-06 | 2025-12-06 |
| 1900-01-01 | 1900-01-01 |
| 2026-04-19 | 2026-04-19 |
| 2026/04/10 | 2026-04-10 |
| 2025/12/30 | 2025-12-30 |
| 2026-04-29 | 2026-04-29 |
| 2025-12-06 | 2025-12-06 |
| 2026-04-20 | 2026-04-20 |
| 2025-08-31 | 2025-08-31 |
| 2025/12/01 | 2025-12-01 |
| 2025/12/01 | 2025-12-01 |
| 2026/01/25 | 2026-01-25 |
Check for invalid dates that couldn’t be parsed:
invalid_dates <- lab_data_cleaned %>%
dplyr::filter(!is.na(test_date) & is.na(test_date_clean)) %>%
dplyr::select(patient_id, test, test_date) %>%
head(10)
knitr::kable(invalid_dates, caption = "Invalid Date Entries (Could Not Be Parsed)")| patient_id | test | test_date |
|---|---|---|
| P002 | Glucose | pending |
| P068 | Glucose | unknown |
| P079 | Glucose | unknown |
| P088 | Glucose | pending |
| P018 | Creatinine | unknown |
| P058 | Creatinine | N/A |
| P059 | Creatinine | N/A |
| P065 | Creatinine | |
| P073 | Creatinine | unknown |
| P080 | Creatinine | N/A |
Data used: lab_data
Unit standardization must be performed before outlier detection. This is critical when merging data from different laboratories. We will standardize all values to conventional units:
change_rules <- list(
list(subject = "Glucose", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 18),
list(subject = "Creatinine", target_unit = "mg/dL", units2change = "umol/L", coeffs = 1 / 88.4),
list(subject = "Cholesterol", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 38.67)
)
lab_data_std <- unit_standardize(
lab_data,
subject_col = "test", value_col = "value",
unit_col = "unit", change_rules = change_rules
)
knitr::kable(head(lab_data_std, 15), caption = "Lab Data After Unit Standardization")| patient_id | test | value | unit | test_date |
|---|---|---|---|---|
| P001 | Glucose | NA | mg/dL | 2026-05-21 |
| P002 | Glucose | 2282.4 | mg/dL | NA |
| P003 | Glucose | 7.2 | mg/dL | 2025-07-05 |
| P004 | Glucose | 95.6 | mg/dL | 2025-12-06 |
| P005 | Glucose | 1512.0 | mg/dL | 1900-01-01 |
| P006 | Glucose | 95.4 | mg/dL | 2026-04-19 |
| P007 | Glucose | 186.2 | mg/dL | 2026-04-10 |
| P008 | Glucose | 6.1 | mg/dL | 2025-12-30 |
| P009 | Glucose | 257.4 | mg/dL | 2026-04-29 |
| P010 | Glucose | NA | mg/dL | 2025-12-06 |
| P011 | Glucose | 5.3 | mg/dL | 2026-04-20 |
| P012 | Glucose | 6.6 | mg/dL | 2025-08-31 |
| P013 | Glucose | 1614.6 | mg/dL | 2025-12-01 |
| P014 | Glucose | 190.2 | mg/dL | 2025-12-01 |
| P015 | Glucose | 5.1 | mg/dL | 2026-01-25 |
Verify that units are now standardized:
knitr::kable(unit_view(lab_data_std, subject_col = "test", value_col = "value", unit_col = "unit"),
caption = "Units After Standardization"
)| subject | unit | label | count | nvalid | mean | sd | median | q_0.025 | q_0.975 |
|---|
Data used: lab_data_std
(unit-standardized lab data)
Important: Outlier detection is performed after unit standardization to ensure consistent scales across all measurements.
The package provides three methods, each with different strengths:
Detect outliers for each test type using the IQR method:
# Split data by test type for outlier detection
lab_data_clean <- lab_data_std
# Detect outliers for each test
for (test_name in c("Glucose", "Creatinine", "Cholesterol")) {
test_data <- lab_data_std$value[lab_data_std$test == test_name]
outlier_res <- detect_outliers(test_data, method = "iqr")
cat("\n", test_name, ":\n")
cat(" Total values:", length(test_data), "\n")
cat(" Outliers detected:", sum(outlier_res$outlier_mask, na.rm = TRUE), "\n")
cat(" Missing values:", sum(is.na(test_data)), "\n")
}
#>
#> Glucose :
#> Total values: 100
#> Outliers detected: 19
#> Missing values: 14
#>
#> Creatinine :
#> Total values: 100
#> Outliers detected: 23
#> Missing values: 8
#>
#> Cholesterol :
#> Total values: 100
#> Outliers detected: 22
#> Missing values: 10Compare outlier detection methods for Glucose:
glucose_values <- lab_data_std$value[lab_data_std$test == "Glucose"]
knitr::kable(data.frame(
Method = c("MAD", "IQR", "Z-score"),
Outlier_Count = c(
sum(mad_outlier(glucose_values, threshold = 3), na.rm = TRUE),
sum(iqr_outlier(glucose_values, threshold = 1.5), na.rm = TRUE),
sum(zscore_outlier(glucose_values, threshold = 3), na.rm = TRUE)
)
), caption = "Outlier Detection Results by Method (Glucose)")| Method | Outlier_Count |
|---|---|
| MAD | 19 |
| IQR | 19 |
| Z-score | 1 |
For this demonstration, we will set outliers to NA. In practice, outliers should be manually verified — they may represent true extreme values (e.g., severe hyperglycemia) rather than data errors.
# Create a copy for outlier handling
lab_data_final <- lab_data_std
# Set outliers to NA for each test type
for (test_name in c("Glucose", "Creatinine", "Cholesterol")) {
test_idx <- lab_data_std$test == test_name
test_values <- lab_data_std$value[test_idx]
outlier_res <- detect_outliers(test_values, method = "iqr")
# Set outliers to NA
lab_data_final$value[test_idx][outlier_res$outlier_mask] <- NA
}
# Compare before and after outlier handling
comparison_df <- data.frame(
Test = rep(c("Glucose", "Creatinine", "Cholesterol"), each = 2),
Stage = rep(c("Before", "After"), 3),
N = c(
sum(!is.na(lab_data_std$value[lab_data_std$test == "Glucose"])),
sum(!is.na(lab_data_final$value[lab_data_final$test == "Glucose"])),
sum(!is.na(lab_data_std$value[lab_data_std$test == "Creatinine"])),
sum(!is.na(lab_data_final$value[lab_data_final$test == "Creatinine"])),
sum(!is.na(lab_data_std$value[lab_data_std$test == "Cholesterol"])),
sum(!is.na(lab_data_final$value[lab_data_final$test == "Cholesterol"]))
),
Mean = c(
mean(lab_data_std$value[lab_data_std$test == "Glucose"], na.rm = TRUE),
mean(lab_data_final$value[lab_data_final$test == "Glucose"], na.rm = TRUE),
mean(lab_data_std$value[lab_data_std$test == "Creatinine"], na.rm = TRUE),
mean(lab_data_final$value[lab_data_final$test == "Creatinine"], na.rm = TRUE),
mean(lab_data_std$value[lab_data_std$test == "Cholesterol"], na.rm = TRUE),
mean(lab_data_final$value[lab_data_final$test == "Cholesterol"], na.rm = TRUE)
)
)
knitr::kable(comparison_df, caption = "Before vs After Outlier Handling", digits = 2)| Test | Stage | N | Mean |
|---|---|---|---|
| Glucose | Before | 86 | 599.81 |
| Glucose | After | 67 | 82.01 |
| Creatinine | Before | 92 | 55.35 |
| Creatinine | After | 69 | 1.11 |
| Cholesterol | Before | 90 | 2818.59 |
| Cholesterol | After | 68 | 147.00 |
Clinical Note: Always verify outliers before removal. Some “outliers” may be clinically significant (e.g., glucose > 400 mg/dL in diabetic ketoacidosis). Consider creating an outlier indicator variable for sensitivity analyses rather than deleting values.
Data used: lab_data_final (after
standardization and outlier handling)
For missing data filtering by row/column missing rate, use
get_valid_subset() to obtain a complete-enough subset for
analysis. First, let’s convert to wide format to create a longitudinal
cohort dataset where each row represents a patient-test date
combination:
# Convert to wide format preserving all test dates
# This creates a longitudinal cohort dataset
lab_wide <- lab_data_final %>%
filter(!is.na(test_date)) %>% # Remove records without valid dates
select(patient_id, test_date, test, value) %>%
tidyr::pivot_wider(
names_from = test,
values_from = value,
names_sort = TRUE
)
# Sort by patient and date for longitudinal view
lab_wide <- lab_wide %>%
arrange(patient_id, test_date)
knitr::kable(head(lab_wide, 10), caption = "Longitudinal Lab Data (First 10 records)")| patient_id | test_date | Cholesterol | Creatinine | Glucose |
|---|---|---|---|---|
| P001 | 2025-11-01 | NA | 1.13 | NA |
| P001 | 2025-12-11 | 3.70 | NA | NA |
| P001 | 2026-05-21 | NA | NA | NA |
| P002 | 2025-10-21 | 9.70 | NA | NA |
| P002 | 2025-12-29 | NA | NA | NA |
| P003 | 2025-07-05 | NA | NA | 7.2 |
| P003 | 2026-02-26 | NA | NA | NA |
| P003 | 2026-04-29 | NA | NA | NA |
| P004 | 2025-11-19 | 329.49 | NA | NA |
| P004 | 2025-12-06 | NA | NA | 95.6 |
View the cohort structure - number of tests per patient:
cohort_summary <- lab_wide %>%
group_by(patient_id) %>%
summarise(
n_visits = n(),
first_date = min(test_date),
last_date = max(test_date)
) %>%
head(10)
knitr::kable(cohort_summary, caption = "Cohort Structure: Visits per Patient")| patient_id | n_visits | first_date | last_date |
|---|---|---|---|
| P001 | 3 | 2025-11-01 | 2026-05-21 |
| P002 | 2 | 2025-10-21 | 2025-12-29 |
| P003 | 3 | 2025-07-05 | 2026-04-29 |
| P004 | 3 | 2025-11-19 | 2026-04-10 |
| P005 | 3 | 1900-01-01 | 2026-02-19 |
| P006 | 2 | 2026-03-05 | 2026-04-19 |
| P007 | 2 | 2026-04-10 | 2030-12-31 |
| P008 | 3 | 2025-09-24 | 2026-02-10 |
| P009 | 3 | 2025-12-17 | 2026-04-29 |
| P010 | 3 | 2025-08-25 | 2025-12-06 |
Assess missing data patterns:
missing_summary <- data.frame(
Variable = names(lab_wide)[-1],
Missing_Count = sapply(lab_wide[-1], function(x) sum(is.na(x))),
Missing_Percent = round(sapply(lab_wide[-1], function(x) sum(is.na(x)) / length(x) * 100, 2))
)
#> Error in FUN(X[[i]], ...): 参数没有用(2)
knitr::kable(missing_summary, caption = "Missing Data Summary by Test")
#> Error: 找不到对象'missing_summary'Filter records with too much missing data:
lab_wide_clean <- get_valid_subset(
lab_wide,
row_na_ratio = 0.5, # Allow up to 50% missing per record
col_na_ratio = 0.3 # Allow up to 30% missing per test type
)
cat(
"Dimensions:", nrow(lab_wide), "records",
"->", nrow(lab_wide_clean), "records\n"
)
#> Dimensions: 259 records -> 259 records
cat(
"Unique patients:", length(unique(lab_wide$patient_id)),
"->", length(unique(lab_wide_clean$patient_id)), "\n"
)
#> Unique patients: 99 -> 99Data used: lab_wide_clean (cleaned
longitudinal cohort data)
Create categorical variables from continuous lab values using clinical cut-points. In longitudinal data, each visit can have different classifications:
# Glucose categories
lab_wide_clean$glucose_category <- cut_by(
lab_wide_clean$Glucose,
breaks = c(100, 126),
labels = c("Normal", "Prediabetes", "Diabetes"),
label_with_range = FALSE
)
#> Error in cut_by(lab_wide_clean$Glucose, breaks = c(100, 126), labels = c("Normal", : Some of `breaks` are not in the range of `x`!
# Creatinine categories (eGFR approximation)
lab_wide_clean$renal_function <- cut_by(
lab_wide_clean$Creatinine,
breaks = c(1.2, 2.0, 4.0),
labels = c("Normal", "Mild", "Moderate", "Severe"),
label_with_range = FALSE
)
#> Error in cut_by(lab_wide_clean$Creatinine, breaks = c(1.2, 2, 4), labels = c("Normal", : Some of `breaks` are not in the range of `x`!
# Cholesterol categories
lab_wide_clean$cholesterol_category <- cut_by(
lab_wide_clean$Cholesterol,
breaks = c(200, 240),
labels = c("Desirable", "Borderline", "High"),
label_with_range = FALSE
)
#> Error in cut_by(lab_wide_clean$Cholesterol, breaks = c(200, 240), labels = c("Desirable", : Some of `breaks` are not in the range of `x`!
# Summary of categories across all visits
cat_summary <- data.frame(
Category = c(
rep("Glucose", length(table(lab_wide_clean$glucose_category))),
rep("Renal Function", length(table(lab_wide_clean$renal_function))),
rep("Cholesterol", length(table(lab_wide_clean$cholesterol_category)))
),
Level = c(
names(table(lab_wide_clean$glucose_category)),
names(table(lab_wide_clean$renal_function)),
names(table(lab_wide_clean$cholesterol_category))
),
Count = c(
as.vector(table(lab_wide_clean$glucose_category)),
as.vector(table(lab_wide_clean$renal_function)),
as.vector(table(lab_wide_clean$cholesterol_category))
)
)
knitr::kable(cat_summary, caption = "Clinical Category Distributions (All Visits)")| Category | Count |
|---|
View a single patient’s trajectory over time:
# Select one patient with multiple visits for demonstration
patient_trajectory <- lab_wide_clean %>%
filter(patient_id == unique(patient_id)[1]) %>%
select(
patient_id, test_date, Glucose, Creatinine, Cholesterol,
glucose_category, renal_function
) %>%
arrange(test_date)
#> Error in `select()`:
#> ! Can't select columns that don't exist.
#> ✖ Column `Glucose` doesn't exist.
knitr::kable(patient_trajectory, caption = "Example: Single Patient's Lab Trajectory")
#> Error: 找不到对象'patient_trajectory'Here’s a complete workflow from raw lab data to analysis-ready data:
# Step 1: Data Overview
overview <- data_overview(lab_data)
#> === Data Overview Summary ===
#> Dataset: 300 rows, 5 columns
#>
#> Variable Types:
#> numeric : 1 variables
#> character : 3 variables
#> date : 1 variables
#>
#> Found 6 potential quality issues:
#> outliers : 1 cases
#> missing_values : 3 cases
#> negative_in_positive : 1 cases
#> suspicious_dates : 1 cases
#>
#> Recommendations:
#> - Review outliers in these numeric variables: value
#> - Variables with < 50 % missing values: test_date, value, unit - consider imputation
#> - Numeric variables with mostly positive values but containing negatives: value
#> - Review suspicious dates (year < 1910 or > current year) in: test_date
# Step 2: Format Cleaning - Extract numeric values and convert dates
clean <- lab_data
clean$value <- extract_num(lab_data$value)
clean$test_date <- to_date(lab_data$test_date)
#> Error: Input must be numeric or character vector, got: Date
# Step 3: Unit Standardization
change_rules <- list(
list(subject = "Glucose", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 18),
list(subject = "Creatinine", target_unit = "mg/dL", units2change = "umol/L", coeffs = 1 / 88.4),
list(subject = "Cholesterol", target_unit = "mg/dL", units2change = "mmol/L", coeffs = 38.67)
)
clean <- unit_standardize(
clean,
subject_col = "test", value_col = "value",
unit_col = "unit", change_rules = change_rules
)
# Step 4: Outlier Detection and Handling
for (test_name in c("Glucose", "Creatinine", "Cholesterol")) {
test_idx <- clean$test == test_name
test_values <- clean$value[test_idx]
outlier_res <- detect_outliers(test_values, method = "iqr")
clean$value[test_idx][outlier_res$outlier_mask] <- NA
}
# Step 5: Convert to wide format (longitudinal cohort data)
clean_wide <- clean %>%
filter(!is.na(test_date)) %>%
select(patient_id, test_date, test, value) %>%
tidyr::pivot_wider(names_from = test, values_from = value) %>%
arrange(patient_id, test_date)
# Step 6: Missing value filtering
clean_wide <- get_valid_subset(clean_wide, row_na_ratio = 0.5, col_na_ratio = 0.3)
# Step 7: Create clinical categories
clean_wide$glucose_category <- cut_by(
clean_wide$Glucose,
breaks = c(100, 126),
labels = c("Normal", "Prediabetes", "Diabetes")
)
#> Error in cut_by(clean_wide$Glucose, breaks = c(100, 126), labels = c("Normal", : Some of `breaks` are not in the range of `x`!
# Step 8: Final check
final_overview <- data_overview(clean_wide)
#> === Data Overview Summary ===
#> Dataset: 259 rows, 2 columns
#>
#> Variable Types:
#> character : 1 variables
#> date : 1 variables
#>
#> Found 1 potential quality issues:
#> suspicious_dates : 1 cases
#>
#> Recommendations:
#> - Review suspicious dates (year < 1910 or > current year) in: test_date
knitr::kable(final_overview$summary_stats, caption = "Final Data Quality Overview")
|
cat(
"\nOriginal records:", nrow(lab_data),
"| Final records:", nrow(clean_wide),
"| Removed:", nrow(lab_data) - nrow(clean_wide), "\n"
)
#>
#> Original records: 300 | Final records: 259 | Removed: 41
cat(
"Unique patients:", length(unique(lab_data$patient_id)),
"->", length(unique(clean_wide$patient_id)), "\n"
)
#> Unique patients: 100 -> 99| Step | Function | Purpose | Data Used |
|---|---|---|---|
| 1. Data Overview | data_overview() |
Comprehensive data quality assessment | Raw lab data |
| 2. Format Cleaning | extract_num(), to_date(),
df_view_nonnum() |
Clean messy strings, convert dates | Raw lab data |
| 3. Unit Standardization | unit_standardize() |
Standardize measurement units | Format-cleaned data |
| 4. Outlier Detection | detect_outliers(), iqr_outlier() |
Detect and handle outliers | Standardized data |
| 5. Missing Values | get_valid_subset() |
Filter by missing rate | Post-outlier data |
| 6. Data Transformation | cut_by() |
Create derived variables | Cleaned data |