From 5365b609bca677dc750ffa47d8eb8f8467703eb5 Mon Sep 17 00:00:00 2001 From: Andy Rae <1127507+AndyRae@users.noreply.github.com> Date: Mon, 13 Apr 2026 09:57:31 +0100 Subject: [PATCH 1/3] feat: Add exception columns --- nuh_helper/profile/__init__.py | 15 +++++- tests/test_profile.py | 94 ++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 tests/test_profile.py diff --git a/nuh_helper/profile/__init__.py b/nuh_helper/profile/__init__.py index 5e236d6..735000c 100644 --- a/nuh_helper/profile/__init__.py +++ b/nuh_helper/profile/__init__.py @@ -73,7 +73,17 @@ def generate_scan_report( csv_files: list[str], output_path: str = SCAN_REPORT_FILE_NAME, min_cell_count: int = 1, + excluded_columns: list[str] | None = None, ) -> str: + """Generate a WhiteRabbit-compatible Scan Report from CSV files. + + Args: + csv_files: Paths to the CSV files to profile. + output_path: Path for the output Excel file. + min_cell_count: Minimum frequency for a value to appear in the report. + excluded_columns: Column names to omit from profiling entirely. Useful + for columns such as dates that are not needed in a scan report. + """ logger.info("Generating scan report for %d table(s)", len(csv_files)) tables = [] @@ -81,9 +91,10 @@ def generate_scan_report( for csv_file in csv_files: csv_file = Path(csv_file) header = read_csv_header(csv_file.as_posix()) - logger.info("Scanning '%s' (%d field(s))", csv_file.name, len(header)) + fields = [f for f in header if f not in (excluded_columns or [])] + logger.info("Scanning '%s' (%d field(s))", csv_file.name, len(fields)) tables.append( - {"name": csv_file.name, "path": csv_file.as_posix(), "fields": header} + {"name": csv_file.name, "path": csv_file.as_posix(), "fields": fields} ) tables.sort(key=lambda t: t["name"]) diff --git a/tests/test_profile.py b/tests/test_profile.py new file mode 100644 index 0000000..bdbf286 --- /dev/null +++ b/tests/test_profile.py @@ -0,0 +1,94 @@ +import csv +from pathlib import Path + +import openpyxl +import pytest + +from nuh_helper import generate_scan_report + + +@pytest.fixture +def simple_csv(tmp_path: Path) -> Path: + """CSV with columns: id, name, dob, score.""" + path = tmp_path / "patients.csv" + with open(path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["id", "name", "dob", "score"]) + writer.writerow(["1", "Alice", "1980-01-01", "10"]) + writer.writerow(["2", "Bob", "1990-06-15", "20"]) + return path + + +def load_sheet_headers(wb: openpyxl.Workbook, sheet_name: str) -> list[str]: + ws = wb[sheet_name] + return [cell.value for cell in next(ws.iter_rows(min_row=1, max_row=1))] + + +def field_overview_fields(wb: openpyxl.Workbook) -> list[str]: + ws = wb["Field Overview"] + return [row[1].value for row in ws.iter_rows(min_row=2) if row[1].value] + + +def value_sheet_columns(wb: openpyxl.Workbook, sheet_name: str) -> list[str]: + """Return the field names (every other header) from a value sheet.""" + ws = wb[sheet_name] + headers = [cell.value for cell in next(ws.iter_rows(min_row=1, max_row=1))] + return [headers[i] for i in range(0, len(headers), 2)] + + +def test_no_excluded_columns(simple_csv: Path, tmp_path: Path) -> None: + out = tmp_path / "report.xlsx" + generate_scan_report([str(simple_csv)], output_path=str(out)) + wb = openpyxl.load_workbook(out) + fields = field_overview_fields(wb) + assert fields == ["id", "name", "dob", "score"] + + +def test_excluded_columns_removed_from_field_overview( + simple_csv: Path, tmp_path: Path +) -> None: + out = tmp_path / "report.xlsx" + generate_scan_report( + [str(simple_csv)], output_path=str(out), excluded_columns=["dob"] + ) + wb = openpyxl.load_workbook(out) + fields = field_overview_fields(wb) + assert "dob" not in fields + assert fields == ["id", "name", "score"] + + +def test_excluded_columns_removed_from_value_sheet( + simple_csv: Path, tmp_path: Path +) -> None: + out = tmp_path / "report.xlsx" + generate_scan_report( + [str(simple_csv)], output_path=str(out), excluded_columns=["dob"] + ) + wb = openpyxl.load_workbook(out) + columns = value_sheet_columns(wb, "patients.csv") + assert "dob" not in columns + assert columns == ["id", "name", "score"] + + +def test_multiple_excluded_columns(simple_csv: Path, tmp_path: Path) -> None: + out = tmp_path / "report.xlsx" + generate_scan_report( + [str(simple_csv)], output_path=str(out), excluded_columns=["dob", "id"] + ) + wb = openpyxl.load_workbook(out) + fields = field_overview_fields(wb) + assert fields == ["name", "score"] + columns = value_sheet_columns(wb, "patients.csv") + assert columns == ["name", "score"] + + +def test_excluded_nonexistent_column_is_ignored( + simple_csv: Path, tmp_path: Path +) -> None: + out = tmp_path / "report.xlsx" + generate_scan_report( + [str(simple_csv)], output_path=str(out), excluded_columns=["nonexistent"] + ) + wb = openpyxl.load_workbook(out) + fields = field_overview_fields(wb) + assert fields == ["id", "name", "dob", "score"] From 54c4f782b5be2dd5755c12c80e455531eea61daa Mon Sep 17 00:00:00 2001 From: Andy Rae <1127507+AndyRae@users.noreply.github.com> Date: Mon, 13 Apr 2026 11:36:20 +0100 Subject: [PATCH 2/3] Add configuration --- README.md | 18 ++++++++- nuh_helper/profile/__init__.py | 12 ++++-- tests/test_profile.py | 73 +++++++++++++++++++++++----------- 3 files changed, 74 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 1955dc1..78a43c4 100644 --- a/README.md +++ b/README.md @@ -168,10 +168,26 @@ Profile a dataset and generate a Scan Report. ```python from nuh_helper import generate_scan_report - csv_files = [ "patients.csv", + "visits.csv", ] generate_scan_report(csv_files, min_cell_count=5) ``` + +### Excluding columns from profiling + +Some columns (e.g. dates, free-text notes) are not useful in a scan report and can be excluded per file using `excluded_columns`: + +```python +generate_scan_report( + csv_files, + excluded_columns={ + "patients.csv": ["dob", "nhs_number"], + "visits.csv": ["visit_date"], + }, +) +``` + +Keys are CSV filenames. Excluded columns are omitted from the Field Overview and value sheets entirely. Tables not listed in the dict are unaffected. diff --git a/nuh_helper/profile/__init__.py b/nuh_helper/profile/__init__.py index 735000c..354c6f3 100644 --- a/nuh_helper/profile/__init__.py +++ b/nuh_helper/profile/__init__.py @@ -73,7 +73,7 @@ def generate_scan_report( csv_files: list[str], output_path: str = SCAN_REPORT_FILE_NAME, min_cell_count: int = 1, - excluded_columns: list[str] | None = None, + excluded_columns: dict[str, list[str]] | None = None, ) -> str: """Generate a WhiteRabbit-compatible Scan Report from CSV files. @@ -81,8 +81,11 @@ def generate_scan_report( csv_files: Paths to the CSV files to profile. output_path: Path for the output Excel file. min_cell_count: Minimum frequency for a value to appear in the report. - excluded_columns: Column names to omit from profiling entirely. Useful - for columns such as dates that are not needed in a scan report. + excluded_columns: Per-file column names to omit from profiling. Keys + are CSV filenames (e.g. ``"patients.csv"``), values are lists of + column names to exclude. For example:: + + excluded_columns={"patients.csv": ["dob", "nhs_number"]} """ logger.info("Generating scan report for %d table(s)", len(csv_files)) @@ -91,7 +94,8 @@ def generate_scan_report( for csv_file in csv_files: csv_file = Path(csv_file) header = read_csv_header(csv_file.as_posix()) - fields = [f for f in header if f not in (excluded_columns or [])] + skip = (excluded_columns or {}).get(csv_file.name, []) + fields = [f for f in header if f not in skip] logger.info("Scanning '%s' (%d field(s))", csv_file.name, len(fields)) tables.append( {"name": csv_file.name, "path": csv_file.as_posix(), "fields": fields} diff --git a/tests/test_profile.py b/tests/test_profile.py index bdbf286..598793e 100644 --- a/tests/test_profile.py +++ b/tests/test_profile.py @@ -19,18 +19,28 @@ def simple_csv(tmp_path: Path) -> Path: return path -def load_sheet_headers(wb: openpyxl.Workbook, sheet_name: str) -> list[str]: - ws = wb[sheet_name] - return [cell.value for cell in next(ws.iter_rows(min_row=1, max_row=1))] +@pytest.fixture +def second_csv(tmp_path: Path) -> Path: + """CSV with columns: visit_id, patient_id, visit_date, result.""" + path = tmp_path / "visits.csv" + with open(path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["visit_id", "patient_id", "visit_date", "result"]) + writer.writerow(["101", "1", "2024-01-10", "normal"]) + writer.writerow(["102", "2", "2024-02-20", "abnormal"]) + return path -def field_overview_fields(wb: openpyxl.Workbook) -> list[str]: +def field_overview_fields(wb: openpyxl.Workbook, table_name: str) -> list[str]: ws = wb["Field Overview"] - return [row[1].value for row in ws.iter_rows(min_row=2) if row[1].value] + return [ + row[1].value + for row in ws.iter_rows(min_row=2) + if row[0].value == table_name and row[1].value + ] def value_sheet_columns(wb: openpyxl.Workbook, sheet_name: str) -> list[str]: - """Return the field names (every other header) from a value sheet.""" ws = wb[sheet_name] headers = [cell.value for cell in next(ws.iter_rows(min_row=1, max_row=1))] return [headers[i] for i in range(0, len(headers), 2)] @@ -40,8 +50,7 @@ def test_no_excluded_columns(simple_csv: Path, tmp_path: Path) -> None: out = tmp_path / "report.xlsx" generate_scan_report([str(simple_csv)], output_path=str(out)) wb = openpyxl.load_workbook(out) - fields = field_overview_fields(wb) - assert fields == ["id", "name", "dob", "score"] + assert field_overview_fields(wb, "patients.csv") == ["id", "name", "dob", "score"] def test_excluded_columns_removed_from_field_overview( @@ -49,12 +58,12 @@ def test_excluded_columns_removed_from_field_overview( ) -> None: out = tmp_path / "report.xlsx" generate_scan_report( - [str(simple_csv)], output_path=str(out), excluded_columns=["dob"] + [str(simple_csv)], + output_path=str(out), + excluded_columns={"patients.csv": ["dob"]}, ) wb = openpyxl.load_workbook(out) - fields = field_overview_fields(wb) - assert "dob" not in fields - assert fields == ["id", "name", "score"] + assert field_overview_fields(wb, "patients.csv") == ["id", "name", "score"] def test_excluded_columns_removed_from_value_sheet( @@ -62,24 +71,39 @@ def test_excluded_columns_removed_from_value_sheet( ) -> None: out = tmp_path / "report.xlsx" generate_scan_report( - [str(simple_csv)], output_path=str(out), excluded_columns=["dob"] + [str(simple_csv)], + output_path=str(out), + excluded_columns={"patients.csv": ["dob"]}, ) wb = openpyxl.load_workbook(out) - columns = value_sheet_columns(wb, "patients.csv") - assert "dob" not in columns - assert columns == ["id", "name", "score"] + assert value_sheet_columns(wb, "patients.csv") == ["id", "name", "score"] def test_multiple_excluded_columns(simple_csv: Path, tmp_path: Path) -> None: out = tmp_path / "report.xlsx" generate_scan_report( - [str(simple_csv)], output_path=str(out), excluded_columns=["dob", "id"] + [str(simple_csv)], + output_path=str(out), + excluded_columns={"patients.csv": ["dob", "id"]}, + ) + wb = openpyxl.load_workbook(out) + assert field_overview_fields(wb, "patients.csv") == ["name", "score"] + assert value_sheet_columns(wb, "patients.csv") == ["name", "score"] + + +def test_exclusions_are_per_table( + simple_csv: Path, second_csv: Path, tmp_path: Path +) -> None: + """Exclusions on one table must not affect another.""" + out = tmp_path / "report.xlsx" + generate_scan_report( + [str(simple_csv), str(second_csv)], + output_path=str(out), + excluded_columns={"patients.csv": ["dob"]}, ) wb = openpyxl.load_workbook(out) - fields = field_overview_fields(wb) - assert fields == ["name", "score"] - columns = value_sheet_columns(wb, "patients.csv") - assert columns == ["name", "score"] + assert "dob" not in field_overview_fields(wb, "patients.csv") + assert "visit_date" in field_overview_fields(wb, "visits.csv") def test_excluded_nonexistent_column_is_ignored( @@ -87,8 +111,9 @@ def test_excluded_nonexistent_column_is_ignored( ) -> None: out = tmp_path / "report.xlsx" generate_scan_report( - [str(simple_csv)], output_path=str(out), excluded_columns=["nonexistent"] + [str(simple_csv)], + output_path=str(out), + excluded_columns={"patients.csv": ["nonexistent"]}, ) wb = openpyxl.load_workbook(out) - fields = field_overview_fields(wb) - assert fields == ["id", "name", "dob", "score"] + assert field_overview_fields(wb, "patients.csv") == ["id", "name", "dob", "score"] From a41fb7d545c7537cf35074033994ce2ddfb0fc40 Mon Sep 17 00:00:00 2001 From: Andy Rae <1127507+AndyRae@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:09:37 +0100 Subject: [PATCH 3/3] Fix columns showing --- README.md | 2 +- nuh_helper/profile/__init__.py | 11 +++++---- tests/test_profile.py | 44 +++++++++++++++++++++++++--------- 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 78a43c4..acf535e 100644 --- a/README.md +++ b/README.md @@ -190,4 +190,4 @@ generate_scan_report( ) ``` -Keys are CSV filenames. Excluded columns are omitted from the Field Overview and value sheets entirely. Tables not listed in the dict are unaffected. +Keys are CSV filenames. Excluded columns still appear in the Field Overview and as column headers in the value sheets, but no values are collected or shown for them. Tables not listed in the dict are unaffected. diff --git a/nuh_helper/profile/__init__.py b/nuh_helper/profile/__init__.py index 354c6f3..eae1636 100644 --- a/nuh_helper/profile/__init__.py +++ b/nuh_helper/profile/__init__.py @@ -81,9 +81,11 @@ def generate_scan_report( csv_files: Paths to the CSV files to profile. output_path: Path for the output Excel file. min_cell_count: Minimum frequency for a value to appear in the report. - excluded_columns: Per-file column names to omit from profiling. Keys + excluded_columns: Per-file columns to skip when collecting values. Keys are CSV filenames (e.g. ``"patients.csv"``), values are lists of - column names to exclude. For example:: + column names. The columns still appear in the Field Overview and + value sheet headers, but no values are collected for them. For + example:: excluded_columns={"patients.csv": ["dob", "nhs_number"]} """ @@ -94,8 +96,7 @@ def generate_scan_report( for csv_file in csv_files: csv_file = Path(csv_file) header = read_csv_header(csv_file.as_posix()) - skip = (excluded_columns or {}).get(csv_file.name, []) - fields = [f for f in header if f not in skip] + fields = header logger.info("Scanning '%s' (%d field(s))", csv_file.name, len(fields)) tables.append( {"name": csv_file.name, "path": csv_file.as_posix(), "fields": fields} @@ -127,6 +128,8 @@ def generate_scan_report( for table in tables: table_name_indexed = indexed_names[table["name"]] value_data, row_count = scan_csv_values(table["path"], min_cell_count) + for col in (excluded_columns or {}).get(table["name"], []): + value_data[col] = [] table_value_data[table_name_indexed] = value_data table_sheet.append( diff --git a/tests/test_profile.py b/tests/test_profile.py index 598793e..438178e 100644 --- a/tests/test_profile.py +++ b/tests/test_profile.py @@ -46,14 +46,26 @@ def value_sheet_columns(wb: openpyxl.Workbook, sheet_name: str) -> list[str]: return [headers[i] for i in range(0, len(headers), 2)] +def value_sheet_data( + wb: openpyxl.Workbook, sheet_name: str, column: str +) -> list[str]: + """Return the values listed under a given column in a value sheet.""" + ws = wb[sheet_name] + rows = list(ws.iter_rows(values_only=True)) + header = rows[0] + col_index = next(i for i, v in enumerate(header) if v == column) + return [row[col_index] for row in rows[1:] if row[col_index] not in (None, "")] + + def test_no_excluded_columns(simple_csv: Path, tmp_path: Path) -> None: out = tmp_path / "report.xlsx" generate_scan_report([str(simple_csv)], output_path=str(out)) wb = openpyxl.load_workbook(out) assert field_overview_fields(wb, "patients.csv") == ["id", "name", "dob", "score"] + assert value_sheet_columns(wb, "patients.csv") == ["id", "name", "dob", "score"] -def test_excluded_columns_removed_from_field_overview( +def test_excluded_column_still_in_field_overview( simple_csv: Path, tmp_path: Path ) -> None: out = tmp_path / "report.xlsx" @@ -63,10 +75,10 @@ def test_excluded_columns_removed_from_field_overview( excluded_columns={"patients.csv": ["dob"]}, ) wb = openpyxl.load_workbook(out) - assert field_overview_fields(wb, "patients.csv") == ["id", "name", "score"] + assert "dob" in field_overview_fields(wb, "patients.csv") -def test_excluded_columns_removed_from_value_sheet( +def test_excluded_column_still_in_value_sheet_header( simple_csv: Path, tmp_path: Path ) -> None: out = tmp_path / "report.xlsx" @@ -76,19 +88,29 @@ def test_excluded_columns_removed_from_value_sheet( excluded_columns={"patients.csv": ["dob"]}, ) wb = openpyxl.load_workbook(out) - assert value_sheet_columns(wb, "patients.csv") == ["id", "name", "score"] + assert "dob" in value_sheet_columns(wb, "patients.csv") + + +def test_excluded_column_has_no_values(simple_csv: Path, tmp_path: Path) -> None: + out = tmp_path / "report.xlsx" + generate_scan_report( + [str(simple_csv)], + output_path=str(out), + excluded_columns={"patients.csv": ["dob"]}, + ) + wb = openpyxl.load_workbook(out) + assert value_sheet_data(wb, "patients.csv", "dob") == [] -def test_multiple_excluded_columns(simple_csv: Path, tmp_path: Path) -> None: +def test_non_excluded_column_still_has_values(simple_csv: Path, tmp_path: Path) -> None: out = tmp_path / "report.xlsx" generate_scan_report( [str(simple_csv)], output_path=str(out), - excluded_columns={"patients.csv": ["dob", "id"]}, + excluded_columns={"patients.csv": ["dob"]}, ) wb = openpyxl.load_workbook(out) - assert field_overview_fields(wb, "patients.csv") == ["name", "score"] - assert value_sheet_columns(wb, "patients.csv") == ["name", "score"] + assert value_sheet_data(wb, "patients.csv", "name") != [] def test_exclusions_are_per_table( @@ -102,8 +124,8 @@ def test_exclusions_are_per_table( excluded_columns={"patients.csv": ["dob"]}, ) wb = openpyxl.load_workbook(out) - assert "dob" not in field_overview_fields(wb, "patients.csv") - assert "visit_date" in field_overview_fields(wb, "visits.csv") + assert value_sheet_data(wb, "patients.csv", "dob") == [] + assert value_sheet_data(wb, "visits.csv", "visit_date") != [] def test_excluded_nonexistent_column_is_ignored( @@ -116,4 +138,4 @@ def test_excluded_nonexistent_column_is_ignored( excluded_columns={"patients.csv": ["nonexistent"]}, ) wb = openpyxl.load_workbook(out) - assert field_overview_fields(wb, "patients.csv") == ["id", "name", "dob", "score"] + assert value_sheet_data(wb, "patients.csv", "name") != []