Part 1. Building a Lightweight Data Validation Pipeline with UV and Polars
Background
Recently, I migrated a Boat Rating engine from static files (.csv/.parquet) to dynamic database connections. The primary challenge was ensuring that the data from the new sources remained consistent with the legacy expectations.
The engine uses ~100 variables and lacks modern engineering “safety nets.” I didn’t want to rewrite the entire architecture, so I focused on a “minimum viable update”: creating a validation layer that flags issues before they hit the engine.
Streamlining with UV
I used uv to manage the project. It removes the hassle of managing virtual environments across different servers.
Typical workflow:
```bash
uv init
uv add pandas polars
uv add --dev pytest ruff
uv run main.py
```
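For reference, the pyproject.toml that uv maintains after those commands looks roughly like this. This is an illustrative sketch, not copied from the project; the project name, version pins, and exact table layout depend on your uv version:

```toml
# Illustrative sketch of a uv-managed pyproject.toml; the name and
# version bounds below are placeholders. Recent uv versions put
# --dev packages in the PEP 735 [dependency-groups] table.
[project]
name = "data-validation-pipeline"
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
    "pandas>=2.0",
    "polars>=1.0",
]

[dependency-groups]
dev = [
    "pytest>=8.0",
    "ruff>=0.4",
]
```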
YAML-Driven Validation
I decided to use a .yaml file to define the schema for our 100+ variables. This keeps the logic separate from the code and makes it easy for others to update rules.
Key checks included (each maps to a short Polars expression; see the sketch after this list):
- Existence: Is the column there?
- Type Match: e.g., Integer vs String.
- Range: For numerical values.
- Allowed/Forbidden Values: Using lists or regex.
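Each of these reduces to a one-liner in Polars. A minimal sketch, where the column names and bounds are illustrative (they mirror the schema in the next section):

```python
# Minimal sketch of each check as a Polars operation; "id" and
# "mszoning" and their rules are illustrative examples.
import polars as pl

df = pl.DataFrame({"id": [1, 250], "mszoning": ["RL", "c2"]})

exists = "id" in df.columns                        # Existence
type_ok = df["id"].dtype == pl.Int64               # Type Match
out_of_range = df.filter(
    (pl.col("id") < 1) | (pl.col("id") > 200)      # Range
)
bad_values = df.filter(
    ~pl.col("mszoning").str.contains(r"^[A-Z]+$")  # Allowed values via regex
)
print(out_of_range, bad_values)
```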
The Schema (data_schema.yaml)
```yaml
# example setup, uses dataset from kaggle
schema:
  id:
    type: integer
    range: [1, 200]
  mssubclass:
    type: integer
    allowed_value: [60, 20, 70, 80]
  mszoning:
    type: string
    regex: "^[A-Z]+$"
  lotfrontage:
    type: float
  lotarea:
    type: float
    range: [1, 220000]
  street:
    type: string
    optional: true
  lotshape:
    type: string
    not_allowed_value: ["IR3"]
```
The Implementation (validate.py)
I built a DataValidator class using Polars. Polars is a good fit here because its expression API makes range and regex checks both fast and concise.
```python
import yaml
import polars as pl


class DataValidator:
    def __init__(self, schema_path: str):
        self.schema = self._load_schema(schema_path).get("schema", {})
        self.polars_dtypes = {
            "string": [pl.String, pl.Categorical, pl.Enum],
            "float": [pl.Float32, pl.Float64, pl.Decimal],
            "integer": [
                pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
            ],
            "date": [pl.Date, pl.Datetime, pl.Duration, pl.Time],
            "boolean": [pl.Boolean],
        }

    def _load_schema(self, schema_path):
        try:
            with open(schema_path, "r") as f:
                return yaml.safe_load(f)
        except Exception as e:
            raise ValueError(f"Failed to load schema: {e}")

    def _get_example_msg(self, invalid_df: pl.DataFrame, col_name: str) -> str:
        """Get up to 3 unique examples of offending values."""
        invalid_vals = invalid_df[col_name].unique().to_list()
        invalid_vals = [v for v in invalid_vals if v is not None]

        count = len(invalid_vals)
        examples = invalid_vals[:3]
        example_str = ", ".join(map(str, examples))

        if count > 3:
            return f"{example_str}, ... (+{count - 3} more)"
        return example_str

    def validate(self, df: pl.DataFrame) -> pl.DataFrame:
        errors = []

        for col_name, col_def in self.schema.items():
            # --- existence check ---
            if col_name not in df.columns:
                if not col_def.get("optional", False):
                    errors.append(
                        {
                            "variable": col_name,
                            "check": "Existence",
                            "description": "Column not found",
                            "examples": "N/A",
                        }
                    )
                continue

            col_expr = pl.col(col_name)

            # --- type check ---
            expected_key = col_def.get("type")
            actual_dtype = df[col_name].dtype
            if expected_key in self.polars_dtypes:
                if actual_dtype not in self.polars_dtypes[expected_key]:
                    errors.append(
                        {
                            "variable": col_name,
                            "check": "Type",
                            "description": f"Expected {expected_key}, got {actual_dtype}",
                            "examples": "N/A",
                        }
                    )

            # --- range check ---
            if "range" in col_def:
                low, high = col_def["range"]
                invalid = df.filter((col_expr < low) | (col_expr > high))
                if not invalid.is_empty():
                    errors.append(
                        {
                            "variable": col_name,
                            "check": "Range",
                            "description": f"{len(invalid)} rows outside [{low}, {high}]",
                            "examples": self._get_example_msg(invalid, col_name),
                        }
                    )

            # --- allowed values (whitelist) ---
            if "allowed_value" in col_def:
                allowed = col_def["allowed_value"]
                # Gemini tip: handle string vs numeric mismatch safely;
                # ideally your YAML file should not mix value types
                # for a single column.
                if (
                    any(isinstance(x, str) for x in allowed)
                    and actual_dtype != pl.String
                ):
                    check_expr = col_expr.cast(pl.String)
                else:
                    check_expr = col_expr

                invalid = df.filter(~check_expr.is_in(allowed))
                if not invalid.is_empty():
                    errors.append(
                        {
                            "variable": col_name,
                            "check": "Allowed Values",
                            "description": f"{len(invalid)} rows have invalid values",
                            "examples": self._get_example_msg(invalid, col_name),
                        }
                    )

            # --- not allowed values (blacklist) ---
            if "not_allowed_value" in col_def:
                forbidden = col_def["not_allowed_value"]
                # Same string vs numeric guard as the whitelist above
                if (
                    any(isinstance(x, str) for x in forbidden)
                    and actual_dtype != pl.String
                ):
                    check_expr = col_expr.cast(pl.String)
                else:
                    check_expr = col_expr

                invalid = df.filter(check_expr.is_in(forbidden))
                if not invalid.is_empty():
                    errors.append(
                        {
                            "variable": col_name,
                            "check": "Forbidden Values",
                            "description": f"{len(invalid)} rows found in blacklist",
                            "examples": self._get_example_msg(invalid, col_name),
                        }
                    )

            # --- regex check ---
            if "regex" in col_def:
                pattern = col_def["regex"]
                invalid = df.filter(~col_expr.cast(pl.String).str.contains(pattern))
                if not invalid.is_empty():
                    errors.append(
                        {
                            "variable": col_name,
                            "check": "Regex",
                            "description": f"{len(invalid)} rows mismatch pattern '{pattern}'",
                            "examples": self._get_example_msg(invalid, col_name),
                        }
                    )

        schema = {
            "variable": pl.String,
            "check": pl.String,
            "description": pl.String,
            "examples": pl.String,
        }
        return pl.DataFrame(errors, schema=schema)

    def report_result(self, error_df: pl.DataFrame):
        width = 50
        title_text = "VALIDATION REPORT"
        print("\n" + "=" * width)
        print(f"{title_text:^{width}}")
        print("=" * width + "\n")

        if error_df.is_empty():
            print("✅ All checks passed! No errors found.")
        else:
            # Gemini tip (a config I didn't know about before): tell
            # Polars to print at full width so text won't get cut off
            with pl.Config(
                tbl_formatting="ASCII_MARKDOWN",  # renders as a readable grid
                tbl_hide_column_data_types=True,
                tbl_rows=-1,  # show all rows
                fmt_str_lengths=100,  # allow long strings
            ):
                print(error_df)
        print("\n")


def main():
    data_path = "./data/train.parquet"
    validator = DataValidator("data_schema.yaml")

    print(f"Loading {data_path}...")
    df = pl.read_parquet(data_path)

    # Normalize column names to match the lowercase schema keys
    df.columns = [col.lower() for col in df.columns]

    error_df = validator.validate(df)
    validator.report_result(error_df)
if __name__ == "__main__":
    main()
```
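Since pytest is already in the dev group, the validator is easy to exercise in isolation. A minimal sketch, where the test file name and inline schema are illustrative rather than part of the project; run it with `uv run pytest`:

```python
# tests/test_validator.py — illustrative sketch, not from the project
import polars as pl
from validate import DataValidator


def test_range_check_flags_out_of_range_rows(tmp_path):
    # Inline schema: id must be an integer within [1, 200]
    schema_path = tmp_path / "schema.yaml"
    schema_path.write_text(
        "schema:\n"
        "  id:\n"
        "    type: integer\n"
        "    range: [1, 200]\n"
    )

    df = pl.DataFrame({"id": [1, 50, 999]})
    errors = DataValidator(str(schema_path)).validate(df)

    # Exactly one error row, produced by the Range check (999 > 200)
    assert errors.filter(pl.col("check") == "Range").height == 1
```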
Next Steps
- Refactor the Python script to use Pydantic for the YAML parsing (see the sketch after this list)
- A mechanism to remap data columns to match the rating engine
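For the Pydantic step, the idea is to validate the schema file itself at load time, so a typo in a rule fails loudly instead of being silently ignored. A rough sketch of that direction; the model and field names below are my guesses, not implemented yet:

```python
# Sketch of the planned Pydantic refactor (not implemented yet);
# ColumnRule and load_rules are hypothetical names.
from typing import Optional

import yaml
from pydantic import BaseModel


class ColumnRule(BaseModel):
    type: str
    range: Optional[tuple[float, float]] = None
    allowed_value: Optional[list] = None
    not_allowed_value: Optional[list] = None
    regex: Optional[str] = None
    optional: bool = False


def load_rules(schema_path: str) -> dict[str, ColumnRule]:
    """Parse the YAML and validate every rule up front; a malformed
    entry raises a pydantic.ValidationError naming the bad field."""
    with open(schema_path) as f:
        raw = yaml.safe_load(f)
    return {name: ColumnRule(**rule) for name, rule in raw["schema"].items()}
```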