
Create equal-weight bins in Python using NumPy


The problem is creating bins from two series (or two columns) where one of them is your weight.

In my case, one column is a predicted frequency and the other is exposure: I want decile bins that each hold roughly 10% of the total exposure, not 10% of the rows.

import polars as pl
import numpy as np

np.random.seed(42)
n_rows = 10000
df = pl.DataFrame({
    "pred": np.random.rand(n_rows),
    "exp": np.random.rand(n_rows),
}).lazy()

# sort by prediction so cumulative exposure is monotonic along pred
df_sorted = df.select(['pred', 'exp']).collect().sort('pred')
cum_exp = df_sorted['exp'].cum_sum().to_numpy()
total_exp = cum_exp[-1]
# targets at 0%, 10%, ..., 100% of the total exposure
target_cum_weight = np.linspace(0, total_exp, 10 + 1)
# find the row index where the cumulative exposure reaches each target
indices = np.searchsorted(cum_exp, target_cum_weight)
# but we actually need the pred value at that row, not the weight
break_points = df_sorted['pred'].gather(indices).to_list()

(
    df.with_columns(
        pl.col('pred')
        .cut(breaks=break_points[1:-1],
             labels=[f'decile_{i:02d}' for i in range(10)])
        .alias('decile')
    )
    .group_by('decile')
    .agg(
        pl.len().alias('count'),
        pl.col('pred').sum().alias('total_pred'),
        pl.col('exp').sum().alias('total_exp'),
    )
    .collect()
    .sort('decile')
)

This should give equal-weight bins. The same break points can also be found with numpy.interp. The idea is to solve an equation: for each k, find the pred value p_k at which the cumulative weight equals k/10 of the total. That is just a one-dimensional interpolation of the inverse of the cumulative-weight curve.

# continuing from the sorted frame above
sorted_pred = df_sorted['pred'].to_numpy()
sorted_exp = df_sorted['exp'].to_numpy()
cum_weight = np.cumsum(sorted_exp)
start_weight = cum_weight[0]
end_weight = cum_weight[-1]
# define the target weights, the y-axis values
target_weights = np.linspace(start_weight, end_weight, 11)[1:-1]
# invert the curve: for each target weight, solve for the pred value
pred_breaks = np.interp(target_weights, cum_weight, sorted_pred)
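
As a quick sanity check (a sketch, assuming the variables from both snippets above are still in scope), the two sets of interior break points should land close together. They won't match exactly: searchsorted snaps to an existing row, interp interpolates between rows, and the second snippet starts its targets at cum_weight[0] rather than 0.

# compare the searchsorted breaks with the interp breaks
searchsorted_breaks = np.array(break_points[1:-1])
print(np.abs(searchsorted_breaks - pred_breaks).max())  # expect a small number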

The first option is normally the one to reach for, but in practice almost all of the time goes into the sort; the searchsorted (or interp) step is negligible by comparison. The sort is the elephant in the room here.
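
A minimal timing sketch to illustrate (the numbers will vary by machine; the point is only to separate the sort from the binning step):

import time

t0 = time.perf_counter()
df_sorted = df.collect().sort('pred')
t1 = time.perf_counter()
cum_exp = df_sorted['exp'].cum_sum().to_numpy()
indices = np.searchsorted(cum_exp, np.linspace(0, cum_exp[-1], 11))
t2 = time.perf_counter()
print(f"sort: {t1 - t0:.4f}s, binning: {t2 - t1:.6f}s")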

Another option with a Polars-native method

A small update on 11/20.

Below is an example that assumes we're trying to create a decile lift chart with equal exposure weight. This is a very specific use case: I'm analyzing Umbrella loss using some Auto data, so there's a loss threshold option.

from typing import Union

def calculate_lift(
    lf: pl.LazyFrame,
    filter_expr: pl.Expr,
    score_col: str,
    target_col: str,
    target_col_threshold: int,
    n_bins: int = 10,
    weight_col: Union[str, None] = None,
) -> pl.DataFrame:
    """
    Calculates lift analysis metrics by binning data based on equal cumulative weight.
    """
    if weight_col is None:
        raise ValueError("weight_col must be provided for equal weight binning.")
    # flag rows whose target exceeds the loss threshold
    filtered_lf = lf.filter((filter_expr) & (pl.col(score_col).is_not_null())).with_columns(
        pl.when(pl.col(target_col) > target_col_threshold).then(1).otherwise(0).alias("cc_target")
    )
    # sort by score descending so decile 1 holds the highest scores
    df = (
        filtered_lf.select([score_col, target_col, weight_col, "cc_target"])
        .sort(score_col, descending=True)
        .collect()
    )
    # assign bins based on equal weight (not equal count)
    df = df.with_columns(pl.col(weight_col).cum_sum().alias("cum_weight"))
    total_weight = df.select(pl.col(weight_col).sum()).item()
    df = df.with_columns(
        ((pl.col("cum_weight") / total_weight) * n_bins).ceil().cast(pl.Int32).alias("decile")
    )
    lift_data = df.group_by("decile").agg(
        [
            pl.col(weight_col).sum().alias("exposure"),
            pl.col("cc_target").sum().alias("actual"),
        ]
    )
    # overall rate per unit of exposure, used as the lift baseline
    overall_rate = df.select(pl.col("cc_target")).sum().item() / total_weight
    lift_data = lift_data.with_columns(
        [
            (pl.col("actual") / pl.col("exposure")).alias("rate"),
            (pl.col("actual") / pl.col("exposure") / overall_rate).alias("lift"),
        ]
    ).sort("decile")
    lift_data = lift_data.with_columns(
        [
            pl.col("exposure").cum_sum().alias("cum_exposure"),
            pl.col("actual").cum_sum().alias("cum_actual"),
        ]
    )
    total_actual = lift_data.select(pl.col("actual").sum()).item()
    # capture curve: share of actuals vs share of population down the deciles
    lift_data = lift_data.with_columns(
        [
            (pl.col("cum_actual") / total_actual * 100).alias("pct_captured"),
            (pl.col("cum_exposure") / pl.col("cum_exposure").max() * 100).alias("pct_population"),
        ]
    )
    return lift_data
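
A quick usage sketch against a synthetic frame (the column names and the threshold here are made up for illustration; substitute your real score, loss, and exposure columns):

lf = pl.DataFrame({
    "score": np.random.rand(n_rows),
    "loss": np.random.exponential(500, n_rows),
    "exposure": np.random.rand(n_rows),
}).lazy()

result = calculate_lift(
    lf,
    filter_expr=pl.col("exposure") > 0,
    score_col="score",
    target_col="loss",
    target_col_threshold=1000,
    weight_col="exposure",
)
print(result)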