Skip to content

Fix index misalignment in DataFrame concatenation for training and testing sets#30

Open
JulianAssmann wants to merge 1 commit into
SAP-samples:mainfrom
JulianAssmann:fix/index_misalignment_nan
Open

Fix index misalignment in DataFrame concatenation for training and testing sets#30
JulianAssmann wants to merge 1 commit into
SAP-samples:mainfrom
JulianAssmann:fix/index_misalignment_nan

Conversation

@JulianAssmann

Copy link
Copy Markdown

Fixes #29, a silent data corruption bug in get_tokenized_data() where pd.concat on index-misaligned X and y produced all-NaN training data, leading to all-NaN predictions.

Root Cause

fit() (rpt.py:119-122) stores X and y after type conversion:

if not isinstance(X, pd.DataFrame):
    X = pd.DataFrame(X)              # gets a fresh 0-based index
if not isinstance(y, pd.Series):
    y = pd.Series(y, name='TARGET')  # gets a fresh 0-based index

When X is already a DataFrame it keeps its original index (e.g. shuffled from train_test_split), but a numpy y gets wrapped with a 0-based index. These are stored as self.X_ and self.y_.

In get_tokenized_data() (rpt.py:152-153), pd.concat([X_train, y_train.to_frame()], axis=1) aligns on index. Mismatched indices cause pandas to fill every row with NaN, silently corrupting the entire training set.

Two of four input combinations are affected:

Case X y Broken?
1 DataFrame (shuffled) numpy (0-based) Yes
2 DataFrame Series (matching) No
3 numpy (0-based) numpy (0-based) No
4 numpy (0-based) Series (shuffled) Yes

Tests

All four input-type combinations are tested end-to-end (fit + predict) and assert finite predictions:

@pytest.fixture(scope="module")
def california_splits():
    data = fetch_california_housing()
    X_full = pd.DataFrame(data.data, columns=data.feature_names)

    y_np = data.target
    X_train_np, X_test_np, y_train_np, _ = train_test_split(
        X_full, y_np, train_size=500, random_state=42
    )

    y_series = pd.Series(data.target, name="MedHouseVal")
    X_train_s, X_test_s, y_train_series, _ = train_test_split(
        X_full, y_series, train_size=500, random_state=42
    )

    return {
        "X_train_df_np": (X_train_np, y_train_np, X_test_np.iloc[:5]),
        "X_train_df_series": (X_train_s, y_train_series, X_test_s.iloc[:5]),
        "X_train_np_np": (X_train_np.values, y_train_np, X_test_np.values[:5]),
        "X_train_np_series": (X_train_s.values, y_train_series, X_test_s.values[:5]),
    }

def _fit_and_predict(X_train, y_train, X_test):
    model = sap_rpt_oss.SAP_RPT_OSS_Regressor()
    model.fit(X_train, y_train)
    return model.predict(X_test)

class TestIndexMisalignmentFix:
    def test_case1_dataframe_x_numpy_y(self, california_splits):
        """Case 1: DataFrame X (shuffled index) + numpy y — was all-NaN before fix."""
        X_train, y_train, X_test = california_splits["X_train_df_np"]
        preds = _fit_and_predict(X_train, y_train, X_test)
        assert np.all(np.isfinite(preds))

    def test_case2_dataframe_x_series_y_matching_index(self, california_splits):
        """Case 2: DataFrame X + Series y — regression check, already worked."""
        X_train, y_train, X_test = california_splits["X_train_df_series"]
        preds = _fit_and_predict(X_train, y_train, X_test)
        assert np.all(np.isfinite(preds))

    def test_case3_numpy_x_numpy_y(self, california_splits):
        """Case 3: numpy X + numpy y — regression check, already worked."""
        X_train, y_train, X_test = california_splits["X_train_np_np"]
        preds = _fit_and_predict(X_train, y_train, X_test)
        assert np.all(np.isfinite(preds))

    def test_case4_numpy_x_series_y_custom_index(self, california_splits):
        """Case 4: numpy X + Series y (shuffled index) — was all-NaN before fix."""
        X_train, y_train, X_test = california_splits["X_train_np_series"]
        preds = _fit_and_predict(X_train, y_train, X_test)
        assert np.all(np.isfinite(preds))

@cla-assistant

cla-assistant Bot commented Mar 25, 2026

Copy link
Copy Markdown

CLA assistant check
Thank you for your submission! We really appreciate it. Like many open source projects, we ask that you sign our Contributor License Agreement before we can accept your contribution.
You have signed the CLA already but the status is still pending? Let us recheck it.

1 similar comment
@cla-assistant

cla-assistant Bot commented Mar 25, 2026

Copy link
Copy Markdown

CLA assistant check
Thank you for your submission! We really appreciate it. Like many open source projects, we ask that you sign our Contributor License Agreement before we can accept your contribution.
You have signed the CLA already but the status is still pending? Let us recheck it.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

predict() returns all-NaN when X is a DataFrame and y is a numpy array

2 participants