In [1]:

# CELL 0 This was for dbrepo DO NOT START WITH THIS
import getpass
import requests
from dbrepo.RestClient import RestClient

# 1) connect to DBRepo
username = "bbaltal"
password = getpass.getpass("Enter DBRepo password: ")
client = RestClient(
    endpoint="https://test.dbrepo.tuwien.ac.at",
    username=username,
    password=password
)

# 2) download each PID to local CSV
splits = {
    "regular_train.csv": "4421e56c-4cd3-4ec1-a566-a89d7ec0bced",
    "playoff_train.csv":  "bcb3cf2b-27df-48cc-8b76-9e49254783d0",
    "regular_test.csv": "f9d84d5e-db01-4475-b7d1-80cfe9fe0e61",
    "playoff_test.csv":  "de37d568-e97f-4cb9-bc05-2e600cc97102"
}
for fname, pid in splits.items():
    df = client.get_identifier_data(identifier_id=pid)
    df.to_csv(fname, index=False)
    print(f"Downloaded → {fname}")

Downloaded → regular_train.csv
Downloaded → playoff_train.csv
Downloaded → regular_test.csv
Downloaded → playoff_test.csv

In [1]:

# Cell-1 Reading the csv data
from pathlib import Path
import pandas as pd
import os
# compute project root (one level up from notebooks/)
project_root = Path().resolve().parent

# build path to processed data
processed_dir = project_root / "processed"

df_reg_tr = pd.read_csv(os.path.join(processed_dir, "regular_train.csv"))
df_reg_te = pd.read_csv(os.path.join(processed_dir, "regular_test.csv"))
df_plf_tr = pd.read_csv(os.path.join(processed_dir, "playoff_train.csv"))
df_plf_te = pd.read_csv(os.path.join(processed_dir, "playoff_test.csv"))

In [2]:

# Cell 2 ── Define your six core numeric targets
TARGETS = ["pts","reb","stl","tov","fg3a","fga"]

In [3]:

# Cell 3 ── Preprocess function (drops one TARGET, identifiers, encodes)
import pandas as pd

def preprocess(df_tr, df_te, TARGET):
    # split out the one target
    X_tr = df_tr.drop(columns=[TARGET])
    X_te = df_te.drop(columns=[TARGET])
    y_tr = df_tr[TARGET]
    y_te = df_te[TARGET]
    # drop identifiers/high-cardinality
    for c in ['regular_season_index','playoff_index','player_id','player','team_id','team']:
        X_tr = X_tr.drop(columns=[c], errors='ignore')
        X_te = X_te.drop(columns=[c], errors='ignore')
    # convert any remaining object cols to numeric
    for c in X_tr.select_dtypes(include=['object']).columns:
        X_tr[c] = pd.to_numeric(X_tr[c], errors='coerce')
        X_te[c] = pd.to_numeric(X_te[c], errors='coerce')
    # one-hot encode
    X_tr = pd.get_dummies(X_tr, drop_first=True)
    X_te = pd.get_dummies(X_te, drop_first=True).reindex(columns=X_tr.columns, fill_value=0)
    return X_tr, X_te, y_tr, y_te

In [4]:

import pandas as pd

# 0) load the CSVs you downloaded  from Cell 1
df_reg_tr = pd.read_csv(os.path.join(processed_dir, "regular_train.csv"))
df_reg_te = pd.read_csv(os.path.join(processed_dir, "regular_test.csv"))
df_plf_tr = pd.read_csv(os.path.join(processed_dir, "playoff_train.csv"))
df_plf_te = pd.read_csv(os.path.join(processed_dir, "playoff_test.csv"))

# 1) now call your preprocess() on each
X_reg_tr, X_reg_te, Y_reg_tr_pts, Y_reg_te_pts = preprocess(df_reg_tr, df_reg_te, TARGET=TARGETS[0])
X_plf_tr, X_plf_te, Y_plf_tr_pts, Y_plf_te_pts = preprocess(df_plf_tr, df_plf_te, TARGET=TARGETS[0])

print("X_reg_tr", X_reg_tr.shape, "X_reg_te", X_reg_te.shape)
print("X_plf_tr", X_plf_tr.shape, "X_plf_te", X_plf_te.shape)

X_reg_tr (5148, 25) X_reg_te (2145, 25)
X_plf_tr (539, 25) X_plf_te (214, 25)

In [5]:

# Cell 5 ── Build full multi-Y and drop all TARGET columns from X

# Whole regular-season multi-Y
Y_reg_tr = df_reg_tr[TARGETS].apply(pd.to_numeric, errors='coerce')
Y_reg_te = df_reg_te[TARGETS].apply(pd.to_numeric, errors='coerce')
for t in TARGETS:
    X_reg_tr = X_reg_tr.drop(columns=[t], errors='ignore')
    X_reg_te = X_reg_te.drop(columns=[t], errors='ignore')

# Whole playoffs multi-Y
Y_plf_tr = df_plf_tr[TARGETS].apply(pd.to_numeric, errors='coerce')
Y_plf_te = df_plf_te[TARGETS].apply(pd.to_numeric, errors='coerce')
for t in TARGETS:
    X_plf_tr = X_plf_tr.drop(columns=[t], errors='ignore')
    X_plf_te = X_plf_te.drop(columns=[t], errors='ignore')

In [6]:

# Cell 8 ── LAL team split (regular & playoffs)

# regular LAL 70/30 random
from sklearn.model_selection import train_test_split
df_LAL_r = pd.concat([df_reg_tr, df_reg_te]).query("team=='LAL'").reset_index(drop=True)
df_LAL_r_tr, df_LAL_r_te = train_test_split(df_LAL_r, test_size=0.3, random_state=42)

# playoffs LAL 70/30 random
df_LAL_p = pd.concat([df_plf_tr, df_plf_te]).query("team=='LAL'").reset_index(drop=True)
df_LAL_p_tr, df_LAL_p_te = train_test_split(df_LAL_p, test_size=0.3, random_state=42)

# preprocess & multi-Y
X_LAL_r_tr, X_LAL_r_te, _, _ = preprocess(df_LAL_r_tr, df_LAL_r_te, TARGET=TARGETS[0])
Y_LAL_r_tr = df_LAL_r_tr[TARGETS].apply(pd.to_numeric, errors='coerce')
Y_LAL_r_te = df_LAL_r_te[TARGETS].apply(pd.to_numeric, errors='coerce')
for t in TARGETS:
    X_LAL_r_tr = X_LAL_r_tr.drop(columns=[t], errors='ignore')
    X_LAL_r_te = X_LAL_r_te.drop(columns=[t], errors='ignore')

X_LAL_p_tr, X_LAL_p_te, _, _ = preprocess(df_LAL_p_tr, df_LAL_p_te, TARGET=TARGETS[0])
Y_LAL_p_tr = df_LAL_p_tr[TARGETS].apply(pd.to_numeric, errors='coerce')
Y_LAL_p_te = df_LAL_p_te[TARGETS].apply(pd.to_numeric, errors='coerce')
for t in TARGETS:
    X_LAL_p_tr = X_LAL_p_tr.drop(columns=[t], errors='ignore')
    X_LAL_p_te = X_LAL_p_te.drop(columns=[t], errors='ignore')

print("Shapes (X_tr, Y_tr, X_te, Y_te):")
for name, (Xtr, Ytr, Xte, Yte) in [
    ("Reg-Whole", (X_reg_tr, Y_reg_tr, X_reg_te, Y_reg_te)),
    ("Plf-Whole", (X_plf_tr, Y_plf_tr, X_plf_te, Y_plf_te)),
    ("Reg-LAL",   (X_LAL_r_tr, Y_LAL_r_tr, X_LAL_r_te, Y_LAL_r_te)),
    ("Plf-LAL",   (X_LAL_p_tr, Y_LAL_p_tr, X_LAL_p_te, Y_LAL_p_te)),
]:
    print(f"{name:8s}", Xtr.shape, Ytr.shape, Xte.shape, Yte.shape)

Shapes (X_tr, Y_tr, X_te, Y_te):
Reg-Whole (5148, 20) (5148, 6) (2145, 20) (2145, 6)
Plf-Whole (539, 20) (539, 6) (214, 20) (214, 6)
Reg-LAL  (155, 20) (155, 6) (67, 20) (67, 6)
Plf-LAL  (18, 20) (18, 6) (9, 20) (9, 6)

In [7]:

# Cell 3 ── Define your six core numeric targets and scenario list (no R10)
TARGETS = ["pts","reb","stl","tov","fg3a","fga"]

# scenarios ← only Whole and LAL, for Regular and Playoffs
scenarios = [
    ("Reg-Whole", df_reg_tr, df_reg_te),
    ("Plf-Whole", df_plf_tr, df_plf_te),
    ("Reg-LAL", *train_test_split(pd.concat([df_reg_tr, df_reg_te])
                                   .query("team=='LAL'"),
                                  test_size=0.3, random_state=42)),
    ("Plf-LAL", *train_test_split(pd.concat([df_plf_tr, df_plf_te])
                                   .query("team=='LAL'"),
                                  test_size=0.3, random_state=42)),
]

# Cell 4 ── Preprocess function
import pandas as pd

def preprocess(df_tr, df_te, TARGET):
    X_tr = df_tr.drop(columns=[TARGET])
    X_te = df_te.drop(columns=[TARGET])
    y_tr = df_tr[TARGET]
    y_te = df_te[TARGET]
    for c in ['regular_season_index','playoff_index','player_id','player','team_id','team']:
        X_tr = X_tr.drop(columns=[c], errors='ignore')
        X_te = X_te.drop(columns=[c], errors='ignore')
    for c in X_tr.select_dtypes(include=['object']).columns:
        X_tr[c] = pd.to_numeric(X_tr[c], errors='coerce')
        X_te[c] = pd.to_numeric(X_te[c], errors='coerce')
    X_tr = pd.get_dummies(X_tr, drop_first=True)
    X_te = pd.get_dummies(X_te, drop_first=True).reindex(columns=X_tr.columns, fill_value=0)
    return X_tr, X_te, y_tr, y_te

# Cell 5 ── Build feature / multi-Y matrices for each scenario
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

results = []
for name, dtr, dte in scenarios:
    # preprocess all TARGETS at once
    # start by doing pts then drop all TARGET columns from X
    X_tr, X_te, _, _ = preprocess(dtr, dte, TARGET=TARGETS[0])
    for t in TARGETS:
        X_tr = X_tr.drop(columns=[t], errors='ignore')
        X_te = X_te.drop(columns=[t], errors='ignore')
    Y_tr = dtr[TARGETS].apply(pd.to_numeric, errors='coerce')
    Y_te = dte[TARGETS].apply(pd.to_numeric, errors='coerce')

    # train & predict multi-output
    m = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
    m.fit(X_tr, Y_tr)
    Y_pr = m.predict(X_te)

    # evaluate & plot
    n = len(TARGETS)
    fig, axes = plt.subplots(2, n, figsize=(4*n, 8))
    fig.suptitle(name)
    for i, stat in enumerate(TARGETS):
        y_true = Y_te[stat].to_numpy()
        y_hat  = Y_pr[:, i]

        # scatter
        ax = axes[0, i]
        ax.scatter(y_true, y_hat, s=15)
        mn, mx = y_true.min(), y_true.max()
        ax.plot([mn,mx],[mn,mx],"--")
        ax.set_title(stat)
        ax.set_xlabel("Actual"); ax.set_ylabel("Predicted")

        # metrics bar
        mse = mean_squared_error(y_true, y_hat)
        mae = mean_absolute_error(y_true, y_hat)
        r2  = r2_score(y_true, y_hat)
        ax2 = axes[1, i]
        ax2.bar(["MSE","MAE","R²"], [mse, mae, abs(r2)])
        ax2.set_yscale("log")
        for j, v in enumerate([mse, mae, r2]):
            ax2.text(j, abs(v)*1.1, f"{v:.2f}", ha='center')

        results.append({"Scenario":name, "Stat":stat, "MAE":mae, "R²":r2})

    plt.tight_layout(rect=[0,0.03,1,0.95])
    plt.show()

# summary table
import pandas as pd
df_res = pd.DataFrame(results)
print(df_res.pivot(index="Scenario", columns="Stat", values=["MAE","R²"]))

No description has been provided for this image

                 MAE                                                           \
Stat            fg3a         fga         pts        reb        stl        tov   
Scenario                                                                        
Plf-LAL    40.632222  116.510000  149.147778  64.412222  10.538889  14.696667   
Plf-Whole   2.288505  649.697103  947.783738   1.689579   1.884626  15.672570   
Reg-LAL    10.390448   30.702090   19.914925  15.928358   6.322985   9.510149   
Reg-Whole   0.875274  450.904120  680.817953   0.443408   1.359520   8.698237   

                 R²                                                      
Stat           fg3a        fga        pts       reb       stl       tov  
Scenario                                                                 
Plf-LAL    0.736449   0.779868   0.807360  0.703269  0.638002  0.843946  
Plf-Whole  0.988650 -64.640755 -86.233810  0.994945  0.783666 -0.946914  
Reg-LAL    0.984303   0.978229   0.994966  0.961746  0.876785  0.920367  
Reg-Whole  0.997265 -37.670530 -54.540892  0.999440  0.908254  0.445678