# CELL 0 This was for dbrepo DO NOT START WITH THIS
import getpass
import requests
from dbrepo.RestClient import RestClient
# 1) connect to DBRepo
username = "bbaltal"
password = getpass.getpass("Enter DBRepo password: ")
client = RestClient(
endpoint="https://test.dbrepo.tuwien.ac.at",
username=username,
password=password
)
# 2) download each PID to local CSV
splits = {
"regular_train.csv": "4421e56c-4cd3-4ec1-a566-a89d7ec0bced",
"playoff_train.csv": "bcb3cf2b-27df-48cc-8b76-9e49254783d0",
"regular_test.csv": "f9d84d5e-db01-4475-b7d1-80cfe9fe0e61",
"playoff_test.csv": "de37d568-e97f-4cb9-bc05-2e600cc97102"
}
for fname, pid in splits.items():
df = client.get_identifier_data(identifier_id=pid)
df.to_csv(fname, index=False)
print(f"Downloaded → {fname}")
Downloaded → regular_train.csv Downloaded → playoff_train.csv Downloaded → regular_test.csv Downloaded → playoff_test.csv
# Cell-1 Reading the csv data
from pathlib import Path
import pandas as pd
import os
# compute project root (one level up from notebooks/)
project_root = Path().resolve().parent
# build path to processed data
processed_dir = project_root / "processed"
df_reg_tr = pd.read_csv(os.path.join(processed_dir, "regular_train.csv"))
df_reg_te = pd.read_csv(os.path.join(processed_dir, "regular_test.csv"))
df_plf_tr = pd.read_csv(os.path.join(processed_dir, "playoff_train.csv"))
df_plf_te = pd.read_csv(os.path.join(processed_dir, "playoff_test.csv"))
# Cell 2 ── Define your six core numeric targets
TARGETS = ["pts","reb","stl","tov","fg3a","fga"]
# Cell 3 ── Preprocess function (drops one TARGET, identifiers, encodes)
import pandas as pd
def preprocess(df_tr, df_te, TARGET):
# split out the one target
X_tr = df_tr.drop(columns=[TARGET])
X_te = df_te.drop(columns=[TARGET])
y_tr = df_tr[TARGET]
y_te = df_te[TARGET]
# drop identifiers/high-cardinality
for c in ['regular_season_index','playoff_index','player_id','player','team_id','team']:
X_tr = X_tr.drop(columns=[c], errors='ignore')
X_te = X_te.drop(columns=[c], errors='ignore')
# convert any remaining object cols to numeric
for c in X_tr.select_dtypes(include=['object']).columns:
X_tr[c] = pd.to_numeric(X_tr[c], errors='coerce')
X_te[c] = pd.to_numeric(X_te[c], errors='coerce')
# one-hot encode
X_tr = pd.get_dummies(X_tr, drop_first=True)
X_te = pd.get_dummies(X_te, drop_first=True).reindex(columns=X_tr.columns, fill_value=0)
return X_tr, X_te, y_tr, y_te
import pandas as pd
# 0) load the CSVs you downloaded from Cell 1
df_reg_tr = pd.read_csv(os.path.join(processed_dir, "regular_train.csv"))
df_reg_te = pd.read_csv(os.path.join(processed_dir, "regular_test.csv"))
df_plf_tr = pd.read_csv(os.path.join(processed_dir, "playoff_train.csv"))
df_plf_te = pd.read_csv(os.path.join(processed_dir, "playoff_test.csv"))
# 1) now call your preprocess() on each
X_reg_tr, X_reg_te, Y_reg_tr_pts, Y_reg_te_pts = preprocess(df_reg_tr, df_reg_te, TARGET=TARGETS[0])
X_plf_tr, X_plf_te, Y_plf_tr_pts, Y_plf_te_pts = preprocess(df_plf_tr, df_plf_te, TARGET=TARGETS[0])
print("X_reg_tr", X_reg_tr.shape, "X_reg_te", X_reg_te.shape)
print("X_plf_tr", X_plf_tr.shape, "X_plf_te", X_plf_te.shape)
X_reg_tr (5148, 25) X_reg_te (2145, 25) X_plf_tr (539, 25) X_plf_te (214, 25)
# Cell 5 ── Build full multi-Y and drop all TARGET columns from X
# Whole regular-season multi-Y
Y_reg_tr = df_reg_tr[TARGETS].apply(pd.to_numeric, errors='coerce')
Y_reg_te = df_reg_te[TARGETS].apply(pd.to_numeric, errors='coerce')
for t in TARGETS:
X_reg_tr = X_reg_tr.drop(columns=[t], errors='ignore')
X_reg_te = X_reg_te.drop(columns=[t], errors='ignore')
# Whole playoffs multi-Y
Y_plf_tr = df_plf_tr[TARGETS].apply(pd.to_numeric, errors='coerce')
Y_plf_te = df_plf_te[TARGETS].apply(pd.to_numeric, errors='coerce')
for t in TARGETS:
X_plf_tr = X_plf_tr.drop(columns=[t], errors='ignore')
X_plf_te = X_plf_te.drop(columns=[t], errors='ignore')
# Cell 8 ── LAL team split (regular & playoffs)
# regular LAL 70/30 random
from sklearn.model_selection import train_test_split
df_LAL_r = pd.concat([df_reg_tr, df_reg_te]).query("team=='LAL'").reset_index(drop=True)
df_LAL_r_tr, df_LAL_r_te = train_test_split(df_LAL_r, test_size=0.3, random_state=42)
# playoffs LAL 70/30 random
df_LAL_p = pd.concat([df_plf_tr, df_plf_te]).query("team=='LAL'").reset_index(drop=True)
df_LAL_p_tr, df_LAL_p_te = train_test_split(df_LAL_p, test_size=0.3, random_state=42)
# preprocess & multi-Y
X_LAL_r_tr, X_LAL_r_te, _, _ = preprocess(df_LAL_r_tr, df_LAL_r_te, TARGET=TARGETS[0])
Y_LAL_r_tr = df_LAL_r_tr[TARGETS].apply(pd.to_numeric, errors='coerce')
Y_LAL_r_te = df_LAL_r_te[TARGETS].apply(pd.to_numeric, errors='coerce')
for t in TARGETS:
X_LAL_r_tr = X_LAL_r_tr.drop(columns=[t], errors='ignore')
X_LAL_r_te = X_LAL_r_te.drop(columns=[t], errors='ignore')
X_LAL_p_tr, X_LAL_p_te, _, _ = preprocess(df_LAL_p_tr, df_LAL_p_te, TARGET=TARGETS[0])
Y_LAL_p_tr = df_LAL_p_tr[TARGETS].apply(pd.to_numeric, errors='coerce')
Y_LAL_p_te = df_LAL_p_te[TARGETS].apply(pd.to_numeric, errors='coerce')
for t in TARGETS:
X_LAL_p_tr = X_LAL_p_tr.drop(columns=[t], errors='ignore')
X_LAL_p_te = X_LAL_p_te.drop(columns=[t], errors='ignore')
print("Shapes (X_tr, Y_tr, X_te, Y_te):")
for name, (Xtr, Ytr, Xte, Yte) in [
("Reg-Whole", (X_reg_tr, Y_reg_tr, X_reg_te, Y_reg_te)),
("Plf-Whole", (X_plf_tr, Y_plf_tr, X_plf_te, Y_plf_te)),
("Reg-LAL", (X_LAL_r_tr, Y_LAL_r_tr, X_LAL_r_te, Y_LAL_r_te)),
("Plf-LAL", (X_LAL_p_tr, Y_LAL_p_tr, X_LAL_p_te, Y_LAL_p_te)),
]:
print(f"{name:8s}", Xtr.shape, Ytr.shape, Xte.shape, Yte.shape)
Shapes (X_tr, Y_tr, X_te, Y_te): Reg-Whole (5148, 20) (5148, 6) (2145, 20) (2145, 6) Plf-Whole (539, 20) (539, 6) (214, 20) (214, 6) Reg-LAL (155, 20) (155, 6) (67, 20) (67, 6) Plf-LAL (18, 20) (18, 6) (9, 20) (9, 6)
# Cell 3 ── Define your six core numeric targets and scenario list (no R10)
TARGETS = ["pts","reb","stl","tov","fg3a","fga"]
# scenarios ← only Whole and LAL, for Regular and Playoffs
scenarios = [
("Reg-Whole", df_reg_tr, df_reg_te),
("Plf-Whole", df_plf_tr, df_plf_te),
("Reg-LAL", *train_test_split(pd.concat([df_reg_tr, df_reg_te])
.query("team=='LAL'"),
test_size=0.3, random_state=42)),
("Plf-LAL", *train_test_split(pd.concat([df_plf_tr, df_plf_te])
.query("team=='LAL'"),
test_size=0.3, random_state=42)),
]
# Cell 4 ── Preprocess function
import pandas as pd
def preprocess(df_tr, df_te, TARGET):
X_tr = df_tr.drop(columns=[TARGET])
X_te = df_te.drop(columns=[TARGET])
y_tr = df_tr[TARGET]
y_te = df_te[TARGET]
for c in ['regular_season_index','playoff_index','player_id','player','team_id','team']:
X_tr = X_tr.drop(columns=[c], errors='ignore')
X_te = X_te.drop(columns=[c], errors='ignore')
for c in X_tr.select_dtypes(include=['object']).columns:
X_tr[c] = pd.to_numeric(X_tr[c], errors='coerce')
X_te[c] = pd.to_numeric(X_te[c], errors='coerce')
X_tr = pd.get_dummies(X_tr, drop_first=True)
X_te = pd.get_dummies(X_te, drop_first=True).reindex(columns=X_tr.columns, fill_value=0)
return X_tr, X_te, y_tr, y_te
# Cell 5 ── Build feature / multi-Y matrices for each scenario
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
results = []
for name, dtr, dte in scenarios:
# preprocess all TARGETS at once
# start by doing pts then drop all TARGET columns from X
X_tr, X_te, _, _ = preprocess(dtr, dte, TARGET=TARGETS[0])
for t in TARGETS:
X_tr = X_tr.drop(columns=[t], errors='ignore')
X_te = X_te.drop(columns=[t], errors='ignore')
Y_tr = dtr[TARGETS].apply(pd.to_numeric, errors='coerce')
Y_te = dte[TARGETS].apply(pd.to_numeric, errors='coerce')
# train & predict multi-output
m = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
m.fit(X_tr, Y_tr)
Y_pr = m.predict(X_te)
# evaluate & plot
n = len(TARGETS)
fig, axes = plt.subplots(2, n, figsize=(4*n, 8))
fig.suptitle(name)
for i, stat in enumerate(TARGETS):
y_true = Y_te[stat].to_numpy()
y_hat = Y_pr[:, i]
# scatter
ax = axes[0, i]
ax.scatter(y_true, y_hat, s=15)
mn, mx = y_true.min(), y_true.max()
ax.plot([mn,mx],[mn,mx],"--")
ax.set_title(stat)
ax.set_xlabel("Actual"); ax.set_ylabel("Predicted")
# metrics bar
mse = mean_squared_error(y_true, y_hat)
mae = mean_absolute_error(y_true, y_hat)
r2 = r2_score(y_true, y_hat)
ax2 = axes[1, i]
ax2.bar(["MSE","MAE","R²"], [mse, mae, abs(r2)])
ax2.set_yscale("log")
for j, v in enumerate([mse, mae, r2]):
ax2.text(j, abs(v)*1.1, f"{v:.2f}", ha='center')
results.append({"Scenario":name, "Stat":stat, "MAE":mae, "R²":r2})
plt.tight_layout(rect=[0,0.03,1,0.95])
plt.show()
# summary table
import pandas as pd
df_res = pd.DataFrame(results)
print(df_res.pivot(index="Scenario", columns="Stat", values=["MAE","R²"]))
MAE \
Stat fg3a fga pts reb stl tov
Scenario
Plf-LAL 40.632222 116.510000 149.147778 64.412222 10.538889 14.696667
Plf-Whole 2.288505 649.697103 947.783738 1.689579 1.884626 15.672570
Reg-LAL 10.390448 30.702090 19.914925 15.928358 6.322985 9.510149
Reg-Whole 0.875274 450.904120 680.817953 0.443408 1.359520 8.698237
R²
Stat fg3a fga pts reb stl tov
Scenario
Plf-LAL 0.736449 0.779868 0.807360 0.703269 0.638002 0.843946
Plf-Whole 0.988650 -64.640755 -86.233810 0.994945 0.783666 -0.946914
Reg-LAL 0.984303 0.978229 0.994966 0.961746 0.876785 0.920367
Reg-Whole 0.997265 -37.670530 -54.540892 0.999440 0.908254 0.445678