From be12360f9a670105ae3ac46898c0222d197e8a53 Mon Sep 17 00:00:00 2001
From: Batuhan Berk Başoğlu
Date: Mon, 29 Sep 2025 22:32:43 -0400
Subject: [PATCH] Added evaluation metrics (MSE/MAE/RMSE, precision/recall/F1)
 to the regression and classification models.

---
 linear-regression-parkinsons.py               |  26 ++---
 logistic-regression-wdbc.py                   |   7 +-
 ...-batch-sgd-linear-regression-parkinsons.py |   7 +-
 mini-batch-sgd-logistic-regression-wdbc.py    | 107 +++++++++++++++++-
 4 files changed, 127 insertions(+), 20 deletions(-)

diff --git a/linear-regression-parkinsons.py b/linear-regression-parkinsons.py
index 08dc105..83e2634 100644
--- a/linear-regression-parkinsons.py
+++ b/linear-regression-parkinsons.py
@@ -6,8 +6,9 @@ class LinearRegression:
     Constructor for the linear regression with analytical solution. It uses bias.
     It also initializes the weight, mean and standard deviation.
     '''
-    def __init__(self, add_bias): # add degree as value for the polynomial features
+    def __init__(self, add_bias, verbose): # add degree as value for the polynomial features
         self.add_bias = add_bias # bias to prepend a column of ones (the intercept term)
+        self.verbose = verbose # when True, print the evaluation metrics after fitting
         #self.degree = degree # degree for polynomial expansion (non-linear base)
         self.w = None # weight/coefficient
         self.mean = None # used for standardisation
@@ -67,6 +68,12 @@ class LinearRegression:
             w_np.ravel(), # flattens the array into 1-D array
             index=x.columns
         )
+
+        if self.verbose:
+            mse = self.mse(x, y)
+            mae = self.mae(x, y)
+            rmse = self.rmse(x, y)
+            print(f"MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f}")
 
         return self
 
@@ -117,17 +124,6 @@ class LinearRegression:
         y_true = pd.Series(y).astype('float64')
         return (((y_true - y_hat) ** 2).mean()) ** 0.5
 
-    def regression_report(self, x: pd.DataFrame, y: pd.Series) -> dict:
-        """
-        Comprehensive classification report
-        """
-        return {
-            'R^2': self.score(x, y),
-            'MAE': self.mae(x, y),
-            'MSE': self.mse(x, y),
-            'RMSE': self.rmse(x, y)
-        }
-
 if __name__ == "__main__":
     df = pd.read_csv('parkinsons_updrs.data', dtype=str)
@@ -211,7 +207,7 @@ if __name__ == "__main__":
     df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)]
     df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)]
 
-    print(f"Rows after sanity checks: {len(df)}")
+    print(f"Rows after sanity checks: {len(df)}\n")
 
     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."
@@ -228,8 +224,8 @@ if __name__ == "__main__":
     y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]
 
     # training of the model
-    model = LinearRegression(add_bias=True)
-    #model = LinearRegression(add_bias=True, degree=2) # using polynomial degree for non-linear base calculation.
+    model = LinearRegression(add_bias=True, verbose=True)
+    #model = LinearRegression(add_bias=True, verbose=True, degree=2) # using polynomial degree for non-linear base calculation.
     model.fit(x_train, y_train)
 
     # evaluation of the model
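For reference, the verbose block added above prints the three standard regression errors. A minimal, self-contained sketch of those formulas (illustration only, not part of the patch; y_true and y_pred are hypothetical arrays), with RMSE being the square root of MSE:

    import numpy as np

    # hypothetical ground truth and model predictions
    y_true = np.array([10.0, 12.5, 9.0, 14.0])
    y_pred = np.array([11.0, 12.0, 8.5, 15.5])

    mse = np.mean((y_true - y_pred) ** 2)    # mean squared error
    mae = np.mean(np.abs(y_true - y_pred))   # mean absolute error
    rmse = np.sqrt(mse)                      # root mean squared error = sqrt(MSE)

    print(f"MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f}")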
diff --git a/logistic-regression-wdbc.py b/logistic-regression-wdbc.py
index c78f777..a6843ae 100644
--- a/logistic-regression-wdbc.py
+++ b/logistic-regression-wdbc.py
@@ -89,7 +89,11 @@ class LogisticRegression:
 
             # if verbose, it shows the loss every 100 iterations and displays it
             if self.verbose and i % 100 == 0:
-                print(f"Iter {i:4d} – loss: {loss:.6f}")
+                precision = self.precision(self.x, self.y)
+                recall = self.recall(self.x, self.y)
+                f1_score = self.f1_score(self.x, self.y)
+                #au_roc = self.au_roc(self.x, self.y)
+                print(f"Iter {i:4d} – loss: {loss:.6f} | precision: {precision:.6f} | recall: {recall:.6f} | f1_score: {f1_score:.6f}")
 
             # tests whether the absolute change in loss is smaller than the tolerance
             if i > 1 and abs(self.loss[-2] - loss) < self.tol:
@@ -220,7 +224,6 @@ class LogisticRegression:
         Comprehensive classification report
         """
         return {
-            'accuracy': self.score(x, y),
             'precision': self.precision(x, y),
             'recall': self.recall(x, y),
             'f1_score': self.f1_score(x, y),
diff --git a/mini-batch-sgd-linear-regression-parkinsons.py b/mini-batch-sgd-linear-regression-parkinsons.py
index 845628a..f022d7d 100644
--- a/mini-batch-sgd-linear-regression-parkinsons.py
+++ b/mini-batch-sgd-linear-regression-parkinsons.py
@@ -95,7 +95,10 @@ class LinearRegression:
             if self.verbose and epoch % 100 == 0:
                 y_full_pred = x.dot(w_np)
                 mse = ((y_np - y_full_pred) ** 2).mean()
-                print(f"Iter {epoch:5d} | MSE: {mse:.6f}")
+                mae = float(np.mean(np.abs(y_np - y_full_pred)))
+                rmse = (((y_np - y_full_pred) ** 2).mean()) ** 0.5
+                print(f"Iter {epoch:5d} | MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f}")
+
         self.w = pd.Series(w_np, index=x.columns) # store weights back as a pandas series
 
         return self
@@ -206,7 +209,7 @@ if __name__ == "__main__":
     df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)]
     df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)]
 
-    print(f"Rows after sanity checks: {len(df)}")
+    print(f"Rows after sanity checks: {len(df)}\n")
 
     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."
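For reference, the precision/recall/F1 values logged every 100 iterations follow the usual confusion-matrix definitions. A minimal, self-contained cross-check against scikit-learn's reference implementations (illustration only, not part of the patch; the label arrays are hypothetical):

    import numpy as np
    from sklearn.metrics import precision_score, recall_score, f1_score

    y_true = np.array([1, 0, 1, 1, 0, 1, 0, 0])  # hypothetical ground truth
    y_pred = np.array([1, 0, 1, 0, 0, 1, 1, 0])  # hypothetical predictions

    # manual counts behind the metrics
    tp = np.sum((y_pred == 1) & (y_true == 1))
    fp = np.sum((y_pred == 1) & (y_true == 0))
    fn = np.sum((y_pred == 0) & (y_true == 1))

    precision = tp / (tp + fp)                           # TP / (TP + FP)
    recall = tp / (tp + fn)                              # TP / (TP + FN)
    f1 = 2 * precision * recall / (precision + recall)   # harmonic mean

    assert np.isclose(precision, precision_score(y_true, y_pred))
    assert np.isclose(recall, recall_score(y_true, y_pred))
    assert np.isclose(f1, f1_score(y_true, y_pred))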
diff --git a/mini-batch-sgd-logistic-regression-wdbc.py b/mini-batch-sgd-logistic-regression-wdbc.py
index 1fbe6ce..3009821 100644
--- a/mini-batch-sgd-logistic-regression-wdbc.py
+++ b/mini-batch-sgd-logistic-regression-wdbc.py
@@ -1,5 +1,8 @@
 import numpy as np
 import pandas as pd
+from matplotlib import pyplot as plt
+import seaborn as sns
+from sklearn.metrics import confusion_matrix, roc_auc_score
 
 class LogisticRegression:
     '''
@@ -105,7 +107,11 @@ class LogisticRegression:
 
             # if verbose, it shows the loss every 100 iterations and displays it
             if self.verbose and epoch % 100 == 0:
-                print(f"Iter {epoch:4d} – loss: {loss:.6f}")
+                precision = self.precision(self.x, self.y)
+                recall = self.recall(self.x, self.y)
+                f1_score = self.f1_score(self.x, self.y)
+                #au_roc = self.au_roc(self.x, self.y)
+                print(f"Iter {epoch:4d} – loss: {loss:.6f} | precision: {precision:.6f} | recall: {recall:.6f} | f1_score: {f1_score:.6f}")
 
             # tests whether the absolute change in loss is smaller than the tolerance
             if epoch > 1 and abs(self.loss[-2] - loss) < self.tol:
@@ -134,6 +140,104 @@ class LogisticRegression:
         y_true = np.asarray(y).astype(int)
         return np.mean(y_pred == y_true) # mean is calculated if Y values match
 
+    def confusion_matrix(self, x: pd.DataFrame, y: pd.Series,
+                         normalize: bool = False) -> np.ndarray:
+        """
+        Confusion Matrix
+        Returns a 2x2 matrix: [[TN, FP], [FN, TP]]
+        """
+        y_pred = self.predict(x)
+        y_true = np.asarray(y).astype(int)
+
+        cm = confusion_matrix(y_true, y_pred)
+
+        if normalize:
+            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+
+        return cm
+
+    def plot_confusion_matrix(self, x: pd.DataFrame, y: pd.Series,
+                              normalize: bool = False, title: str = "Confusion Matrix") -> None:
+        """
+        Plot confusion matrix as a heatmap
+        """
+        cm = self.confusion_matrix(x, y, normalize)
+
+        plt.figure(figsize=(8, 6))
+        sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',
+                    cmap='Blues', cbar=False,
+                    xticklabels=['Predicted 0', 'Predicted 1'],
+                    yticklabels=['Actual 0', 'Actual 1'])
+        plt.title(title)
+        plt.ylabel('True Label')
+        plt.xlabel('Predicted Label')
+        plt.show()
+
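For reference, scikit-learn's confusion_matrix returns counts as [[TN, FP], [FN, TP]] for binary 0/1 labels, and the normalize branch above divides each row by its sum. A minimal, self-contained sketch (illustration only, not part of the patch; the label arrays are hypothetical):

    import numpy as np
    from sklearn.metrics import confusion_matrix

    y_true = np.array([0, 0, 1, 1, 1, 0])  # hypothetical ground truth
    y_pred = np.array([0, 1, 1, 1, 0, 0])  # hypothetical predictions

    cm = confusion_matrix(y_true, y_pred)
    print(cm)        # [[2 1]
                     #  [1 2]]  -> [[TN, FP], [FN, TP]]

    # row-wise normalization: each row sums to 1
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm_norm)   # [[2/3, 1/3],
                     #  [1/3, 2/3]]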
+    def precision(self, x: pd.DataFrame, y: pd.Series) -> float:
+        """
+        Precision = TP / (TP + FP)
+        Measures how many of the predicted positives are actually positive
+        """
+        cm = self.confusion_matrix(x, y)
+        tp, fp = cm[1, 1], cm[0, 1]
+
+        if tp + fp == 0:
+            return 0.0 # Avoid division by zero
+
+        return tp / (tp + fp)
+
+    def recall(self, x: pd.DataFrame, y: pd.Series) -> float:
+        """
+        Recall = TP / (TP + FN)
+        Ratio of true positives to all the positives in the ground truth
+        """
+        cm = self.confusion_matrix(x, y)
+        tp, fn = cm[1, 1], cm[1, 0]
+
+        if tp + fn == 0:
+            return 0.0 # Avoid division by zero
+
+        return tp / (tp + fn)
+
+    def f1_score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        F1-Score = 2 * ((Precision * Recall) / (Precision + Recall))
+        """
+        prec = self.precision(x, y)
+        rec = self.recall(x, y)
+
+        if prec + rec == 0:
+            return 0.0 # Avoid division by zero
+
+        return 2 * ((prec * rec) / (prec + rec))
+
+    '''
+    def predict_proba(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
+        """
+        Predict probability scores instead of binary labels
+        """
+        if isinstance(x, pd.DataFrame):
+            x = x.values
+
+        if self.w is None:
+            raise ValueError("Model not fitted yet")
+
+        # Add bias term if needed
+        if x.shape[1] == len(self.w) - 1:
+            x = np.column_stack([np.ones(x.shape[0]), x])
+
+        return self.sigmoid(x @ self.w)
+
+    def au_roc(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        Measures the model's ability to distinguish between classes
+        """
+        # make sure self.sigmoid outputs floats between 0 and 1
+        y_true = np.asarray(y).astype(int)
+        y_proba = self.predict_proba(x)
+
+        return roc_auc_score(y_true, y_proba)
+    '''
+
 if __name__ == "__main__":
     columns = [
         'ID', 'Diagnosis',