From be12360f9a670105ae3ac46898c0222d197e8a53 Mon Sep 17 00:00:00 2001
From: Batuhan Berk Başoğlu
Date: Mon, 29 Sep 2025 22:32:43 -0400
Subject: [PATCH] Added evaluation metrics (MSE/MAE/RMSE, precision/recall/F1)
 to the regression and classification models.

---
 linear-regression-parkinsons.py               |  26 ++---
 logistic-regression-wdbc.py                   |   7 +-
 ...-batch-sgd-linear-regression-parkinsons.py |   7 +-
 mini-batch-sgd-logistic-regression-wdbc.py    | 107 +++++++++++++++++-
 4 files changed, 127 insertions(+), 20 deletions(-)

diff --git a/linear-regression-parkinsons.py b/linear-regression-parkinsons.py
index 08dc105..83e2634 100644
--- a/linear-regression-parkinsons.py
+++ b/linear-regression-parkinsons.py
@@ -6,8 +6,9 @@ class LinearRegression:
     Constructor for the linear regression with analytical solution. It uses bias.
     It also initializes the weight, mean and standard deviation.
     '''
-    def __init__(self, add_bias): # add degree as value for the polynomial features
+    def __init__(self, add_bias, verbose): # add degree as value for the polynomial features
         self.add_bias = add_bias # bias to prepend a column of ones (the intercept term)
+        self.verbose = verbose # when True, print the evaluation metrics after fitting
         #self.degree = degree # degree for polynomial expansion (non-linear base)
         self.w = None # weight/coefficient
         self.mean = None # used for standardisation
@@ -67,6 +68,12 @@ class LinearRegression:
             w_np.ravel(), # flattens the array into 1-D array
             index=x.columns
         )
+
+        if self.verbose:
+            mse = self.mse(x, y)
+            mae = self.mae(x, y)
+            rmse = self.rmse(x, y)
+            print(f"MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f}")
 
         return self
 
@@ -117,17 +124,6 @@ class LinearRegression:
         y_true = pd.Series(y).astype('float64')
         return (((y_true - y_hat) ** 2).mean()) ** 0.5
 
-    def regression_report(self, x: pd.DataFrame, y: pd.Series) -> dict:
-        """
-        Comprehensive classification report
-        """
-        return {
-            'R^2': self.score(x, y),
-            'MAE': self.mae(x, y),
-            'MSE': self.mse(x, y),
-            'RMSE': self.rmse(x, y)
-        }
-
 if __name__ == "__main__":
     df = pd.read_csv('parkinsons_updrs.data', dtype=str)
@@ -211,7 +207,7 @@ if __name__ == "__main__":
     df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)]
     df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)]
 
-    print(f"Rows after sanity checks: {len(df)}")
+    print(f"Rows after sanity checks: {len(df)}\n")
 
     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."
@@ -228,8 +224,8 @@ if __name__ == "__main__":
     y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]
 
     # training of the model
-    model = LinearRegression(add_bias=True)
-    #model = LinearRegression(add_bias=True, degree=2) # using polynomial degree for non-linear base calculation.
+    model = LinearRegression(add_bias=True, verbose=True)
+    #model = LinearRegression(add_bias=True, verbose=True, degree=2) # using polynomial degree for non-linear base calculation.
     model.fit(x_train, y_train)
 
     # evaluation of the model
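For reference, the verbose block added above prints the three standard regression errors. A minimal, self-contained sketch of those formulas (illustration only, not part of the patch; y_true and y_pred are hypothetical arrays), with RMSE being the square root of MSE:

    import numpy as np

    # hypothetical ground truth and model predictions
    y_true = np.array([10.0, 12.5, 9.0, 14.0])
    y_pred = np.array([11.0, 12.0, 8.5, 15.5])

    mse = np.mean((y_true - y_pred) ** 2)    # mean squared error
    mae = np.mean(np.abs(y_true - y_pred))   # mean absolute error
    rmse = np.sqrt(mse)                      # root mean squared error = sqrt(MSE)

    print(f"MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f}")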
diff --git a/logistic-regression-wdbc.py b/logistic-regression-wdbc.py
index c78f777..a6843ae 100644
--- a/logistic-regression-wdbc.py
+++ b/logistic-regression-wdbc.py
@@ -89,7 +89,11 @@ class LogisticRegression:
 
             # if verbose, it shows the loss every 100 iterations and displays it
             if self.verbose and i % 100 == 0:
-                print(f"Iter {i:4d} – loss: {loss:.6f}")
+                precision = self.precision(self.x, self.y)
+                recall = self.recall(self.x, self.y)
+                f1_score = self.f1_score(self.x, self.y)
+                #au_roc = self.au_roc(self.x, self.y)
+                print(f"Iter {i:4d} – loss: {loss:.6f} | precision: {precision:.6f} | recall: {recall:.6f} | f1_score: {f1_score:.6f}")
 
             # tests whether the absolute change in loss is smaller than the tolerance
             if i > 1 and abs(self.loss[-2] - loss) < self.tol:
@@ -220,7 +224,6 @@ class LogisticRegression:
         Comprehensive classification report
         """
         return {
-            'accuracy': self.score(x, y),
             'precision': self.precision(x, y),
             'recall': self.recall(x, y),
             'f1_score': self.f1_score(x, y),
diff --git a/mini-batch-sgd-linear-regression-parkinsons.py b/mini-batch-sgd-linear-regression-parkinsons.py
index 845628a..f022d7d 100644
--- a/mini-batch-sgd-linear-regression-parkinsons.py
+++ b/mini-batch-sgd-linear-regression-parkinsons.py
@@ -95,7 +95,10 @@ class LinearRegression:
             if self.verbose and epoch % 100 == 0:
                 y_full_pred = x.dot(w_np)
                 mse = ((y_np - y_full_pred) ** 2).mean()
-                print(f"Iter {epoch:5d} | MSE: {mse:.6f}")
+                mae = float(np.mean(np.abs(y_np - y_full_pred)))
+                rmse = (((y_np - y_full_pred) ** 2).mean()) ** 0.5
+                print(f"Iter {epoch:5d} | MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f}")
+
         self.w = pd.Series(w_np, index=x.columns) # store weights back as a pandas series
 
         return self
@@ -206,7 +209,7 @@ if __name__ == "__main__":
     df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)]
     df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)]
 
-    print(f"Rows after sanity checks: {len(df)}")
+    print(f"Rows after sanity checks: {len(df)}\n")
 
     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."
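For reference, the precision/recall/F1 values logged every 100 iterations follow the usual confusion-matrix definitions. A minimal, self-contained cross-check against scikit-learn's reference implementations (illustration only, not part of the patch; the label arrays are hypothetical):

    import numpy as np
    from sklearn.metrics import precision_score, recall_score, f1_score

    y_true = np.array([1, 0, 1, 1, 0, 1, 0, 0])  # hypothetical ground truth
    y_pred = np.array([1, 0, 1, 0, 0, 1, 1, 0])  # hypothetical predictions

    # manual counts behind the metrics
    tp = np.sum((y_pred == 1) & (y_true == 1))
    fp = np.sum((y_pred == 1) & (y_true == 0))
    fn = np.sum((y_pred == 0) & (y_true == 1))

    precision = tp / (tp + fp)                           # TP / (TP + FP)
    recall = tp / (tp + fn)                              # TP / (TP + FN)
    f1 = 2 * precision * recall / (precision + recall)   # harmonic mean

    assert np.isclose(precision, precision_score(y_true, y_pred))
    assert np.isclose(recall, recall_score(y_true, y_pred))
    assert np.isclose(f1, f1_score(y_true, y_pred))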
diff --git a/mini-batch-sgd-logistic-regression-wdbc.py b/mini-batch-sgd-logistic-regression-wdbc.py
index 1fbe6ce..3009821 100644
--- a/mini-batch-sgd-logistic-regression-wdbc.py
+++ b/mini-batch-sgd-logistic-regression-wdbc.py
@@ -1,5 +1,8 @@
 import numpy as np
 import pandas as pd
+from matplotlib import pyplot as plt
+import seaborn as sns
+from sklearn.metrics import confusion_matrix, roc_auc_score
 
 class LogisticRegression:
     '''
@@ -105,7 +107,11 @@ class LogisticRegression:
 
             # if verbose, it shows the loss every 100 iterations and displays it
             if self.verbose and epoch % 100 == 0:
-                print(f"Iter {epoch:4d} – loss: {loss:.6f}")
+                precision = self.precision(self.x, self.y)
+                recall = self.recall(self.x, self.y)
+                f1_score = self.f1_score(self.x, self.y)
+                #au_roc = self.au_roc(self.x, self.y)
+                print(f"Iter {epoch:4d} – loss: {loss:.6f} | precision: {precision:.6f} | recall: {recall:.6f} | f1_score: {f1_score:.6f}")
 
             # tests whether the absolute change in loss is smaller than the tolerance
             if epoch > 1 and abs(self.loss[-2] - loss) < self.tol:
@@ -134,6 +140,104 @@ class LogisticRegression:
         y_true = np.asarray(y).astype(int)
         return np.mean(y_pred == y_true) # mean is calculated if Y values match
 
+    def confusion_matrix(self, x: pd.DataFrame, y: pd.Series,
+                         normalize: bool = False) -> np.ndarray:
+        """
+        Confusion Matrix
+        Returns a 2x2 matrix: [[TN, FP], [FN, TP]]
+        """
+        y_pred = self.predict(x)
+        y_true = np.asarray(y).astype(int)
+
+        cm = confusion_matrix(y_true, y_pred)
+
+        if normalize:
+            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+
+        return cm
+
+    def plot_confusion_matrix(self, x: pd.DataFrame, y: pd.Series,
+                              normalize: bool = False, title: str = "Confusion Matrix") -> None:
+        """
+        Plot confusion matrix as a heatmap
+        """
+        cm = self.confusion_matrix(x, y, normalize)
+
+        plt.figure(figsize=(8, 6))
+        sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',
+                    cmap='Blues', cbar=False,
+                    xticklabels=['Predicted 0', 'Predicted 1'],
+                    yticklabels=['Actual 0', 'Actual 1'])
+        plt.title(title)
+        plt.ylabel('True Label')
+        plt.xlabel('Predicted Label')
+        plt.show()
+
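For reference, scikit-learn's confusion_matrix returns counts as [[TN, FP], [FN, TP]] for binary 0/1 labels, and the normalize branch above divides each row by its sum. A minimal, self-contained sketch (illustration only, not part of the patch; the label arrays are hypothetical):

    import numpy as np
    from sklearn.metrics import confusion_matrix

    y_true = np.array([0, 0, 1, 1, 1, 0])  # hypothetical ground truth
    y_pred = np.array([0, 1, 1, 1, 0, 0])  # hypothetical predictions

    cm = confusion_matrix(y_true, y_pred)
    print(cm)        # [[2 1]
                     #  [1 2]]  -> [[TN, FP], [FN, TP]]

    # row-wise normalization: each row sums to 1
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm_norm)   # [[2/3, 1/3],
                     #  [1/3, 2/3]]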
+    def precision(self, x: pd.DataFrame, y: pd.Series) -> float:
+        """
+        Precision = TP / (TP + FP)
+        Measures how many of the predicted positives are actually positive
+        """
+        cm = self.confusion_matrix(x, y)
+        tp, fp = cm[1, 1], cm[0, 1]
+
+        if tp + fp == 0:
+            return 0.0 # Avoid division by zero
+
+        return tp / (tp + fp)
+
+    def recall(self, x: pd.DataFrame, y: pd.Series) -> float:
+        """
+        Recall = TP / (TP + FN)
+        Ratio of true positives to all the positives in the ground truth
+        """
+        cm = self.confusion_matrix(x, y)
+        tp, fn = cm[1, 1], cm[1, 0]
+
+        if tp + fn == 0:
+            return 0.0 # Avoid division by zero
+
+        return tp / (tp + fn)
+
+    def f1_score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        F1-Score = 2 * ((Precision * Recall) / (Precision + Recall))
+        """
+        prec = self.precision(x, y)
+        rec = self.recall(x, y)
+
+        if prec + rec == 0:
+            return 0.0 # Avoid division by zero
+
+        return 2 * ((prec * rec) / (prec + rec))
+
+    '''
+    def predict_proba(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
+        """
+        Predict probability scores instead of binary labels
+        """
+        if isinstance(x, pd.DataFrame):
+            x = x.values
+
+        if self.w is None:
+            raise ValueError("Model not fitted yet")
+
+        # Add bias term if needed
+        if x.shape[1] == len(self.w) - 1:
+            x = np.column_stack([np.ones(x.shape[0]), x])
+
+        return self.sigmoid(x @ self.w)
+
+    def au_roc(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        Measures the model's ability to distinguish between classes
+        """
+        # make sure self.sigmoid outputs floats between 0 and 1
+        y_true = np.asarray(y).astype(int)
+        y_proba = self.predict_proba(x)
+
+        return roc_auc_score(y_true, y_proba)
+    '''
+
 if __name__ == "__main__":
     columns = [
         'ID', 'Diagnosis',