From 5d1e5e75c250d2d7550b2eff03c33a2f5d1cfbf5 Mon Sep 17 00:00:00 2001
From: ShaaniBel
Date: Sun, 28 Sep 2025 23:29:07 -0400
Subject: [PATCH] evaluation metrics

---
 linear-regression-parkinsons.py |  29 +++++++-
 logistic-regression-wdbc.py     | 115 +++++++++++++++++++++++++++++++-
 2 files changed, 140 insertions(+), 4 deletions(-)

diff --git a/linear-regression-parkinsons.py b/linear-regression-parkinsons.py
index f16f045..e167b01 100644
--- a/linear-regression-parkinsons.py
+++ b/linear-regression-parkinsons.py
@@ -72,7 +72,7 @@ class LinearRegression:
     def score(self, x: pd.DataFrame, y: pd.Series) -> float:
         '''
         This method is used to calculate coefficient of determination to assess the goodness
-        of fit from the linear regression model
+        of fit of the linear regression model (R^2)
         '''
         y_pred = self.predict(x) # predicts Y value with X predict method.
         y = pd.Series(y).astype('float64')
@@ -82,6 +82,33 @@ class LinearRegression:
         # total sum of squares, uses the difference between Y values and Y mean value
         return 1.0 - ss_res / ss_tot
 
+    def mae(self, x: pd.DataFrame, y: pd.Series) -> float:
+        '''
+        Mean Absolute Error: average magnitude of the residuals, in the same
+        units as the target; less sensitive to outliers than MSE
+        '''
+        y_hat = self.predict(x)
+        y_true = np.asarray(y, dtype=np.float64)
+        return float(np.mean(np.abs(y_true - y_hat)))
+
+    def mse(self, x: pd.DataFrame, y: pd.Series) -> float:
+        '''
+        Mean Squared Error: average of the squared residuals; penalizes
+        large errors more heavily than MAE
+        '''
+        y_hat = self.predict(x)
+        y_true = np.asarray(y, dtype=np.float64)
+        return float(np.mean((y_true - y_hat) ** 2))
+
+    def rmse(self, x: pd.DataFrame, y: pd.Series) -> float:
+        '''
+        Root Mean Squared Error: square root of MSE, so it is in the same
+        units as the target variable and easier to interpret, while still
+        penalizing larger errors. Lower values indicate better performance.
+        '''
+        return self.mse(x, y) ** 0.5
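+
+
+# A minimal sanity check added as an editor's sketch: the numbers below are
+# hypothetical and independent of the Parkinson's data or any fitted model;
+# they just verify the arithmetic behind the three metrics above.
+def _regression_metrics_sanity_check() -> None:
+    y_true = np.array([1.0, 2.0, 3.0])
+    y_hat = np.array([2.0, 2.0, 5.0])
+    residuals = y_true - y_hat                # [-1, 0, -2]
+    mae = float(np.mean(np.abs(residuals)))   # (1 + 0 + 2) / 3 = 1.0
+    mse = float(np.mean(residuals ** 2))      # (1 + 0 + 4) / 3 ~= 1.667
+    rmse = mse ** 0.5                         # sqrt(5 / 3)     ~= 1.291
+    # RMSE never undercuts MAE; the gap widens as residuals become more uneven
+    assert mae <= rmse
+    print(f"MAE={mae:.3f}  MSE={mse:.3f}  RMSE={rmse:.3f}")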
+
 
 if __name__ == "__main__":
     df = pd.read_csv('parkinsons_updrs.data', dtype=str)
diff --git a/logistic-regression-wdbc.py b/logistic-regression-wdbc.py
index e47e32e..72fa4da 100644
--- a/logistic-regression-wdbc.py
+++ b/logistic-regression-wdbc.py
@@ -1,5 +1,8 @@
 import numpy as np
 import pandas as pd
+from matplotlib import pyplot as plt
+import seaborn as sns
+from sklearn.metrics import confusion_matrix, roc_auc_score
 
 
 class LogisticRegression:
     '''
@@ -33,10 +36,8 @@ class LogisticRegression:
     def prepare(self, df: pd.DataFrame, target_col: str) -> None:
         """
-        Preparation method splits df into x and y.
-
+        Defines X and y from the dataframe and target column, then standardises
+        the features, adds a bias column and initialises the weight vector.
         """
         if target_col not in df.columns:
             raise ValueError(f"Target column '{target_col}' not found in DataFrame.")
@@ -111,6 +112,115 @@ class LogisticRegression:
         y_true = np.asarray(y).astype(int)
         return np.mean(y_pred == y_true)  # mean is calculated if Y values match
 
+    def confusion_matrix(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series,
+                         normalize: bool = False) -> np.ndarray:
+        """
+        Confusion matrix as a 2x2 array [[TN, FP], [FN, TP]].
+        With normalize=True each row is scaled to sum to 1, giving per-class
+        rates instead of raw counts.
+        """
+        y_pred = self.predict(x)
+        y_true = np.asarray(y).astype(int)
+
+        # labels=[0, 1] keeps the matrix 2x2 even if one class is absent
+        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
+
+        if normalize:
+            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+
+        return cm
+
+    def plot_confusion_matrix(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series,
+                              normalize: bool = False, title: str = "Confusion Matrix") -> None:
+        """
+        Plot the confusion matrix as a heatmap (uses seaborn, imported above).
+        """
+        cm = self.confusion_matrix(x, y, normalize)
+
+        plt.figure(figsize=(8, 6))
+        sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',
+                    cmap='Blues', cbar=False,
+                    xticklabels=['Predicted 0', 'Predicted 1'],
+                    yticklabels=['Actual 0', 'Actual 1'])
+        plt.title(title)
+        plt.ylabel('True Label')
+        plt.xlabel('Predicted Label')
+        plt.show()
+
+    def precision(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        Precision = TP / (TP + FP)
+        Fraction of predicted positives that are actually positive.
+        """
+        cm = self.confusion_matrix(x, y)
+        tp, fp = cm[1, 1], cm[0, 1]
+
+        if tp + fp == 0:
+            return 0.0  # avoid division by zero
+
+        return tp / (tp + fp)
+
+    def recall(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        Recall = TP / (TP + FN)
+        Fraction of the actual positives that the model finds.
+        """
+        cm = self.confusion_matrix(x, y)
+        tp, fn = cm[1, 1], cm[1, 0]
+
+        if tp + fn == 0:
+            return 0.0  # avoid division by zero
+
+        return tp / (tp + fn)
+
+    def f1_score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        F1-Score = 2 * (Precision * Recall) / (Precision + Recall)
+        Harmonic mean of precision and recall.
+        """
+        prec = self.precision(x, y)
+        rec = self.recall(x, y)
+
+        if prec + rec == 0:
+            return 0.0  # avoid division by zero
+
+        return 2 * (prec * rec) / (prec + rec)
+
+    def predict_proba(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
+        """
+        Predict probability scores instead of binary labels.
+        """
+        if isinstance(x, pd.DataFrame):
+            x = x.values
+
+        if self.w is None:
+            raise ValueError("Model not fitted yet")
+
+        # Add the bias column if the caller passed raw features
+        if x.shape[1] == len(self.w) - 1:
+            x = np.column_stack([np.ones(x.shape[0]), x])
+
+        return self.sigmoid(x @ self.w)
+
+    def au_roc(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        Area under the ROC curve: the model's ability to rank positives above
+        negatives, computed from probability scores (sigmoid outputs in [0, 1])
+        rather than thresholded labels.
+        """
+        y_true = np.asarray(y).astype(int)
+        y_proba = self.predict_proba(x)
+
+        return roc_auc_score(y_true, y_proba)
+
+    def classification_report(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> dict:
+        """
+        Comprehensive classification report collecting the metrics above.
+        """
+        return {
+            'accuracy': self.score(x, y),
+            'precision': self.precision(x, y),
+            'recall': self.recall(x, y),
+            'f1_score': self.f1_score(x, y),
+            'au_roc': self.au_roc(x, y)
+        }
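+
+
+# A minimal sanity check added as an editor's sketch: the confusion-matrix
+# counts below are hypothetical, not WDBC results; they hand-verify the
+# precision/recall/F1 arithmetic used by the methods above.
+def _classification_metrics_sanity_check() -> None:
+    tn, fp, fn, tp = 50, 10, 5, 35                      # [[TN, FP], [FN, TP]]
+    precision = tp / (tp + fp)                          # 35 / 45 ~= 0.778
+    recall = tp / (tp + fn)                             # 35 / 40  = 0.875
+    f1 = 2 * precision * recall / (precision + recall)  # ~= 0.824
+    accuracy = (tp + tn) / (tn + fp + fn + tp)          # 85 / 100 = 0.850
+    print(f"precision={precision:.3f}  recall={recall:.3f}  "
+          f"f1={f1:.3f}  accuracy={accuracy:.3f}")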
+
 
 if __name__ == "__main__":
     columns = [
         'ID', 'Diagnosis',
@@ -131,7 +241,6 @@ if __name__ == "__main__":
     df = df.drop_duplicates()
     # check data types: --> everything is good
     # print(df.dtypes)
-    '''
     # ____________________________________________________________________________________
     # HANDLE OUTLIERS AND INCONSISTENCIES