Add evaluation metrics

ShaaniBel 2025-09-28 23:29:07 -04:00
parent a79c30f0d3
commit 5d1e5e75c2
2 changed files with 140 additions and 4 deletions


@@ -72,7 +72,7 @@ class LinearRegression:
    def score(self, x: pd.DataFrame, y: pd.Series) -> float:
        '''
        This method calculates the coefficient of determination to assess the goodness
        of fit of the linear regression model (R^2)
        '''
        y_pred = self.predict(x)  # predict Y values for X with the predict method
        y = pd.Series(y).astype('float64')
@@ -82,6 +82,33 @@ class LinearRegression:
        # total sum of squares, uses the difference between the Y values and the Y mean value
        return 1.0 - ss_res / ss_tot

    def mae(self, x: pd.DataFrame, y: pd.Series) -> float:
        """
        Mean Absolute Error
        """
        y_hat = self.predict(x)
        y_true = np.asarray(y, dtype=np.float64)
        return float(np.mean(np.abs(y_true - y_hat)))

    def mse(self, x: pd.DataFrame, y: pd.Series) -> float:
        '''
        Mean Squared Error
        '''
        y_hat = self.predict(x)
        y_true = pd.Series(y).astype('float64')
        return ((y_true - y_hat) ** 2).mean()

    def rmse(self, x: pd.DataFrame, y: pd.Series) -> float:
        '''
        Root Mean Squared Error
        Square root of MSE, in the same units as the target variable
        More interpretable than MSE while still penalizing larger errors
        Lower values indicate better performance
        '''
        y_hat = self.predict(x)
        y_true = pd.Series(y).astype('float64')
        return (((y_true - y_hat) ** 2).mean()) ** 0.5
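
# A minimal, self-contained sketch (illustration only, not part of the class above):
# computing the same metrics directly with numpy on tiny made-up arrays, to show how
# MAE, MSE, RMSE and R^2 relate to each other. `y_true` and `y_hat` are invented here.
def _regression_metrics_demo() -> None:
    y_true = np.array([3.0, 5.0, 2.5, 7.0])
    y_hat = np.array([2.5, 5.0, 4.0, 8.0])
    errors = y_true - y_hat                      # residuals
    mae = np.mean(np.abs(errors))                # (0.5 + 0.0 + 1.5 + 1.0) / 4 = 0.75
    mse = np.mean(errors ** 2)                   # (0.25 + 0.0 + 2.25 + 1.0) / 4 = 0.875
    rmse = mse ** 0.5                            # sqrt(MSE), back in the units of y
    r2 = 1.0 - np.sum(errors ** 2) / np.sum((y_true - y_true.mean()) ** 2)
    print(f"MAE={mae:.3f} MSE={mse:.3f} RMSE={rmse:.3f} R^2={r2:.3f}")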

if __name__ == "__main__":
    df = pd.read_csv('parkinsons_updrs.data', dtype=str)


@@ -1,5 +1,8 @@
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score

class LogisticRegression:
    '''
@@ -33,10 +36,8 @@ class LogisticRegression:
    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
        """
        Preparation method that splits df into x and y: it defines the X and Y values from the dataframe
        and target column, then standardises the features, adds the bias term, and initializes the weights/coefficients.
        """
        if target_col not in df.columns:
            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")
@@ -111,6 +112,115 @@ class LogisticRegression:
        y_true = np.asarray(y).astype(int)
        return np.mean(y_pred == y_true)  # fraction of predictions that match the true labels

    def confusion_matrix(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series,
                         normalize: bool = False) -> np.ndarray:
        """
        Confusion Matrix
        Returns a 2x2 matrix: [[TN, FP], [FN, TP]]
        """
        y_pred = self.predict(x)
        y_true = np.asarray(y).astype(int)
        cm = confusion_matrix(y_true, y_pred)
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        return cm

    def plot_confusion_matrix(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series,
                              normalize: bool = False, title: str = "Confusion Matrix") -> None:
        """
        Plot the confusion matrix as a heatmap
        """
        import seaborn as sns  # local import: seaborn is only needed for plotting

        cm = self.confusion_matrix(x, y, normalize)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',
                    cmap='Blues', cbar=False,
                    xticklabels=['Predicted 0', 'Predicted 1'],
                    yticklabels=['Actual 0', 'Actual 1'])
        plt.title(title)
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

    def precision(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """
        Precision = TP / (TP + FP)
        Measures how many of the predicted positives are actually positive
        """
        cm = self.confusion_matrix(x, y)
        tp, fp = cm[1, 1], cm[0, 1]
        if tp + fp == 0:
            return 0.0  # avoid division by zero when nothing was predicted positive
        return tp / (tp + fp)

    def recall(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """
        Recall = TP / (TP + FN)
        Ratio of true positives to all the positives in the ground truth
        """
        cm = self.confusion_matrix(x, y)
        tp, fn = cm[1, 1], cm[1, 0]
        if tp + fn == 0:
            return 0.0  # avoid division by zero
        return tp / (tp + fn)

    def f1_score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """
        F1-Score = 2 * ((Precision * Recall) / (Precision + Recall))
        """
        prec = self.precision(x, y)
        rec = self.recall(x, y)
        if prec + rec == 0:
            return 0.0  # avoid division by zero
        return 2 * ((prec * rec) / (prec + rec))
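
    # Worked micro-example (comment only, with made-up numbers): for a confusion matrix
    # [[TN, FP], [FN, TP]] = [[50, 10], [5, 35]]:
    #   precision = TP / (TP + FP) = 35 / 45 ≈ 0.778  (how many predicted positives are real)
    #   recall    = TP / (TP + FN) = 35 / 40 = 0.875  (how many real positives were found)
    #   f1        = 2 * (0.778 * 0.875) / (0.778 + 0.875) ≈ 0.824  (harmonic mean of the two)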

    '''
    def predict_proba(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
        """
        Predict probability scores instead of binary labels
        """
        if isinstance(x, pd.DataFrame):
            x = x.values
        if self.w is None:
            raise ValueError("Model not fitted yet")
        # Add bias term if needed
        if x.shape[1] == len(self.w) - 1:
            x = np.column_stack([np.ones(x.shape[0]), x])
        return self.sigmoid(x @ self.w)

    def au_roc(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """
        Measures the model's ability to distinguish between classes
        """
        # make sure self.sigmoid outputs floats between 0 and 1
        y_true = np.asarray(y).astype(int)
        y_proba = self.predict_proba(x)
        return roc_auc_score(y_true, y_proba)
    '''

    def classification_report(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> dict:
        """
        Comprehensive classification report
        """
        return {
            'accuracy': self.score(x, y),
            'precision': self.precision(x, y),
            'recall': self.recall(x, y),
            'f1_score': self.f1_score(x, y),
            # 'au_roc': self.au_roc(x, y)
        }
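
# Usage sketch (illustration only; the variable names and the training call below are
# hypothetical, since the fitting API of this class is not shown in this diff):
#
#   clf = LogisticRegression()
#   clf.prepare(df, target_col='Diagnosis')
#   ... fit the model on the training split ...
#   print(clf.classification_report(x_test, y_test))
#   clf.plot_confusion_matrix(x_test, y_test, normalize=True)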

if __name__ == "__main__":
    columns = [
        'ID', 'Diagnosis',
@@ -131,7 +241,6 @@ if __name__ == "__main__":
    df = df.drop_duplicates()
    # check data types: --> everything is good
    # print(df.dtypes)
    '''
    # ____________________________________________________________________________________
    # HANDLE OUTLIERS AND INCONSISTENCIES