evaluation metrics

ShaaniBel 2025-09-28 23:29:07 -04:00
parent a79c30f0d3
commit 5d1e5e75c2
2 changed files with 140 additions and 4 deletions

View file

@@ -72,7 +72,7 @@ class LinearRegression:
    def score(self, x: pd.DataFrame, y: pd.Series) -> float:
        '''
        This method is used to calculate the coefficient of determination to assess the goodness
-       of fit from the linear regression model
+       of fit from the linear regression model (R^2)
        '''
        y_pred = self.predict(x)  # predicted y values from the model's predict method
        y = pd.Series(y).astype('float64')
@@ -82,6 +82,33 @@ class LinearRegression:
        # total sum of squares, uses the difference between the y values and the y mean value
        return 1.0 - ss_res / ss_tot

    def mae(self, x: pd.DataFrame, y: pd.Series) -> float:
        """
        Mean Absolute Error: mean of |y_true - y_pred|, in the same units as the target.
        """
        y_hat = self.predict(x)
        y_true = np.asarray(y, dtype=np.float64)
        return float(np.mean(np.abs(y_true - y_hat)))
    def mse(self, x: pd.DataFrame, y: pd.Series) -> float:
        '''
        Mean Squared Error: mean of (y_true - y_pred)^2; penalizes large errors more heavily.
        '''
        y_hat = self.predict(x)
        y_true = pd.Series(y).astype('float64')
        return float(((y_true - y_hat) ** 2).mean())
    def rmse(self, x: pd.DataFrame, y: pd.Series) -> float:
        '''
        Root Mean Squared Error
        Square root of the MSE, in the same units as the target variable.
        More interpretable than MSE while still penalizing larger errors.
        Lower values indicate better performance.
        '''
        return float(self.mse(x, y) ** 0.5)  # reuse mse() rather than duplicating the computation
if __name__ == "__main__":
    df = pd.read_csv('parkinsons_updrs.data', dtype=str)
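As a quick reference, a minimal usage sketch of the new regression metrics; the fit() call and the train/test variable names are illustrative assumptions, not part of this commit:

# illustrative only: assumes a fitted model and an existing train/test split
model = LinearRegression()
model.fit(x_train, y_train)                    # hypothetical fit() and split names
print("R^2 :", model.score(x_test, y_test))    # goodness of fit, 1.0 is perfect
print("MAE :", model.mae(x_test, y_test))      # average absolute error, same units as the target
print("MSE :", model.mse(x_test, y_test))      # squared error, penalizes large mistakes more
print("RMSE:", model.rmse(x_test, y_test))     # sqrt of MSE, back in the target's units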

View file

@@ -1,5 +1,8 @@
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score
class LogisticRegression:
    '''
@@ -33,10 +36,8 @@ class LogisticRegression:
    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
        """
        Splits df into X and y using the target column, then standardises the features,
        adds the bias term, and initializes the weights/coefficients.
        """
        if target_col not in df.columns:
            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")
@@ -111,6 +112,115 @@ class LogisticRegression:
        y_true = np.asarray(y).astype(int)
        return np.mean(y_pred == y_true)  # fraction of predictions that match the true labels

    def confusion_matrix(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series,
                         normalize: bool = False) -> np.ndarray:
        """
        Confusion Matrix
        Returns a 2x2 matrix: [[TN, FP], [FN, TP]]
        If normalize is True, each row is divided by its total so entries are per-class rates.
        """
        y_pred = self.predict(x)
        y_true = np.asarray(y).astype(int)
        cm = confusion_matrix(y_true, y_pred)
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        return cm
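For reference, the four counts behind that 2x2 layout can also be tallied directly with NumPy; a minimal sketch, equivalent to the sklearn call for binary 0/1 labels:

# illustrative sketch: hand-rolled counts, given arrays y_true and y_pred of 0/1 labels
tn = np.sum((y_true == 0) & (y_pred == 0))   # true negatives
fp = np.sum((y_true == 0) & (y_pred == 1))   # false positives
fn = np.sum((y_true == 1) & (y_pred == 0))   # false negatives
tp = np.sum((y_true == 1) & (y_pred == 1))   # true positives
cm_manual = np.array([[tn, fp], [fn, tp]])   # same layout as sklearn's confusion_matrix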
    def plot_confusion_matrix(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series,
                              normalize: bool = False, title: str = "Confusion Matrix") -> None:
        """
        Plot the confusion matrix as a heatmap.
        """
        import seaborn as sns  # local import so seaborn is only required when plotting
        cm = self.confusion_matrix(x, y, normalize)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',
                    cmap='Blues', cbar=False,
                    xticklabels=['Predicted 0', 'Predicted 1'],
                    yticklabels=['Actual 0', 'Actual 1'])
        plt.title(title)
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()
    def precision(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """
        Precision = TP / (TP + FP)
        Measures how many of the predicted positives are actually positive.
        """
        cm = self.confusion_matrix(x, y)
        tp, fp = cm[1, 1], cm[0, 1]
        if tp + fp == 0:
            return 0.0  # avoid division by zero when nothing is predicted positive
        return tp / (tp + fp)
    def recall(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """
        Recall = TP / (TP + FN)
        Ratio of true positives to all actual positives in the ground truth.
        """
        cm = self.confusion_matrix(x, y)
        tp, fn = cm[1, 1], cm[1, 0]
        if tp + fn == 0:
            return 0.0  # avoid division by zero when there are no actual positives
        return tp / (tp + fn)
    def f1_score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """
        F1-Score = 2 * (Precision * Recall) / (Precision + Recall)
        Harmonic mean of precision and recall.
        """
        prec = self.precision(x, y)
        rec = self.recall(x, y)
        if prec + rec == 0:
            return 0.0  # avoid division by zero
        return 2 * (prec * rec) / (prec + rec)
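A small worked example with made-up counts (TP=8, FP=2, FN=4) to sanity-check the three formulas above:

# hypothetical counts, not real results from this model
tp, fp, fn = 8, 2, 4
precision = tp / (tp + fp)                              # 8 / 10 = 0.80
recall = tp / (tp + fn)                                 # 8 / 12 = 0.667
f1 = 2 * (precision * recall) / (precision + recall)    # about 0.727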
    '''
    def predict_proba(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
        """
        Predict probability scores instead of binary labels
        """
        if isinstance(x, pd.DataFrame):
            x = x.values
        if self.w is None:
            raise ValueError("Model not fitted yet")
        # Add bias term if needed
        if x.shape[1] == len(self.w) - 1:
            x = np.column_stack([np.ones(x.shape[0]), x])
        return self.sigmoid(x @ self.w)

    def au_roc(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """
        Measures the model's ability to distinguish between classes (area under the ROC curve)
        """
        # make sure self.sigmoid outputs floats between 0 and 1
        y_true = np.asarray(y).astype(int)
        y_proba = self.predict_proba(x)
        return roc_auc_score(y_true, y_proba)
    '''
    def classification_report(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> dict:
        """
        Comprehensive classification report: accuracy, precision, recall and F1 in a single dict.
        """
        return {
            'accuracy': self.score(x, y),
            'precision': self.precision(x, y),
            'recall': self.recall(x, y),
            'f1_score': self.f1_score(x, y),
            # 'au_roc': self.au_roc(x, y)  # re-enable once predict_proba/au_roc are uncommented
        }
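And a minimal usage sketch of the classification metrics, assuming a fitted model and a held-out split; the fit() call and the variable names are illustrative, not part of this commit:

# illustrative only: assumes a fitted model and an existing test split
clf = LogisticRegression()
clf.fit(x_train, y_train)                                   # hypothetical fit() and split names
print(clf.classification_report(x_test, y_test))            # accuracy, precision, recall, f1_score
clf.plot_confusion_matrix(x_test, y_test, normalize=True)   # heatmap of row-normalized counts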
if __name__ == "__main__":
columns = [
'ID', 'Diagnosis',
@ -131,7 +241,6 @@ if __name__ == "__main__":
    df = df.drop_duplicates()
    # check data types --> everything is good
    # print(df.dtypes)
    '''
# ____________________________________________________________________________________
# HANDLE OUTLIERS AND INCONSISTENCIES