evaluation metrics
parent
a79c30f0d3
commit
5d1e5e75c2
2 changed files with 140 additions and 4 deletions
@@ -72,7 +72,7 @@ class LinearRegression:
     def score(self, x: pd.DataFrame, y: pd.Series) -> float:
         '''
         This method calculates the coefficient of determination to assess the goodness
-        of fit from the linear regression model
+        of fit from the linear regression model (R^2)
         '''
         y_pred = self.predict(x)  # predict y for x with the model's predict method
         y = pd.Series(y).astype('float64')

@@ -82,6 +82,33 @@ class LinearRegression:
         # total sum of squares, uses the difference between Y values and the Y mean value
         return 1.0 - ss_res / ss_tot
 
+    def mae(self, x: pd.DataFrame, y: pd.Series) -> float:
+        '''
+        Mean Absolute Error: average absolute difference between true and predicted values
+        '''
+        y_hat = self.predict(x)
+        y_true = np.asarray(y, dtype=np.float64)
+        return float(np.mean(np.abs(y_true - y_hat)))
+
+    def mse(self, x: pd.DataFrame, y: pd.Series) -> float:
+        '''
+        Mean Squared Error: average squared difference between true and predicted values
+        '''
+        y_hat = self.predict(x)
+        y_true = pd.Series(y).astype('float64')
+        return float(((y_true - y_hat) ** 2).mean())
+
+    def rmse(self, x: pd.DataFrame, y: pd.Series) -> float:
+        '''
+        Root Mean Squared Error
+        Square root of MSE, in the same units as the target variable
+        More interpretable than MSE while still penalising larger errors
+        Lower values indicate better performance
+        '''
+        y_hat = self.predict(x)
+        y_true = pd.Series(y).astype('float64')
+        return float((((y_true - y_hat) ** 2).mean()) ** 0.5)
+
 
 if __name__ == "__main__":
     df = pd.read_csv('parkinsons_updrs.data', dtype=str)

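For orientation, the new regression metrics might be exercised along these lines once a model has been trained; this is only a sketch, and the fit() call, the 'total_UPDRS' target column and the numeric read of the CSV are illustrative assumptions, not part of this commit:

import pandas as pd

# Hypothetical usage sketch. LinearRegression is the class defined in this module;
# the target column name and the fit() signature are assumed for illustration only.
df = pd.read_csv('parkinsons_updrs.data')
y = df['total_UPDRS']                      # assumed target column
x = df.drop(columns=['total_UPDRS'])

model = LinearRegression()
model.fit(x, y)                            # assumed training API

print('R^2 :', model.score(x, y))
print('MAE :', model.mae(x, y))
print('MSE :', model.mse(x, y))
print('RMSE:', model.rmse(x, y))
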
@@ -1,5 +1,8 @@
 import numpy as np
 import pandas as pd
+from matplotlib import pyplot as plt
+from sklearn.metrics import confusion_matrix, roc_auc_score
+import seaborn as sns  # needed by plot_confusion_matrix below
 
 
 class LogisticRegression:
     '''

@@ -33,10 +36,8 @@ class LogisticRegression:
 
     def prepare(self, df: pd.DataFrame, target_col: str) -> None:
         """
-
        Preparation method: splits df into X (features) and y (target) based on target_col.
        Then it standardises the features, adds a bias column and initialises the weight/coefficient vector.
-
         """
         if target_col not in df.columns:
             raise ValueError(f"Target column '{target_col}' not found in DataFrame.")

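As a rough illustration of the steps the docstring names (the method body itself sits outside this hunk), prepare presumably does something along these lines; this is a sketch only, and the attribute names self.x, self.y and self.w are assumptions, not taken from the diff:

def prepare(self, df: pd.DataFrame, target_col: str) -> None:
    # Sketch only -- the real body is not shown in this diff; attribute names are assumed.
    y = df[target_col].to_numpy(dtype=float)
    x = df.drop(columns=[target_col]).to_numpy(dtype=float)

    x = (x - x.mean(axis=0)) / x.std(axis=0)       # standardise each feature
    x = np.column_stack([np.ones(len(x)), x])      # prepend a bias column of ones
    self.x, self.y = x, y
    self.w = np.zeros(x.shape[1])                  # initialise weight/coefficient vector
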
@@ -111,6 +112,115 @@ class LogisticRegression:
         y_true = np.asarray(y).astype(int)
         return np.mean(y_pred == y_true)  # fraction of predictions that match the true labels
 
+    def confusion_matrix(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series,
+                         normalize: bool = False) -> np.ndarray:
+        """
+        Confusion matrix
+        Returns a 2x2 matrix: [[TN, FP], [FN, TP]]
+        """
+        y_pred = self.predict(x)
+        y_true = np.asarray(y).astype(int)
+
+        cm = confusion_matrix(y_true, y_pred)  # sklearn function imported at module level
+
+        if normalize:
+            # normalise each row so it sums to 1 (per true class)
+            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+
+        return cm
+
+    def plot_confusion_matrix(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series,
+                              normalize: bool = False, title: str = "Confusion Matrix") -> None:
+        """
+        Plot the confusion matrix as a heatmap
+        """
+        cm = self.confusion_matrix(x, y, normalize)
+
+        plt.figure(figsize=(8, 6))
+        sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',
+                    cmap='Blues', cbar=False,
+                    xticklabels=['Predicted 0', 'Predicted 1'],
+                    yticklabels=['Actual 0', 'Actual 1'])
+        plt.title(title)
+        plt.ylabel('True Label')
+        plt.xlabel('Predicted Label')
+        plt.show()
+
+    def precision(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        Precision = TP / (TP + FP)
+        Measures how many of the predicted positives are actually positive
+        """
+        cm = self.confusion_matrix(x, y)
+        tp, fp = cm[1, 1], cm[0, 1]
+
+        if tp + fp == 0:
+            return 0.0  # avoid division by zero when nothing is predicted positive
+
+        return tp / (tp + fp)
+
+    def recall(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        Recall = TP / (TP + FN)
+        Ratio of true positives to all positives in the ground truth
+        """
+        cm = self.confusion_matrix(x, y)
+        tp, fn = cm[1, 1], cm[1, 0]
+
+        if tp + fn == 0:
+            return 0.0  # avoid division by zero when there are no positive labels
+
+        return tp / (tp + fn)
+
+    def f1_score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        F1-Score = 2 * ((Precision * Recall) / (Precision + Recall))
+        """
+        prec = self.precision(x, y)
+        rec = self.recall(x, y)
+
+        if prec + rec == 0:
+            return 0.0  # avoid division by zero
+
+        return 2 * ((prec * rec) / (prec + rec))
+
+    '''
+    def predict_proba(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
+        """
+        Predict probability scores instead of binary labels
+        """
+        if isinstance(x, pd.DataFrame):
+            x = x.values
+
+        if self.w is None:
+            raise ValueError("Model not fitted yet")
+
+        # Add bias term if needed
+        if x.shape[1] == len(self.w) - 1:
+            x = np.column_stack([np.ones(x.shape[0]), x])
+
+        return self.sigmoid(x @ self.w)
+
+    def au_roc(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        Area under the ROC curve: measures the model's ability to distinguish between classes
+        """
+        # make sure self.sigmoid outputs floats between 0 and 1
+        y_true = np.asarray(y).astype(int)
+        y_proba = self.predict_proba(x)
+
+        return roc_auc_score(y_true, y_proba)
+    '''
+
+    def classification_report(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> dict:
+        """
+        Comprehensive classification report
+        """
+        return {
+            'accuracy': self.score(x, y),
+            'precision': self.precision(x, y),
+            'recall': self.recall(x, y),
+            'f1_score': self.f1_score(x, y),
+            # 'au_roc': self.au_roc(x, y)
+        }
+
 
 if __name__ == "__main__":
     columns = [
         'ID', 'Diagnosis',

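To make the precision, recall and F1 formulas in the methods above concrete, here is a small worked example with made-up counts, not taken from any dataset in this repository:

# Illustrative confusion matrix [[TN, FP], [FN, TP]] with made-up counts
tn, fp, fn, tp = 50, 10, 5, 35

precision = tp / (tp + fp)                            # 35 / 45 ≈ 0.778
recall = tp / (tp + fn)                               # 35 / 40 = 0.875
f1 = 2 * (precision * recall) / (precision + recall)  # ≈ 0.824
print(precision, recall, f1)
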
@@ -131,7 +241,6 @@ if __name__ == "__main__":
     df = df.drop_duplicates()
     # check data types: --> everything is good
     # print(df.dtypes)
 
     '''
     # ____________________________________________________________________________________
     # HANDLE OUTLIERS AND INCONSISTENCIES
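Once a model has been trained, the new evaluation helpers could be called roughly as follows; this is a hedged sketch, and the fit() call, the dropped 'ID' column and the 0/1 encoding of 'Diagnosis' are assumptions based on the surrounding __main__ block, not shown in this diff:

# Hypothetical usage sketch -- 'Diagnosis' is assumed to be already encoded as 0/1
# and fit() is an assumed training entry point; only the metric calls come from this commit.
model = LogisticRegression()
model.prepare(df, 'Diagnosis')                 # split, standardise, add bias, init weights
model.fit()                                    # assumed API

x = df.drop(columns=['Diagnosis', 'ID'])       # assumed feature/target split for evaluation
y = df['Diagnosis']

print(model.classification_report(x, y))       # accuracy, precision, recall, f1_score
model.plot_confusion_matrix(x, y, normalize=True)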