Fixed the different evaluation metrics.

Batuhan Berk Başoğlu 2025-09-29 22:32:43 -04:00
parent 4ed70f6bd4
commit be12360f9a
Signed by: batuhan-basoglu
SSH key fingerprint: SHA256:kEsnuHX+qbwhxSAXPUQ4ox535wFHu/hIRaa53FzxRpo
4 changed files with 126 additions and 20 deletions

View file

@@ -6,8 +6,9 @@ class LinearRegression:
     Constructor for the linear regression with analytical solution. It uses bias. It also
     initializes the weight, mean and standard deviation.
     '''
-    def __init__(self, add_bias):  # add degree as value for the polynomial features
+    def __init__(self, add_bias, verbose):  # add degree as value for the polynomial features
         self.add_bias = add_bias  # bias to prepend a column of ones (the intercept term)
+        self.verbose = verbose  # this is for the different evaluation metrics
         #self.degree = degree  # degree for polynomial expansion (non-linear base)
         self.w = None  # weight/coefficient
         self.mean = None  # used for standardisation
@@ -67,6 +68,12 @@ class LinearRegression:
             w_np.ravel(),  # flattens the array into 1-D array
             index=x.columns
         )
 
+        if self.verbose:
+            mse = self.mse(x, y)
+            mae = self.mae(x, y)
+            rmse = self.rmse(x, y)
+            print(f"MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f}")
+
         return self
@@ -117,17 +124,6 @@ class LinearRegression:
         y_true = pd.Series(y).astype('float64')
         return (((y_true - y_hat) ** 2).mean()) ** 0.5
 
-    def regression_report(self, x: pd.DataFrame, y: pd.Series) -> dict:
-        """
-        Comprehensive classification report
-        """
-        return {
-            'R^2': self.score(x, y),
-            'MAE': self.mae(x, y),
-            'MSE': self.mse(x, y),
-            'RMSE': self.rmse(x, y)
-        }
-
 
 if __name__ == "__main__":
     df = pd.read_csv('parkinsons_updrs.data', dtype=str)
@ -211,7 +207,7 @@ if __name__ == "__main__":
df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)] df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)]
df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)] df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)]
print(f"Rows after sanity checks: {len(df)}") print(f"Rows after sanity checks: {len(df)}\n")
# check if there are still null values # check if there are still null values
assert df.isna().sum().sum() == 0, "There are still some null values." assert df.isna().sum().sum() == 0, "There are still some null values."
@ -228,8 +224,8 @@ if __name__ == "__main__":
y_train, y_test = y.iloc[:n_train], y.iloc[n_train:] y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]
# training of the model # training of the model
model = LinearRegression(add_bias=True) model = LinearRegression(add_bias=True, verbose=True)
#model = LinearRegression(add_bias=True, degree=2) # using polynomial degree for non-linear base calculation. #model = LinearRegression(add_bias=True, verbose=true, degree=2) # using polynomial degree for non-linear base calculation.
model.fit(x_train, y_train) model.fit(x_train, y_train)
# evaluation of the model # evaluation of the model
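
For reference, the three values the new verbose flag prints during fitting can be reproduced with a small self-contained sketch (toy numbers only, independent of the Parkinson's data and the class above):

import numpy as np

# toy targets and predictions, purely illustrative
y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_pred = np.array([2.5, 0.0, 2.0, 8.0])

mse = np.mean((y_true - y_pred) ** 2)    # mean squared error
mae = np.mean(np.abs(y_true - y_pred))   # mean absolute error
rmse = np.sqrt(mse)                      # root mean squared error = sqrt(MSE)

print(f"MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f}")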

View file

@@ -89,7 +89,11 @@ class LogisticRegression:
             # if verbose, it shows the loss every 100 iterations and displays it
             if self.verbose and i % 100 == 0:
-                print(f"Iter {i:4d} loss: {loss:.6f}")
+                precision = self.precision(self.x, self.y)
+                recall = self.recall(self.x, self.y)
+                f1_score = self.f1_score(self.x, self.y)
+                # au_roc = self.au_roc(self.x, self.y)
+                print(f"Iter {i:4d} loss: {loss:.6f} | precision: {precision:.6f} | recall: {recall:.6f} | f1_score: {f1_score:.6f}")
 
             # tests whether the absolute change in loss is smaller than the tolerance
             if i > 1 and abs(self.loss[-2] - loss) < self.tol:
@@ -220,7 +224,6 @@ class LogisticRegression:
         Comprehensive classification report
         """
         return {
-            'accuracy': self.score(x, y),
             'precision': self.precision(x, y),
             'recall': self.recall(x, y),
             'f1_score': self.f1_score(x, y),
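
As a quick worked check of the metrics this report returns, precision, recall and F1 can be computed by hand from a toy confusion matrix (illustrative counts, using the [[TN, FP], [FN, TP]] layout that sklearn's confusion_matrix produces for binary labels):

import numpy as np

# toy confusion matrix in the [[TN, FP], [FN, TP]] layout
cm = np.array([[50, 2],
               [4, 8]])
tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]

precision = tp / (tp + fp)                          # 8 / 10 = 0.8
recall = tp / (tp + fn)                             # 8 / 12 ≈ 0.667
f1 = 2 * precision * recall / (precision + recall)  # ≈ 0.727

print(f"precision: {precision:.3f} | recall: {recall:.3f} | f1_score: {f1:.3f}")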

View file

@@ -95,7 +95,10 @@ class LinearRegression:
             if self.verbose and epoch % 100 == 0:
                 y_full_pred = x.dot(w_np)
                 mse = ((y_np - y_full_pred) ** 2).mean()
-                print(f"Iter {epoch:5d} | MSE: {mse:.6f}")
+                mae = float(np.mean(np.abs(y_np - y_full_pred)))
+                rmse = (((y_np - y_full_pred) ** 2).mean()) ** 0.5
+                print(f"Iter {epoch:5d} | MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f}")
 
         self.w = pd.Series(w_np, index=x.columns)  # store weights back as a pandas series
         return self
@ -206,7 +209,7 @@ if __name__ == "__main__":
df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)] df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)]
df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)] df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)]
print(f"Rows after sanity checks: {len(df)}") print(f"Rows after sanity checks: {len(df)}\n")
# check if there are still null values # check if there are still null values
assert df.isna().sum().sum() == 0, "There are still some null values." assert df.isna().sum().sum() == 0, "There are still some null values."

View file

@@ -1,5 +1,7 @@
 import numpy as np
 import pandas as pd
+from matplotlib import pyplot as plt
+from sklearn.metrics import confusion_matrix, roc_auc_score
 
 class LogisticRegression:
     '''
@@ -105,7 +107,11 @@ class LogisticRegression:
             # if verbose, it shows the loss every 100 iterations and displays it
             if self.verbose and epoch % 100 == 0:
-                print(f"Iter {epoch:4d} loss: {loss:.6f}")
+                precision = self.precision(self.x, self.y)
+                recall = self.recall(self.x, self.y)
+                f1_score = self.f1_score(self.x, self.y)
+                # au_roc = self.au_roc(self.x, self.y)
+                print(f"Iter {epoch:4d} loss: {loss:.6f} | precision: {precision:.6f} | recall: {recall:.6f} | f1_score: {f1_score:.6f}")
 
             # tests whether the absolute change in loss is smaller than the tolerance
             if epoch > 1 and abs(self.loss[-2] - loss) < self.tol:
@@ -134,6 +140,104 @@ class LogisticRegression:
         y_true = np.asarray(y).astype(int)
         return np.mean(y_pred == y_true)  # mean is calculated if Y values match
 
+    def confusion_matrix(self, x: pd.DataFrame, y: pd.Series,
+                         normalize: bool = False) -> np.ndarray:
+        """
+        Confusion Matrix
+        Returns a 2x2 matrix: [[TN, FP], [FN, TP]]
+        """
+        y_pred = self.predict(x)
+        y_true = np.asarray(y).astype(int)
+        cm = confusion_matrix(y_true, y_pred)
+        if normalize:
+            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+        return cm
+
+    def plot_confusion_matrix(self, x: pd.DataFrame, y: pd.Series,
+                              normalize: bool = False, title: str = "Confusion Matrix") -> None:
+        """
+        Plot confusion matrix as a heatmap
+        """
+        import seaborn as sns  # local import so the heatmap always has a real plotting backend
+        cm = self.confusion_matrix(x, y, normalize)
+        plt.figure(figsize=(8, 6))
+        sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',
+                    cmap='Blues', cbar=False,
+                    xticklabels=['Predicted 0', 'Predicted 1'],
+                    yticklabels=['Actual 0', 'Actual 1'])
+        plt.title(title)
+        plt.ylabel('True Label')
+        plt.xlabel('Predicted Label')
+        plt.show()
+
+    def precision(self, x: pd.DataFrame, y: pd.Series) -> float:
+        """
+        Precision = TP / (TP + FP)
+        Measures how many of the predicted positives are actually positive
+        """
+        cm = self.confusion_matrix(x, y)
+        tp, fp = cm[1, 1], cm[0, 1]
+        if tp + fp == 0:
+            return 0.0  # avoid division by zero
+        return tp / (tp + fp)
+
+    def recall(self, x: pd.DataFrame, y: pd.Series) -> float:
+        """
+        Recall = TP / (TP + FN)
+        Ratio of true positives to all the positives in the ground truth
+        """
+        cm = self.confusion_matrix(x, y)
+        tp, fn = cm[1, 1], cm[1, 0]
+        if tp + fn == 0:
+            return 0.0  # avoid division by zero
+        return tp / (tp + fn)
+
+    def f1_score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        F1-Score = 2 * ((Precision * Recall) / (Precision + Recall))
+        """
+        prec = self.precision(x, y)
+        rec = self.recall(x, y)
+        if prec + rec == 0:
+            return 0.0  # avoid division by zero
+        return 2 * ((prec * rec) / (prec + rec))
+
+    '''
+    def predict_proba(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
+        """
+        Predict probability scores instead of binary labels
+        """
+        if isinstance(x, pd.DataFrame):
+            x = x.values
+        if self.w is None:
+            raise ValueError("Model not fitted yet")
+        # Add bias term if needed
+        if x.shape[1] == len(self.w) - 1:
+            x = np.column_stack([np.ones(x.shape[0]), x])
+        return self.sigmoid(x @ self.w)
+
+    def au_roc(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        Measures the model's ability to distinguish between classes
+        """
+        # make sure self.sigmoid outputs floats between 0 and 1
+        y_true = np.asarray(y).astype(int)
+        y_proba = self.predict_proba(x)
+        return roc_auc_score(y_true, y_proba)
+    '''
+
 
 if __name__ == "__main__":
     columns = [
         'ID', 'Diagnosis',
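
The predict_proba/au_roc pair added above is still commented out; if it is enabled later, note that roc_auc_score expects probability scores rather than hard 0/1 predictions. A minimal sketch with toy arrays (not this model) showing the call:

import numpy as np
from sklearn.metrics import roc_auc_score

# toy labels and predicted probabilities of the positive class
y_true = np.array([0, 0, 1, 1])
y_proba = np.array([0.1, 0.4, 0.35, 0.8])

print(f"AU-ROC: {roc_auc_score(y_true, y_proba):.3f}")  # 0.75 for these toy scores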