Fixed the different evaluation metrics.

parent 4ed70f6bd4
commit be12360f9a

4 changed files with 126 additions and 20 deletions
@@ -6,8 +6,9 @@ class LinearRegression:
     Constructor for the linear regression with analytical solution. It uses bias. It also
     initializes the weight, mean and standard deviation.
     '''
-    def __init__(self, add_bias): # add degree as value for the polynomial features
+    def __init__(self, add_bias, verbose): # add degree as value for the polynomial features
         self.add_bias = add_bias # bias to prepend a column of ones (the intercept term)
+        self.verbose = verbose # this is for the different evaluation metrics
         #self.degree = degree # degree for polynomial expansion (non-linear base)
         self.w = None # weight/coefficient
         self.mean = None # used for standardisation
@@ -67,6 +68,12 @@ class LinearRegression:
             w_np.ravel(), # flattens the array into 1-D array
             index=x.columns
         )

+        if self.verbose:
+            mse = self.mse(x, y)
+            mae = self.mae(x, y)
+            rmse = self.rmse(x, y)
+            print(f"MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f}")
+
         return self

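The mse, mae and rmse helpers called in this verbose block are defined elsewhere in the file and are not part of this hunk. A minimal sketch of what they would look like, assuming the class exposes a predict(x) method and pandas inputs (the shapes below are illustrative, not the file's exact code, apart from the RMSE formula that appears as context further down):

    # sketch only: each metric compares self.predict(x) against y cast to float
    def mae(self, x, y):
        y_hat = self.predict(x)
        y_true = pd.Series(y).astype('float64')
        return float((y_true - y_hat).abs().mean())

    def mse(self, x, y):
        y_hat = self.predict(x)
        y_true = pd.Series(y).astype('float64')
        return float(((y_true - y_hat) ** 2).mean())

    def rmse(self, x, y):
        # RMSE is just the square root of MSE
        return self.mse(x, y) ** 0.5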
@@ -117,17 +124,6 @@ class LinearRegression:
         y_true = pd.Series(y).astype('float64')
         return (((y_true - y_hat) ** 2).mean()) ** 0.5

-    def regression_report(self, x: pd.DataFrame, y: pd.Series) -> dict:
-        """
-        Comprehensive classification report
-        """
-        return {
-            'R^2': self.score(x, y),
-            'MAE': self.mae(x, y),
-            'MSE': self.mse(x, y),
-            'RMSE': self.rmse(x, y)
-        }
-

 if __name__ == "__main__":
     df = pd.read_csv('parkinsons_updrs.data', dtype=str)
@@ -211,7 +207,7 @@ if __name__ == "__main__":
     df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)]
     df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)]

-    print(f"Rows after sanity checks: {len(df)}")
+    print(f"Rows after sanity checks: {len(df)}\n")

     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."
@@ -228,8 +224,8 @@ if __name__ == "__main__":
     y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]

     # training of the model
-    model = LinearRegression(add_bias=True)
-    #model = LinearRegression(add_bias=True, degree=2) # using polynomial degree for non-linear base calculation.
+    model = LinearRegression(add_bias=True, verbose=True)
+    #model = LinearRegression(add_bias=True, verbose=true, degree=2) # using polynomial degree for non-linear base calculation.
     model.fit(x_train, y_train)

     # evaluation of the model
@@ -89,7 +89,11 @@ class LogisticRegression:

            # if verbose, it shows the loss every 100 iterations and displays it
            if self.verbose and i % 100 == 0:
-                print(f"Iter {i:4d} – loss: {loss:.6f}")
+                precision = self.precision(self.x, self.y)
+                recall = self.recall(self.x, self.y)
+                f1_score = self.f1_score(self.x, self.y)
+                # 'au_roc = self.au_roc(self.x, self.y)
+                print(f"Iter {i:4d} – loss: {loss:.6f} | precision: {precision:.6f} | recall: {recall:.6f} | f1_score: {f1_score:.6f}")

            # tests whether the absolute change in loss is smaller than the tolerance
            if i > 1 and abs(self.loss[-2] - loss) < self.tol:
@@ -220,7 +224,6 @@ class LogisticRegression:
        Comprehensive classification report
        """
        return {
-            'accuracy': self.score(x, y),
            'precision': self.precision(x, y),
            'recall': self.recall(x, y),
            'f1_score': self.f1_score(x, y),
@@ -95,7 +95,10 @@ class LinearRegression:
            if self.verbose and epoch % 100 == 0:
                y_full_pred = x.dot(w_np)
                mse = ((y_np - y_full_pred) ** 2).mean()
-                print(f"Iter {epoch:5d} | MSE: {mse:.6f}")
+                mae = float(np.mean(np.abs(y_np - y_full_pred)))
+                rmse = (((y_np - y_full_pred) ** 2).mean()) ** 0.5
+                print(f"Iter {epoch:5d} | MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f}")
+

        self.w = pd.Series(w_np, index=x.columns) # store weights back as a pandas series
        return self
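Since RMSE is the square root of MSE, the gradient-descent logging could derive it from the value computed two lines above instead of re-evaluating the squared residuals. A minimal alternative, offered only as a suggestion and not what the commit does:

    # reuse the mse already computed in the verbose block
    rmse = mse ** 0.5
    print(f"Iter {epoch:5d} | MSE: {mse:.6f} | MAE: {mae:.6f} | RMSE: {rmse:.6f}")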
@@ -206,7 +209,7 @@ if __name__ == "__main__":
    df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)]
    df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)]

-    print(f"Rows after sanity checks: {len(df)}")
+    print(f"Rows after sanity checks: {len(df)}\n")

    # check if there are still null values
    assert df.isna().sum().sum() == 0, "There are still some null values."
@@ -1,5 +1,7 @@
 import numpy as np
 import pandas as pd
+from matplotlib import pyplot as plt
+from sklearn.metrics import confusion_matrix, roc_auc_score

 class LogisticRegression:
     '''
@@ -105,7 +107,11 @@ class LogisticRegression:

            # if verbose, it shows the loss every 100 iterations and displays it
            if self.verbose and epoch % 100 == 0:
-                print(f"Iter {epoch:4d} – loss: {loss:.6f}")
+                precision = self.precision(self.x, self.y)
+                recall = self.recall(self.x, self.y)
+                f1_score = self.f1_score(self.x, self.y)
+                # 'au_roc = self.au_roc(self.x, self.y)
+                print(f"Iter {epoch:4d} – loss: {loss:.6f} | precision: {precision:.6f} | recall: {recall:.6f} | f1_score: {f1_score:.6f}")

            # tests whether the absolute change in loss is smaller than the tolerance
            if epoch > 1 and abs(self.loss[-2] - loss) < self.tol:
@@ -134,6 +140,104 @@ class LogisticRegression:
        y_true = np.asarray(y).astype(int)
        return np.mean(y_pred == y_true) # mean is calculated if Y values match

+    def confusion_matrix(self, x: pd.DataFrame, y: pd.Series,
+                         normalize: bool = False) -> np.ndarray:
+        """
+        Confusion Matrix
+        Returns a 2x2 matrix: [[TN, FP], [FN, TP]]
+        """
+        y_pred = self.predict(x)
+        y_true = np.asarray(y).astype(int)
+
+        cm = confusion_matrix(y_true, y_pred)
+
+        if normalize:
+            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+
+        return cm
+
+    def plot_confusion_matrix(self, x: pd.DataFrame, y: pd.Series,
+                              normalize: bool = False, title: str = "Confusion Matrix", sns=None) -> None:
+        """
+        Plot confusion matrix as a heatmap
+        """
+        cm = self.confusion_matrix(x, y, normalize)
+
+        plt.figure(figsize=(8, 6))
+        sns.heatmap(cm, annot=True, fmt='.2f' if normalize else 'd',
+                    cmap='Blues', cbar=False,
+                    xticklabels=['Predicted 0', 'Predicted 1'],
+                    yticklabels=['Actual 0', 'Actual 1'])
+        plt.title(title)
+        plt.ylabel('True Label')
+        plt.xlabel('Predicted Label')
+        plt.show()
+
+    def precision(self, x: pd.DataFrame, y: pd.Series) -> float:
+        """
+        Precision = TP / (TP + FP)
+        Measures how many of the predicted positives are actually positive
+        """
+        cm = self.confusion_matrix(x, y)
+        tp, fp = cm[1, 1], cm[0, 1]
+
+        if tp + fp == 0: # div by 0!!!
+            return 0.0
+
+        return tp / (tp + fp)
+
+    def recall(self, x: pd.DataFrame, y: pd.Series) -> float:
+        """
+        Recall = TP / (TP + FN)
+        ratio of true positives to all the positives in ground truth
+        """
+        cm = self.confusion_matrix(x, y)
+        tp, fn = cm[1, 1], cm[1, 0]
+
+        if tp + fn == 0:
+            return 0.0 # Avoid division by zero
+
+        return tp / (tp + fn)
+
+    def f1_score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        F1-Score = 2 * ((Precision * Recall) / (Precision + Recall))
+        """
+        prec = self.precision(x, y)
+        rec = self.recall(x, y)
+
+        if prec + rec == 0:
+            return 0.0 # Avoid division by zero
+
+        return 2 * ((prec * rec) / (prec + rec))
+
+    '''
+    def predict_proba(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
+        """
+        Predict probability scores instead of binary labels
+        """
+        if isinstance(x, pd.DataFrame):
+            x = x.values
+
+        if self.w is None:
+            raise ValueError("Model not fitted yet")
+
+        # Add bias term if needed
+        if x.shape[1] == len(self.w) - 1:
+            x = np.column_stack([np.ones(x.shape[0]), x])
+
+        return self.sigmoid(x @ self.w)
+
+    def au_roc(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        Measures the model's ability to distinguish between classes
+        """
+        # make sure self.sigmoid outputs floats between 0 and 1
+        y_true = np.asarray(y).astype(int)
+        y_proba = self.predict_proba(x)
+
+        return roc_auc_score(y_true, y_proba)
+    '''
+

 if __name__ == "__main__":
     columns = [
         'ID', 'Diagnosis',
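Note that plot_confusion_matrix receives the seaborn module through its sns parameter, which defaults to None; calling it without passing seaborn would fail at sns.heatmap. A hedged usage sketch, where model, x_test and y_test are illustrative names for a fitted LogisticRegression and a held-out split (they are not defined in this hunk):

    import seaborn as sns  # seaborn is not imported at module level in this file, so pass it in

    cm = model.confusion_matrix(x_test, y_test)  # raw counts laid out as [[TN, FP], [FN, TP]]
    model.plot_confusion_matrix(x_test, y_test, normalize=True, sns=sns)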