# logistic-regression-wdbc.py

import numpy as np
import pandas as pd


class LogisticRegressionGD:
    """Binary logistic regression trained with batch gradient descent."""

    def __init__(self,
                 learning_rate: float = 0.01,
                 n_iter: int = 1000,
                 tolerance: float = 1e-5,
                 verbose: bool = False):
        """
        Parameters
        ----------
        learning_rate : float
            Step size for weight updates.
        n_iter : int
            Maximum number of iterations.
        tolerance : float
            Stopping criterion: stop if the change in loss is < tolerance.
        verbose : bool
            If True, print the loss every 100 iterations.
        """
        self.lr = learning_rate
        self.n_iter = n_iter
        self.tol = tolerance
        self.verbose = verbose
        # placeholders that will be filled during training
        self.w_ = None            # weights (including bias as w[0])
        self.loss_history_ = []   # loss at each iteration
        self.X_ = None            # feature matrix (after standardisation)
        self.y_ = None            # target vector (0/1)

    # ------------------------------------------------------------------
    # 2. Sigmoid helper (vectorised)
    # ------------------------------------------------------------------
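    # Note: for very large negative z, np.exp(-z) below can overflow and emit
    # a RuntimeWarning (the result is still a valid 0.0); clipping z first,
    # e.g. np.clip(z, -500, 500), is a common remedy, omitted here to keep
    # the reference implementation minimal.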
    @staticmethod
    def _sigmoid(z: np.ndarray) -> np.ndarray:
        return 1.0 / (1.0 + np.exp(-z))

    # ------------------------------------------------------------------
    # 3. Cost function (cross-entropy)
    # ------------------------------------------------------------------
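    # For reference, the quantity computed below is the mean binary
    # cross-entropy over the m training samples:
    #     J(w) = -(1/m) * sum_i [ y_i * log(p_i) + (1 - y_i) * log(1 - p_i) ]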
    @staticmethod
    def _cost(y: np.ndarray, p: np.ndarray) -> float:
        # avoid log(0) by clipping
        eps = 1e-15
        p = np.clip(p, eps, 1 - eps)
        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))

    # ------------------------------------------------------------------
    # 4. Data preparation: split X / y, scale, etc.
    # ------------------------------------------------------------------
    def prepare(self, df: pd.DataFrame, target_col: str = 'Diagnosis') -> None:
        """
        Splits `df` into X and y, standardises X (mean=0, std=1),
        and stores the result in the class attributes.

        Parameters
        ----------
        df : pd.DataFrame
            Cleaned data that *already* contains a numeric target in `target_col`.
        target_col : str
            Name of the binary target column.
        """
        # target must be a 0/1 array
        self.y_ = df[target_col].values.astype(np.int64)
        # X: all columns except the target
        X_raw = df.drop(columns=[target_col]).values.astype(np.float64)
        # -----------------------------------------------------------------
        # 4.1 Feature scaling; the bias goes in the first column
        # -----------------------------------------------------------------
        # compute mean / std on the whole training set (no train/val split yet)
        self.mean_ = X_raw.mean(axis=0)
        self.std_ = X_raw.std(axis=0)
        # avoid division by zero
        self.std_[self.std_ == 0] = 1.0
        X_scaled = (X_raw - self.mean_) / self.std_
        # add bias column (all ones)
        X_scaled = np.hstack([np.ones((X_scaled.shape[0], 1)), X_scaled])
        self.X_ = X_scaled
        self.w_ = np.zeros(X_scaled.shape[1])  # initialise weights
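        # Note: self.mean_ / self.std_ are kept on the instance so that any
        # new, unseen samples can be scaled with the same statistics before
        # being passed to predict().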

    # ------------------------------------------------------------------
    # 5. Fit: batch gradient descent
    # ------------------------------------------------------------------
    def fit(self) -> None:
        """Runs batch gradient descent for at most `n_iter` iterations."""
        for i in range(1, self.n_iter + 1):
            z = np.dot(self.X_, self.w_)   # linear part
            p = self._sigmoid(z)           # predicted probabilities
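            # The batch gradient of the mean cross-entropy loss J(w) is
            #     dJ/dw = (1/m) * X.T @ (p - y),  with m = self.y_.size,
            # which is what is computed just below.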
            # gradient of the negative log-likelihood (cross-entropy), bias included
            gradient = np.dot(self.X_.T, (p - self.y_)) / self.y_.size
            # weight update
            self.w_ -= self.lr * gradient
            # record cost and check stopping criterion
            loss = self._cost(self.y_, p)
            self.loss_history_.append(loss)
            if self.verbose and i % 100 == 0:
                print(f"Iteration {i:4d} loss: {loss:.6f}")
            if i > 1 and abs(self.loss_history_[-2] - loss) < self.tol:
                if self.verbose:
                    print(f"Converged after {i} iterations.")
                break

    # ------------------------------------------------------------------
    # 6. Predict binary class labels
    # ------------------------------------------------------------------
    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return 0/1 predictions for a new X matrix (already scaled, bias column included)."""
        z = np.dot(X, self.w_)
        probs = self._sigmoid(z)
        return (probs >= 0.5).astype(int)

    # ------------------------------------------------------------------
    # 7. Score: accuracy on a given (X, y) pair
    # ------------------------------------------------------------------
    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Return the classification accuracy."""
        y_pred = self.predict(X)
        return np.mean(y_pred == y)
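
# ----------------------------------------------------------------------
# Illustrative usage sketch (assumes a hypothetical `clean_df`: an
# already-cleaned DataFrame whose 'Diagnosis' column is numeric 0/1):
#
#     model = LogisticRegressionGD(learning_rate=0.05, n_iter=5000)
#     model.prepare(clean_df)   # scales features, adds bias column, zeros weights
#     model.fit()               # batch gradient descent
#     print(model.score(model.X_, model.y_))
# ----------------------------------------------------------------------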


if __name__ == "__main__":
    # ------------------------------------------------------------------
    # 8. Load and clean the WDBC data
    # ------------------------------------------------------------------
    columns = [
        'ID', 'Diagnosis',
        'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
        'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
        'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
        'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
        'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
        'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
    ]
    df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
    df.drop(columns=['ID'], inplace=True)  # drop the ID column (no predictive value)
    missing_rows = df[df.isin(['?', 'NA', 'na', '']).any(axis=1)]  # rows with missing-value markers
    print(f"Rows with missing values: {len(missing_rows)}")
    df.replace(['?', 'NA', 'na', ''], pd.NA, inplace=True)  # normalise missing-value markers to pd.NA
    num_cols = df.columns.difference(['Diagnosis'])
    for col in num_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')  # convert feature columns to numeric
    df.dropna(inplace=True)  # drop rows with missing values
    print(f"Rows remaining after dropping missing values: {len(df)}")
    # all measurements must be non-negative
    for col in num_cols:
        df = df[df[col] >= 0]
    # sanity checks for data validity (plausible ranges for the tumour measurements)
    df = df[(df['radius_mean'] > 0) & (df['radius_mean'] <= 30)]
    df = df[(df['radius_worst'] > 0) & (df['radius_worst'] <= 30)]
    df = df[(df['texture_mean'] >= 0) & (df['texture_mean'] <= 100)]
    df = df[(df['texture_worst'] >= 0) & (df['texture_worst'] <= 100)]
    df = df[(df['perimeter_mean'] > 0) & (df['perimeter_mean'] <= 200)]
    df = df[(df['perimeter_worst'] > 0) & (df['perimeter_worst'] <= 200)]
    df = df[(df['area_mean'] > 0) & (df['area_mean'] <= 600)]
    df = df[(df['area_worst'] > 0) & (df['area_worst'] <= 600)]
    # check that no null values remain
    assert df.isna().sum().sum() == 0, "There are still some null values."
    df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0})  # encode diagnosis: malignant = 1, benign = 0
    df['Diagnosis'] = df['Diagnosis'].astype('category')
    # ---- 9. Instantiate and train --------------------------------------
    model = LogisticRegressionGD(learning_rate=0.05,
                                 n_iter=5000,
                                 tolerance=1e-6,
                                 verbose=True)
    # split X / y
    X = df.drop(columns=['Diagnosis'])
    y = df['Diagnosis'].cat.codes.values  # 0/1 array
    # Standardise X manually here (the `prepare` method would normally do this)
    model.X_ = (X - X.mean()) / X.std()  # bias column is added below
    model.X_ = np.hstack([np.ones((model.X_.shape[0], 1)), model.X_])  # add bias column
    model.y_ = y
    model.w_ = np.zeros(model.X_.shape[1])  # initialise weights (prepare() is bypassed, so do it here)
    # Fit the model
    model.fit()
    # -------------------------------------------------
    # 10. Evaluate on the same data (no hold-out split yet)
    # -------------------------------------------------
    acc = model.score(model.X_, model.y_)
    print(f"Training accuracy (on the whole cleaned set): {acc:.4f}")
    # Example: predict on the first 10 samples
    y_hat = model.predict(model.X_[:10])
    print("First 10 predictions:", y_hat)