Finished the implementation of the Python code.

This commit is contained in:
Batuhan Berk Başoğlu 2025-09-18 20:58:02 -04:00
parent 5702c3c1b8
commit 455b48c89b
Signed by: batuhan-basoglu
SSH key fingerprint: SHA256:kEsnuHX+qbwhxSAXPUQ4ox535wFHu/hIRaa53FzxRpo
6 changed files with 540 additions and 159 deletions


@@ -1,144 +1,126 @@
import numpy as np
import pandas as pd
class LogisticRegression:
    """
    Binary logistic regression trained with batch gradient descent. The
    constructor takes the learning rate, the number of iterations, the
    tolerance and the verbose flag, and initialises the weights, loss
    history, inputs, targets, and the standardisation statistics.
    """
    def __init__(self, learning_rate: float, n_iter: int, tolerance: float, verbose: bool) -> None:
        self.lr = learning_rate
        self.n_iter = n_iter
        self.tol = tolerance
        self.verbose = verbose
        self.w: np.ndarray | None = None     # weights/coefficients (bias as first element)
        self.loss: list[float] = []          # loss per iteration
        self.x: np.ndarray | None = None     # matrix of inputs after standardisation
        self.y: np.ndarray | None = None     # target vector (0/1)
        self.mean: np.ndarray | None = None  # feature means, used for standardisation
        self.std: np.ndarray | None = None   # feature standard deviations
    @staticmethod
    def sigmoid(z: np.ndarray) -> np.ndarray:
        """Vectorised sigmoid, the link function of logistic regression."""
        return 1.0 / (1.0 + np.exp(-z))  # 1 / (1 + e^(-z))
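    # Note: np.exp(-z) can overflow for large negative z. A numerically
    # stable variant (a sketch, not part of this commit) would branch on
    # the sign of z:
    #     pos = z >= 0
    #     out = np.empty_like(z, dtype=np.float64)
    #     out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))
    #     ez = np.exp(z[~pos])          # safe: z[~pos] < 0, so exp(z) < 1
    #     out[~pos] = ez / (1.0 + ez)   # algebraically equal to 1/(1+exp(-z))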
    @staticmethod
    def cost(y: np.ndarray, p: np.ndarray) -> float:
        """Cross-entropy loss used as the cost function."""
        eps = 1e-15
        p = np.clip(p, eps, 1 - eps)  # clip to avoid log(0)
        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
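    # Quick sanity check with made-up values: for y = [1, 0] and
    # p = [0.9, 0.1], the loss is -(log(0.9) + log(0.9)) / 2 ≈ 0.105.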
    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
        """
        Splits `df` into x and y using the given target column, standardises
        the features, adds a bias column and initialises the weight vector.
        """
        if target_col not in df.columns:
            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")
        self.y = df[target_col].values.astype(np.int64)  # target must be a 0/1 array
        x_raw = df.drop(columns=[target_col]).values.astype(np.float64)
        # standardisation: mean 0, std 1 per feature
        self.mean = x_raw.mean(axis=0)
        self.std = x_raw.std(axis=0)
        self.std[self.std == 0] = 1.0  # avoid division by zero for constant features
        x_scaled = (x_raw - self.mean) / self.std
        bias = np.ones((x_scaled.shape[0], 1), dtype=np.float64)  # bias column of ones
        self.x = np.hstack((bias, x_scaled))
        self.w = np.zeros(self.x.shape[1], dtype=np.float64)  # initialise weights to zero
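    # Usage sketch (hypothetical two-feature frame, not from this commit):
    #     df = pd.DataFrame({"f1": [1.0, 2.0, 3.0], "f2": [0.5, 0.1, 0.9], "y": [0, 1, 1]})
    #     model = LogisticRegression(0.01, 1000, 1e-6, False)
    #     model.prepare(df, target_col="y")  # model.x is 3x3: bias + 2 scaled features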
    def fit(self) -> None:
        """
        Trains the model by batch gradient descent. Each iteration computes
        the probabilities via the sigmoid of the linear prediction, updates
        the weights along the negative gradient, and records the loss.
        """
        if self.x is None or self.y is None:  # data must be prepared first
            raise RuntimeError("Data is not prepared yet. Call `prepare` first.")
        for i in range(1, self.n_iter + 1):
            z = self.x.dot(self.w)  # linear prediction
            p = self.sigmoid(z)     # predicted probabilities
            gradient = self.x.T.dot(p - self.y) / self.y.size  # gradient of the cross-entropy loss
            self.w -= self.lr * gradient  # step against the gradient, scaled by the learning rate
            loss = self.cost(self.y, p)   # cross-entropy loss for this iteration
            self.loss.append(loss)
            # if verbose, report the loss every 100 iterations
            if self.verbose and i % 100 == 0:
                print(f"Iter {i:4d}  loss: {loss:.6f}")
            # stop once the absolute change in loss is smaller than the tolerance
            if i > 1 and abs(self.loss[-2] - loss) < self.tol:
                if self.verbose:
                    print(f"Converged after {i} iterations.")
                break  # the loss has plateaued, so further training is unnecessary
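    # Derivation sketch: with p = sigmoid(x @ w) and the cross-entropy loss
    # L(w) = -(1/n) * sum(y*log(p) + (1-y)*log(1-p)), the gradient works out
    # to dL/dw = (1/n) * x.T @ (p - y), which is exactly the update above.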
    def predict(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
        """
        Predicts 0/1 labels: multiplies the inputs by the weight vector and
        applies the sigmoid to obtain probabilities. The input must already
        be standardised and carry the bias column, as built by `prepare`.
        """
        if isinstance(x, pd.DataFrame):  # accept DataFrames as well as arrays
            x = x.values.astype(np.float64)
        if x.ndim == 1:
            x = x.reshape(1, -1)  # promote a single sample to a 2-D matrix
        z = x.dot(self.w)
        probs = self.sigmoid(z)  # probabilities via the sigmoid
        return (probs >= 0.5).astype(int)  # 0.5 is the usual decision threshold
    def score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """Returns the mean accuracy of the predictions against the true labels."""
        y_pred = self.predict(x)
        y_true = np.asarray(y).astype(int)
        return np.mean(y_pred == y_true)  # fraction of matching labels
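# Minimal end-to-end sketch (hypothetical data, not part of this commit):
#     toy = pd.DataFrame({"f1": [0.1, 2.3, 1.7, 0.4], "target": [0, 1, 1, 0]})
#     clf = LogisticRegression(learning_rate=0.1, n_iter=500, tolerance=1e-6, verbose=False)
#     clf.prepare(toy, target_col="target")
#     clf.fit()
#     clf.score(clf.x, clf.y)  # training accuracy on the toy data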
if __name__ == "__main__":
    columns = [
        'ID', 'Diagnosis',
        'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
        'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
        'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
        'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
        'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
        'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
    ]
df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
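    # wdbc.data is assumed to be the UCI Breast Cancer Wisconsin (Diagnostic)
    # data set: an ID column, an M/B diagnosis, and 30 real-valued features.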
@@ -155,7 +137,7 @@ if __name__ == "__main__":
df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values
df.dropna(inplace=True) # remove null values
print(f"Rows remaining after drop of the null values: {len(df)}")
print(f"Rows remaining after drop of the null values: {len(df)}\n")
for col in num_cols:
        df = df[df[col] >= 0]  # keep only rows with non-negative measurements
@@ -172,33 +154,40 @@ if __name__ == "__main__":
# check if there are still null values
assert df.isna().sum().sum() == 0, "There are still some null values."
# making diagnosis numeric
df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")
    rng = np.random.default_rng(seed=42)
    n_samples = len(df)
    indices = rng.permutation(n_samples)
    train_size = int(0.8 * n_samples)  # 80/20 train/test split
train_idx = indices[:train_size]
test_idx = indices[train_size:]
df_train = df.iloc[train_idx].reset_index(drop=True)
df_test = df.iloc[test_idx].reset_index(drop=True)
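    # Note: this is a plain random split. A stratified split (a possible
    # refinement, not done here) would permute the indices of each class
    # separately and take 80% of each, preserving the M/B ratio.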
# training of the model
model = LogisticRegression(learning_rate=0.00005, n_iter=5000, tolerance=1e-6, verbose=True)
# other values could be used, for example (lr=0.01, n_iter=2000, tolerance=1e-3, verbose=False)
model.prepare(df_train, target_col="Diagnosis")
model.fit()
# evaluation of the model
train_acc = model.score(model.x, model.y)
print(f"\nMean accuracy on training data: {train_acc:.4f}")
# Example: predict on the first 10 samples
y_hat = model.predict(model.X_[:10])
print("First 10 predictions:", y_hat)
    # apply the same standardisation and bias column as `prepare` to the test data
x_test_raw = df_test.drop(columns=['Diagnosis']).values.astype(np.float64)
x_test_scaled = (x_test_raw - model.mean) / model.std
bias_test = np.ones((x_test_scaled.shape[0], 1), dtype=np.float64)
X_test = np.hstack((bias_test, x_test_scaled))
y_test = df_test['Diagnosis'].values.astype(int)
test_acc = model.score(X_test, y_test)
print(f"Mean accuracy on testing data: {test_acc:.4f}")
    # predict labels for the first ten test samples with the trained model
first_10 = X_test[:10]
y_hat = model.predict(first_10)
print("\nFirst 10 predictions:", y_hat.ravel())