Finished the implementation of the Python code.

Batuhan Berk Başoğlu 2025-09-18 20:58:02 -04:00
parent 5702c3c1b8
commit 455b48c89b
Signed by: batuhan-basoglu
SSH key fingerprint: SHA256:kEsnuHX+qbwhxSAXPUQ4ox535wFHu/hIRaa53FzxRpo
6 changed files with 540 additions and 159 deletions

@@ -1,28 +1,143 @@
import numpy as np
import pandas as pd

class LogisticRegression:
    """
    Logistic regression trained by gradient descent. The constructor takes the
    learning rate, number of iterations, batch size, tolerance and verbosity,
    and initialises the weights, loss history, x, y, mean and std.
    """

    def __init__(self, learning_rate: float, n_iter: int, batch_size: int,
                 tolerance: float, verbose: bool) -> None:
        self.lr = learning_rate
        self.n_iter = n_iter
        self.batch_size = batch_size
        self.tol = tolerance
        self.verbose = verbose
        self.w: np.ndarray | None = None     # weights/coefficients (bias as first element)
        self.loss: list[float] = []          # loss per epoch
        self.x: np.ndarray | None = None     # matrix of inputs after standardisation
        self.y: np.ndarray | None = None     # target vector
        self.mean: np.ndarray | None = None  # feature means, used for standardisation
        self.std: np.ndarray | None = None   # feature standard deviations

    @staticmethod
    def sigmoid(z: np.ndarray) -> np.ndarray:
        """Sigmoid activation for the logistic regression model."""
        return 1.0 / (1.0 + np.exp(-z))  # 1 / (1 + exp(-z))
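
    # Editor's note (not part of the commit): sigmoid(0) = 0.5 and large |z|
    # saturates towards 0 or 1; for very negative z, np.exp(-z) can overflow
    # and emit a RuntimeWarning, though the result still rounds to 0.0.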

    @staticmethod
    def cost(y: np.ndarray, p: np.ndarray) -> float:
        """Cross-entropy loss between the targets y and the probabilities p."""
        eps = 1e-15
        p = np.clip(p, eps, 1 - eps)  # keep probabilities away from exact 0 and 1
        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
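
    # Editor's note (not part of the commit): a quick worked example of the
    # loss. For y = [1, 0] and p = [0.9, 0.2] the cost is -(ln 0.9 + ln 0.8) / 2
    # ≈ 0.164, so confident correct predictions give a small cross-entropy.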

    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
        """
        Split df into x and y using the given target column, standardise the
        features, prepend a bias column and initialise the weight vector.
        """
        if target_col not in df.columns:
            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")
        self.y = df[target_col].values.astype(np.int64)
        x_raw = df.drop(columns=[target_col]).values.astype(np.float64)
        # standardisation: subtract the column mean, divide by the column std
        self.mean = x_raw.mean(axis=0)
        self.std = x_raw.std(axis=0)
        self.std[self.std == 0] = 1.0  # guard against division by zero
        x_scaled = (x_raw - self.mean) / self.std
        bias = np.ones((x_scaled.shape[0], 1), dtype=np.float64)  # bias column
        self.x = np.hstack((bias, x_scaled))
        self.w = np.zeros(self.x.shape[1], dtype=np.float64)  # weights start at zero
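
    # Editor's note (not part of the commit): after prepare(), every feature
    # column of self.x[:, 1:] has mean 0 and standard deviation 1, and the
    # leading column of ones lets w[0] act as the intercept.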

    def fit(self) -> None:
        """
        Train the model by mini-batch gradient descent. Each epoch shuffles the
        data, computes probabilities as the sigmoid of the linear prediction,
        and updates the weights with the gradient of the cross-entropy loss.
        """
        if self.x is None or self.y is None:  # x and y must be prepared first
            raise RuntimeError("Data is not prepared yet. Call `prepare` first.")
        n_samples = self.x.shape[0]
        batch_size = self.batch_size or n_samples
        for epoch in range(1, self.n_iter + 1):
            shuffled_idx = np.random.permutation(n_samples)  # random permutation of the indices
            x_shuffled = self.x[shuffled_idx]
            y_shuffled = self.y[shuffled_idx]
            # process each mini-batch of the shuffled data
            for start in range(0, n_samples, batch_size):
                end = start + batch_size
                x_batch = x_shuffled[start:end]
                y_batch = y_shuffled[start:end]
                z = x_batch.dot(self.w)
                p = self.sigmoid(z)
                grad = x_batch.T.dot(p - y_batch) / y_batch.size  # gradient of the cross-entropy loss
                self.w -= self.lr * grad  # step against the gradient, scaled by the learning rate
            # cost over the full dataset is recorded for the current epoch
            loss = self.cost(self.y, self.sigmoid(self.x.dot(self.w)))
            self.loss.append(loss)
            # if verbose, report the loss every 100 epochs
            if self.verbose and epoch % 100 == 0:
                print(f"Iter {epoch:4d} loss: {loss:.6f}")
            # stop once the absolute change in loss is smaller than the tolerance
            if epoch > 1 and abs(self.loss[-2] - loss) < self.tol:
                if self.verbose:
                    print(f"Converged after {epoch} iterations.")
                break

    def predict(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
        """
        Predict class labels by multiplying x with the weight vector, applying
        the sigmoid to obtain probabilities, and thresholding at 0.5.
        """
        if isinstance(x, pd.DataFrame):  # accept DataFrames as well as arrays
            x = x.values.astype(np.float64)
        if x.ndim == 1:
            x = x.reshape(1, -1)  # treat a single sample as one row
        z = x.dot(self.w)
        probs = self.sigmoid(z)  # probabilities via the sigmoid
        return (probs >= 0.5).astype(int)  # 0.5 is the usual decision threshold
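
    # Editor's note (not part of the commit): predict() assumes its input is
    # already in the trained feature space, i.e. standardised with
    # model.mean/model.std and carrying the leading bias column, as built in
    # prepare() or in the test-set construction below.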

    def score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """Mean accuracy of the predicted labels against the actual y values."""
        y_pred = self.predict(x)
        y_true = np.asarray(y).astype(int)
        return np.mean(y_pred == y_true)  # fraction of matching labels

if __name__ == "__main__":
    columns = [
        'ID', 'Diagnosis',
        'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
        'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
        'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
        'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
        'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
        'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
    ]
    df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
@@ -39,7 +154,7 @@ if __name__ == "__main__":
        df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values
    df.dropna(inplace=True)  # remove null values
    print(f"Rows remaining after drop of the null values: {len(df)}\n")
    for col in num_cols:
        df = df[df[col] >= 0]
@@ -56,5 +171,40 @@ if __name__ == "__main__":
    # check that there are no remaining null values
    assert df.isna().sum().sum() == 0, "There are still some null values."
    # making diagnosis numeric
    df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")
    rng = np.random.default_rng(seed=42)
    n_samples = len(df)
    indices = rng.permutation(n_samples)
    train_size = int(0.8 * n_samples)  # 80/20 train/test split
    train_idx = indices[:train_size]
    test_idx = indices[train_size:]
    df_train = df.iloc[train_idx].reset_index(drop=True)
    df_test = df.iloc[test_idx].reset_index(drop=True)
    # training of the model
    model = LogisticRegression(learning_rate=0.00005, n_iter=5000, batch_size=64, tolerance=1e-6, verbose=True)
    # other values could be used, for example (lr=0.01, n_iter=2000, tolerance=1e-3, verbose=False)
    model.prepare(df_train, target_col="Diagnosis")
    model.fit()
    # evaluation of the model
    train_acc = model.score(model.x, model.y)
    print(f"\nMean accuracy on training data: {train_acc:.4f}")
    # the scaling steps from `prepare` are repeated here to build the test X data
    x_test_raw = df_test.drop(columns=['Diagnosis']).values.astype(np.float64)
    x_test_scaled = (x_test_raw - model.mean) / model.std
    bias_test = np.ones((x_test_scaled.shape[0], 1), dtype=np.float64)
    X_test = np.hstack((bias_test, x_test_scaled))
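    # Editor's note (not part of the commit): reusing model.mean and model.std
    # from the training split keeps test-set statistics out of the features and
    # matches the space the weights were trained in.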
    y_test = df_test['Diagnosis'].values.astype(int)
    test_acc = model.score(X_test, y_test)
    print(f"Mean accuracy on testing data: {test_acc:.4f}")
    # predict labels for the first ten test rows with the trained model
    first_10 = X_test[:10]
    y_hat = model.predict(first_10)
    print("\nFirst 10 predictions:", y_hat.ravel())