Finished the implementation of the Python code.

commit 455b48c89b (parent 5702c3c1b8)
6 changed files with 540 additions and 159 deletions
@@ -27,8 +27,8 @@ class LinearRegression:
 class LinearRegression:
     '''
-    Constructor for the Linear Regression with analytical. It uses bias. It also
-    initializes the weight, mean and std.
+    Constructor for the linear regression with the analytical solution. It uses bias. It also
+    initializes the weight, mean and standard deviation.
     '''
     def __init__(self, add_bias):
         self.add_bias = add_bias  # bias to prepend a column of ones (the intercept term)

@@ -60,7 +60,8 @@ class LinearRegression:
     def fit(self, x: pd.DataFrame, y: pd.Series) -> "LinearRegression":
         '''
         Fit method to fit the X and Y data through pandas and train the linear model by the analytical solution.
-        It uses a pandas DataFrame for the X and a Series for the Y.
+        It uses a pandas DataFrame for the X and a Series for the Y. It uses the linear regression formula
+        to calculate the weight.
         '''
         x = self.prepare(x)
         y = pd.Series(y).astype("float64")
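Note: the "analytical solution" named in the docstring above is presumably the ordinary least squares normal equation; a minimal sketch of that computation, with hypothetical names, would be:

    import numpy as np

    def fit_analytical(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        # w = (X^T X)^(-1) X^T y, computed with a linear solver rather than
        # an explicit inverse for numerical stability
        return np.linalg.solve(x.T @ x, x.T @ y)

Here x is assumed to already carry the prepended column of ones when add_bias is set.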
@@ -84,7 +85,7 @@ class LinearRegression:

     def predict(self, x: pd.DataFrame) -> pd.Series:
         '''
-        Predict method is used to test trained data to do X prediction by multiplying X and weight vectors.
+        Predict method is used on the trained model to do Y prediction by multiplying the X and weight vectors.
         '''
         if self.w is None:  # if weight is empty, throw error
             raise RuntimeError("Model is not fitted yet. Call `fit` first.")

@@ -95,7 +96,7 @@ class LinearRegression:
     def score(self, x: pd.DataFrame, y: pd.Series) -> float:
         '''
         This method is used to calculate the coefficient of determination to assess the goodness
-        of fit from a regression model
+        of fit of the linear regression model
         '''
         y_pred = self.predict(x)  # predicts the Y values with the predict method
         y = pd.Series(y).astype('float64')
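Note: the coefficient of determination computed by score is R^2 = 1 - SS_res / SS_tot; a minimal sketch, assuming plain NumPy arrays:

    import numpy as np

    def r2_score(y: np.ndarray, y_pred: np.ndarray) -> float:
        ss_res = np.sum((y - y_pred) ** 2)      # residual sum of squares
        ss_tot = np.sum((y - np.mean(y)) ** 2)  # total sum of squares
        return 1.0 - ss_res / ss_tot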
@@ -127,7 +128,7 @@ if __name__ == "__main__":
         df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values

     df.dropna(inplace=True)  # remove null values
-    print(f"Rows remaining after dropping null values: {len(df)}")
+    print(f"Rows remaining after dropping null values: {len(df)}\n")

     # sanity checks for data validity - realistic Parkinson's data range estimates
     df = df[(df['age'] >= 18) & (df['age'] <= 95)]

@@ -157,12 +158,9 @@ if __name__ == "__main__":

     # evaluation of the model
     print("\nR² on training data:", model.score(x_train, y_train))
-    print("\nR² on testing data:", model.score(x_test, y_test))
+    print("R² on testing data:", model.score(x_test, y_test))

     # predict Y values using the trained data
     preds = model.predict(x_test)
-    print("\nFirst 5 predictions:")
-    print(preds.head())
+    print("\nFirst 10 predictions:")
+    print(preds.head(10))

-    print("\nWeights:")
-    print(model.w.round(4))
@@ -1,144 +1,126 @@
 import numpy as np
 import pandas as pd

-class LogisticRegressionGD:
-    """Binary logistic regression trained with batch gradient descent."""
-    def __init__(self,
-                 learning_rate: float = 0.01,
-                 n_iter: int = 1000,
-                 tolerance: float = 1e-5,
-                 verbose: bool = False):
-        """
-        Parameters
-        ----------
-        learning_rate : float
-            Step size for weight updates.
-        n_iter : int
-            Maximum number of iterations.
-        tolerance : float
-            Stopping criterion: if the change in loss is < tolerance, stop.
-        verbose : bool
-            If True, prints loss at every 100 iterations.
-        """
+class LogisticRegression:
+    '''
+    Constructor for the logistic regression with gradient descent. It uses learning rate, iteration number,
+    tolerance and verbose. It also initializes the weight, loss, x, y, mean and standard deviation.
+    '''
+    def __init__(self, learning_rate: float, n_iter: int, tolerance: float, verbose: bool) -> None:
         self.lr = learning_rate
         self.n_iter = n_iter
         self.tol = tolerance
         self.verbose = verbose
+        self.w: np.ndarray | None = None     # weight/coefficient (bias as first element)
+        self.loss: list[float] = []          # loss per iteration
+        self.x: np.ndarray | None = None     # matrix of inputs after standardisation
+        self.y: np.ndarray | None = None     # target vector
+        self.mean: np.ndarray | None = None  # used for standardisation
+        self.std: np.ndarray | None = None   # standard deviation

-        # placeholders that will be filled during training
-        self.w_ = None           # weights (including bias as w[0])
-        self.loss_history_ = []  # loss at each iteration
-        self.X_ = None           # feature matrix (after standardisation)
-        self.y_ = None           # target vector (0/1)
-
-    # ------------------------------------------------------------------
-    # 2. Sigmoid helper (vectorised)
-    # ------------------------------------------------------------------
     @staticmethod
-    def _sigmoid(z: np.ndarray) -> np.ndarray:
-        return 1.0 / (1.0 + np.exp(-z))
+    def sigmoid(z: np.ndarray) -> np.ndarray:
+        """Sigmoid function for the logistic regression model."""
+        return 1.0 / (1.0 + np.exp(-z))  # 1/(1+exp(-z))

-    # ------------------------------------------------------------------
-    # 3. Cost function (cross-entropy)
-    # ------------------------------------------------------------------
     @staticmethod
-    def _cost(y: np.ndarray, p: np.ndarray) -> float:
-        # avoid log(0) by clipping
+    def cost(y: np.ndarray, p: np.ndarray) -> float:
+        """Cross-entropy loss is used for the cost calculation."""
         eps = 1e-15
         p = np.clip(p, eps, 1 - eps)
         return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
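Note on the mathematics: with p = sigmoid(Xw) and n samples, the cross-entropy loss L(w) = -(1/n) * sum(y * log(p) + (1 - y) * log(1 - p)) has gradient grad L(w) = X^T (p - y) / n, which is exactly the `self.x.T.dot(p - self.y) / self.y.size` expression used in the fit method below.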
-    # ------------------------------------------------------------------
-    # 4. Data preparation – this is where we split X / y, scale, etc.
-    # ------------------------------------------------------------------
-    def prepare(self, df: pd.DataFrame, target_col: str = 'Diagnosis') -> None:
+    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
         """
-        Splits `df` into X and y, standardises X (mean=0, std=1),
-        and stores the result in the class attributes.
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            Cleaned data – *already* contains a numeric target in `target_col`.
-        target_col : str
-            Name of the binary target column.
+        Preparation method splits df into x and y. It defines the X and Y values from the dataframe and target column.
+        Then it does standardisation, adds the bias and initializes the weight/coefficient.
         """
-        # target must be a 0/1 array
-        self.y_ = df[target_col].values.astype(np.int64)
+        if target_col not in df.columns:
+            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")

-        # X – all columns except the target
-        X_raw = df.drop(columns=[target_col]).values.astype(np.float64)
+        self.y = df[target_col].values.astype(np.int64)

-        # -----------------------------------------------------------------
-        # 3.1 Feature scaling – we put the bias in the first column
-        # -----------------------------------------------------------------
-        # compute mean / std on the whole training set (no train/val split yet)
-        self.mean_ = X_raw.mean(axis=0)
-        self.std_ = X_raw.std(axis=0)
-        # avoid division by zero
-        self.std_[self.std_ == 0] = 1.0
+        x_raw = df.drop(columns=[target_col]).values.astype(np.float64)

-        X_scaled = (X_raw - self.mean_) / self.std_
-        # add bias column (all ones)
-        X_scaled = np.hstack([np.ones((X_scaled.shape[0], 1)), X_scaled])
+        # standardisation
+        self.mean = x_raw.mean(axis=0)
+        self.std = x_raw.std(axis=0)
+        self.std[self.std == 0] = 1.0

-        self.X_ = X_scaled
-        self.w_ = np.zeros(X_scaled.shape[1])  # initialise weights
+        x_scaled = (x_raw - self.mean) / self.std  # standardisation formula
+
+        bias = np.ones((x_scaled.shape[0], 1), dtype=np.float64)  # adding bias
+        self.x = np.hstack((bias, x_scaled))
+
+        self.w = np.zeros(self.x.shape[1], dtype=np.float64)  # initialize weight as zero
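Note: prepare standardises each feature with the training statistics, x_scaled = (x_raw - mean) / std, and stores those statistics on the model so that later data can be scaled the same way. A one-line sketch using the attributes defined above (x_new_raw is a hypothetical new feature matrix):

    x_new_scaled = (x_new_raw - model.mean) / model.std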
-    # ------------------------------------------------------------------
-    # 4. Fit – batch gradient descent
-    # ------------------------------------------------------------------
     def fit(self) -> None:
-        """Runs batch gradient descent for `n_iter` epochs."""
+        """
+        Fit method to fit the X and Y data and train the logistic model by gradient descent.
+        For n iterations, it finds probabilities through the sigmoid of the linear prediction and uses the
+        gradient to update the weights and track the loss.
+        """
+        if self.x is None or self.y is None:  # if x or y are empty, throw error
+            raise RuntimeError("Model is not prepared yet. Call `prepare` first.")

         for i in range(1, self.n_iter + 1):
-            z = np.dot(self.X_, self.w_)  # linear part
-            p = self._sigmoid(z)          # predicted probabilities
+            z = self.x.dot(self.w)  # linear prediction
+            p = self.sigmoid(z)     # probabilities of the model predictions

-            # gradient of the log-likelihood (including bias)
-            gradient = np.dot(self.X_.T, (p - self.y_)) / self.y_.size
+            gradient = self.x.T.dot(p - self.y) / self.y.size  # gradient calculation formula

-            # weight update
-            self.w_ -= self.lr * gradient
+            self.w -= self.lr * gradient  # gradient multiplied by learning rate is subtracted from the weight

-            # record cost and check stopping criterion
-            loss = self._cost(self.y_, p)
-            self.loss_history_.append(loss)
+            loss = self.cost(self.y, p)  # cost is calculated through cross-entropy and recorded for this iteration
+            self.loss.append(loss)

+            # if verbose, it shows the loss every 100 iterations
             if self.verbose and i % 100 == 0:
-                print(f"Iteration {i:4d} – loss: {loss:.6f}")
+                print(f"Iter {i:4d} – loss: {loss:.6f}")

-            if i > 1 and abs(self.loss_history_[-2] - loss) < self.tol:
+            # tests whether the absolute change in loss is smaller than the tolerance
+            if i > 1 and abs(self.loss[-2] - loss) < self.tol:
                 if self.verbose:
                     print(f"Converged after {i} iterations.")
-                break
+                break  # the loss has stopped improving, so further training would be unnecessary
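Note: each pass of the loop above is one step of batch gradient descent, w <- w - lr * grad, and training stops early once the absolute change in loss between consecutive iterations falls below the tolerance.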
-    # ------------------------------------------------------------------
-    # 5. Predict – binary class labels
-    # ------------------------------------------------------------------
-    def predict(self, X: np.ndarray) -> np.ndarray:
-        """Return 0/1 predictions for a new X matrix (already scaled)."""
-        z = np.dot(X, self.w_)
-        probs = self._sigmoid(z)
-        return (probs >= 0.5).astype(int)
+    def predict(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
+        """
+        Predict method is used on the trained model to do Y prediction by multiplying the X and weight vectors
+        and then applying the sigmoid function to get the model probability.
+        """
+        if isinstance(x, pd.DataFrame):  # verifies the value type
+            x = x.values.astype(np.float64)
+        if x.ndim == 1:
+            x = x.reshape(1, -1)
+        z = x.dot(self.w)
+        probs = self.sigmoid(z)  # probability calculation through the sigmoid method
+        return (probs >= 0.5).astype(int)  # 0.5 is the conventional cutoff for a positive prediction
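Note: predict thresholds the sigmoid output at 0.5. If the raw probabilities are needed, for instance to use a different cutoff, a small hypothetical helper (not part of the class above) could expose them:

    def predict_proba(model, x):
        # x must already include the bias column, as model.x does
        return model.sigmoid(x.dot(model.w))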
-    # ------------------------------------------------------------------
-    # 6. Score – accuracy on a given (X, y) pair
-    # ------------------------------------------------------------------
-    def score(self, X: np.ndarray, y: np.ndarray) -> float:
-        """Return the classification accuracy."""
-        y_pred = self.predict(X)
-        return np.mean(y_pred == y)
+    def score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        This method is used to calculate the mean accuracy between the predicted and actual Y values.
+        """
+        y_pred = self.predict(x)
+        y_true = np.asarray(y).astype(int)
+        return np.mean(y_pred == y_true)  # fraction of matching Y values
 if __name__ == "__main__":
     columns = [
         'ID', 'Diagnosis',
         'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
         'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
         'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
         'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
         'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
         'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
     ]

     df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
@@ -155,7 +137,7 @@ if __name__ == "__main__":
         df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values

     df.dropna(inplace=True)  # remove null values
-    print(f"Rows remaining after dropping null values: {len(df)}")
+    print(f"Rows remaining after dropping null values: {len(df)}\n")
     for col in num_cols:
         df = df[df[col] >= 0]
@@ -172,33 +154,40 @@ if __name__ == "__main__":
     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."

-    df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0})  # making diagnosis numeric
-    df['Diagnosis'] = df['Diagnosis'].astype('category')
+    # making diagnosis numeric
+    df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")

-    # ---- 7.2 Instantiate and train ------------------------------------
-    model = LogisticRegressionGD(learning_rate=0.05,
-                                 n_iter=5000,
-                                 tolerance=1e-6,
-                                 verbose=True)
-
-    # we need to split X / y here
-    X = df.drop(columns=['Diagnosis'])
-    y = df['Diagnosis'].cat.codes.values  # 0/1 array
-
-    # Standardise X inside the model for us – we'll do it in `prepare`
-    model.X_ = (X - X.mean()) / X.std()  # bias column will be added later
-    model.X_ = np.hstack([np.ones((model.X_.shape[0], 1)), model.X_])  # add bias
-    model.y_ = y
-
-    # Fit the model
+    rng = np.random.default_rng(seed=42)
+    n_train = len(df)
+    indices = rng.permutation(n_train)
+    train_size = int(0.8 * n_train)
+
+    train_idx = indices[:train_size]
+    test_idx = indices[train_size:]
+
+    df_train = df.iloc[train_idx].reset_index(drop=True)
+    df_test = df.iloc[test_idx].reset_index(drop=True)
+
+    # training of the model
+    model = LogisticRegression(learning_rate=0.00005, n_iter=5000, tolerance=1e-6, verbose=True)
+    # other values could be used, for example (lr=0.01, n_iter=2000, tolerance=1e-3, verbose=False)
+    model.prepare(df_train, target_col="Diagnosis")
     model.fit()

-    # -------------------------------------------------
-    # 8. Evaluate on the same data (you could split)
-    # -------------------------------------------------
-    acc = model.score(model.X_, model.y_)
-    print(f"Training accuracy (on the whole cleaned set): {acc:.4f}")
-
-    # Example: predict on the first 10 samples
-    y_hat = model.predict(model.X_[:10])
-    print("First 10 predictions:", y_hat)
+    # evaluation of the model
+    train_acc = model.score(model.x, model.y)
+    print(f"\nMean accuracy on training data: {train_acc:.4f}")
+
+    # build the test X matrix the same way `prepare` builds the training one
+    x_test_raw = df_test.drop(columns=['Diagnosis']).values.astype(np.float64)
+    x_test_scaled = (x_test_raw - model.mean) / model.std
+    bias_test = np.ones((x_test_scaled.shape[0], 1), dtype=np.float64)
+    X_test = np.hstack((bias_test, x_test_scaled))
+    y_test = df_test['Diagnosis'].values.astype(int)
+    test_acc = model.score(X_test, y_test)
+    print(f"Mean accuracy on testing data: {test_acc:.4f}")
+
+    # predict Y values using the trained data
+    first_10 = X_test[:10]
+    y_hat = model.predict(first_10)
+    print("\nFirst 10 predictions:", y_hat.ravel())
@@ -3,8 +3,8 @@ import pandas as pd

 class LinearRegression:
     '''
-    Constructor for the Linear Regression with mini-batch stochastic gradient descent. It uses learning rate,
-    iteration number, batch size, bias and verbose. It also initializes the weight, mean and std.
+    Constructor for the linear regression with mini-batch stochastic gradient descent. It uses learning rate,
+    iteration number, batch size, bias and verbose. It also initializes the weight, mean and standard deviation.
     '''
     def __init__(self, lr, n_iter, batch_size, add_bias, verbose):
         self.lr = lr  # learning rate
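Note: the mini-batch stochastic gradient descent fit itself is not shown in this hunk; a minimal sketch of one epoch for linear regression with squared error, under assumed names (X already carries the bias column, w is the weight vector), would be:

    import numpy as np

    def sgd_epoch(X: np.ndarray, y: np.ndarray, w: np.ndarray,
                  lr: float = 0.01, batch_size: int = 32,
                  rng: np.random.Generator = np.random.default_rng(0)) -> np.ndarray:
        idx = rng.permutation(len(y))  # shuffle once per epoch
        for start in range(0, len(y), batch_size):
            batch = idx[start:start + batch_size]
            Xb, yb = X[batch], y[batch]
            grad = 2.0 * Xb.T @ (Xb @ w - yb) / len(yb)  # gradient of the mean squared error
            w = w - lr * grad
        return w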
@@ -90,7 +90,7 @@ class LinearRegression:

     def predict(self, x: pd.DataFrame) -> pd.Series:
         '''
-        Predict method makes X prediction by multiplying X and weight vectors.
+        Predict method is used on the trained model to do Y prediction by multiplying the X and weight vectors.
         '''
         if self.w is None:  # if weight is empty, throw error
             raise RuntimeError("Model is not fitted yet. Call `fit` first.")

@@ -101,7 +101,7 @@ class LinearRegression:
     def score(self, x: pd.DataFrame, y: pd.Series) -> float:
         '''
         This method is used to calculate the coefficient of determination to assess the goodness
-        of fit from a regression model
+        of fit of the linear regression model
         '''
         y_pred = self.predict(x)  # predicts the Y values with the predict method
         y = pd.Series(y).astype('float64')
@@ -133,7 +133,7 @@ if __name__ == "__main__":
         df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values

     df.dropna(inplace=True)  # remove null values
-    print(f"Rows remaining after dropping null values: {len(df)}")
+    print(f"Rows remaining after dropping null values: {len(df)}\n")

     # sanity checks for data validity - realistic Parkinson's data range estimates
     df = df[(df['age'] >= 18) & (df['age'] <= 95)]

@@ -164,12 +164,9 @@ if __name__ == "__main__":

     # evaluation of the model
     print("\nR² on training data:", model.score(x_train, y_train))
-    print("\nR² on testing data:", model.score(x_test, y_test))
+    print("R² on testing data:", model.score(x_test, y_test))

     # predict Y values using the trained data
     preds = model.predict(x_test)
-    print("\nFirst 5 predictions:")
-    print(preds.head())
+    print("\nFirst 10 predictions:")
+    print(preds.head(10))

-    print("\nWeights:")
-    print(model.w.round(4))
@@ -1,28 +1,143 @@
 import numpy as np
 import pandas as pd

-'''
 class LogisticRegression:
-    def __init__(self):
-
-    def prepare(self):
-
-    def fit(self):
-
-    def predict(self):
-
-    def score(self):
-'''
+    '''
+    Constructor for the logistic regression with mini-batch gradient descent. It uses learning rate,
+    iteration number, batch size, tolerance and verbose. It also initializes the weight, loss, x, y,
+    mean and standard deviation.
+    '''
+
+    def __init__(self, learning_rate: float, n_iter: int, batch_size: int, tolerance: float, verbose: bool) -> None:
+        self.lr = learning_rate
+        self.n_iter = n_iter
+        self.batch_size = batch_size
+        self.tol = tolerance
+        self.verbose = verbose
+        self.w: np.ndarray | None = None     # weight/coefficient (bias as first element)
+        self.loss: list[float] = []          # loss per iteration
+        self.x: np.ndarray | None = None     # matrix of inputs after standardisation
+        self.y: np.ndarray | None = None     # target vector
+        self.mean: np.ndarray | None = None  # used for standardisation
+        self.std: np.ndarray | None = None   # standard deviation
+
+    @staticmethod
+    def sigmoid(z: np.ndarray) -> np.ndarray:
+        """Sigmoid function for the logistic regression model."""
+        return 1.0 / (1.0 + np.exp(-z))  # 1/(1+exp(-z))
+
+    @staticmethod
+    def cost(y: np.ndarray, p: np.ndarray) -> float:
+        """Cross-entropy loss is used for the cost calculation."""
+        eps = 1e-15
+        p = np.clip(p, eps, 1 - eps)
+        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
+    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
+        """
+        Preparation method splits df into x and y. It defines the X and Y values from the dataframe and target column.
+        Then it does standardisation, adds the bias and initializes the weight/coefficient.
+        """
+        if target_col not in df.columns:
+            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")
+
+        self.y = df[target_col].values.astype(np.int64)
+
+        x_raw = df.drop(columns=[target_col]).values.astype(np.float64)
+
+        # standardisation
+        self.mean = x_raw.mean(axis=0)
+        self.std = x_raw.std(axis=0)
+        self.std[self.std == 0] = 1.0
+
+        x_scaled = (x_raw - self.mean) / self.std  # standardisation formula
+
+        bias = np.ones((x_scaled.shape[0], 1), dtype=np.float64)  # adding bias
+        self.x = np.hstack((bias, x_scaled))
+
+        self.w = np.zeros(self.x.shape[1], dtype=np.float64)  # initialize weight as zero
+    def fit(self) -> None:
+        """
+        Fit method to fit the X and Y data and train the logistic model by mini-batch gradient descent.
+        For n iterations, it finds probabilities through the sigmoid of the linear prediction and uses the
+        gradient to update the weights and track the loss.
+        """
+        if self.x is None or self.y is None:  # if x or y are empty, throw error
+            raise RuntimeError("Model is not prepared yet. Call `prepare` first.")
+
+        n_samples = self.x.shape[0]
+        batch_size = self.batch_size or n_samples
+
+        for epoch in range(1, self.n_iter + 1):
+            shuffled_idx = np.random.permutation(n_samples)  # random permutation of the indices
+            x_shuffled = self.x[shuffled_idx]
+            y_shuffled = self.y[shuffled_idx]
+
+            # process execution for each mini-batch
+            for start in range(0, n_samples, batch_size):
+                end = start + batch_size
+                x_batch = x_shuffled[start:end]  # the rows are already shuffled, so plain slicing works
+                y_batch = y_shuffled[start:end]
+
+                z = x_batch.dot(self.w)
+                p = self.sigmoid(z)
+
+                grad = x_batch.T.dot(p - y_batch) / y_batch.size  # gradient calculation formula
+                self.w -= self.lr * grad  # gradient multiplied by learning rate is subtracted from the weight
+
+            # cost is calculated through cross-entropy on the full training set once per epoch
+            loss = self.cost(self.y, self.sigmoid(self.x.dot(self.w)))
+            self.loss.append(loss)
+
+            # if verbose, it shows the loss every 100 iterations
+            if self.verbose and epoch % 100 == 0:
+                print(f"Iter {epoch:4d} – loss: {loss:.6f}")
+
+            # tests whether the absolute change in loss is smaller than the tolerance
+            if epoch > 1 and abs(self.loss[-2] - loss) < self.tol:
+                if self.verbose:
+                    print(f"Converged after {epoch} iterations.")
+                break  # the loss has stopped improving, so further training would be unnecessary
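Note: with the slicing above, an epoch over, say, 455 training rows (80% of the 569 WDBC instances) and batch_size = 64 produces the batches [0:64], [64:128], ..., [448:455]; the final batch is simply shorter, and dividing by y_batch.size keeps the gradient scale correct for it. The loss is evaluated on the full training set once per epoch, so self.loss holds one entry per epoch rather than per batch.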
+    def predict(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
+        """
+        Predict method is used on the trained model to do Y prediction by multiplying the X and weight vectors
+        and then applying the sigmoid function to get the model probability.
+        """
+        if isinstance(x, pd.DataFrame):  # verifies the value type
+            x = x.values.astype(np.float64)
+        if x.ndim == 1:
+            x = x.reshape(1, -1)
+        z = x.dot(self.w)
+        probs = self.sigmoid(z)  # probability calculation through the sigmoid method
+        return (probs >= 0.5).astype(int)  # 0.5 is the conventional cutoff for a positive prediction
+
+    def score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        This method is used to calculate the mean accuracy between the predicted and actual Y values.
+        """
+        y_pred = self.predict(x)
+        y_true = np.asarray(y).astype(int)
+        return np.mean(y_pred == y_true)  # fraction of matching Y values
 if __name__ == "__main__":
     columns = [
         'ID', 'Diagnosis',
         'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
         'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
         'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
         'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
         'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
         'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
     ]

     df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
@@ -39,7 +154,7 @@ if __name__ == "__main__":
         df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values

     df.dropna(inplace=True)  # remove null values
-    print(f"Rows remaining after dropping null values: {len(df)}")
+    print(f"Rows remaining after dropping null values: {len(df)}\n")
     for col in num_cols:
         df = df[df[col] >= 0]
@@ -56,5 +171,40 @@ if __name__ == "__main__":
     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."

-    df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0})  # making diagnosis numeric
-    df['Diagnosis'] = df['Diagnosis'].astype('category')
+    # making diagnosis numeric
+    df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")
+
+    rng = np.random.default_rng(seed=42)
+    n_samples = len(df)
+    indices = rng.permutation(n_samples)
+    train_size = int(0.8 * n_samples)
+
+    train_idx = indices[:train_size]
+    test_idx = indices[train_size:]
+
+    df_train = df.iloc[train_idx].reset_index(drop=True)
+    df_test = df.iloc[test_idx].reset_index(drop=True)
+
+    # training of the model
+    model = LogisticRegression(learning_rate=0.00005, n_iter=5000, batch_size=64, tolerance=1e-6, verbose=True)
+    # other values could be used, for example (lr=0.01, n_iter=2000, tolerance=1e-3, verbose=False)
+    model.prepare(df_train, target_col="Diagnosis")
+    model.fit()
+
+    # evaluation of the model
+    train_acc = model.score(model.x, model.y)
+    print(f"\nMean accuracy on training data: {train_acc:.4f}")
+
+    # build the test X matrix the same way `prepare` builds the training one
+    x_test_raw = df_test.drop(columns=['Diagnosis']).values.astype(np.float64)
+    x_test_scaled = (x_test_raw - model.mean) / model.std
+    bias_test = np.ones((x_test_scaled.shape[0], 1), dtype=np.float64)
+    X_test = np.hstack((bias_test, x_test_scaled))
+    y_test = df_test['Diagnosis'].values.astype(int)
+    test_acc = model.score(X_test, y_test)
+    print(f"Mean accuracy on testing data: {test_acc:.4f}")
+
+    # predict Y values using the trained data
+    first_10 = X_test[:10]
+    y_hat = model.predict(first_10)
+    print("\nFirst 10 predictions:", y_hat.ravel())
parkinsons_updrs.names (new executable file, 107 lines)

@@ -0,0 +1,107 @@
Parkinsons Telemonitoring Data Set

Abstract: Oxford Parkinson's Disease Telemonitoring Dataset

============================================================

Data Set Characteristics: Multivariate
Attribute Characteristics: Integer, Real
Associated Tasks: Regression
Number of Instances: 5875
Number of Attributes: 26
Area: Life
Date Donated: 2009-10-29

============================================================

SOURCE:

The dataset was created by Athanasios Tsanas (tsanasthanasis '@' gmail.com)
and Max Little (littlem '@' physics.ox.ac.uk) of the University of Oxford, in
collaboration with 10 medical centers in the US and Intel Corporation who
developed the telemonitoring device to record the speech signals. The
original study used a range of linear and nonlinear regression methods to
predict the clinician's Parkinson's disease symptom score on the UPDRS scale.

============================================================

DATA SET INFORMATION:

This dataset is composed of a range of biomedical voice measurements from 42
people with early-stage Parkinson's disease recruited to a six-month trial of
a telemonitoring device for remote symptom progression monitoring. The
recordings were automatically captured in the patients' homes.

Columns in the table contain subject number, subject age, subject gender,
time interval from baseline recruitment date, motor UPDRS, total UPDRS, and
16 biomedical voice measures. Each row corresponds to one of 5,875 voice
recordings from these individuals. The main aim of the data is to predict the
motor and total UPDRS scores ('motor_UPDRS' and 'total_UPDRS') from the 16
voice measures.

The data is in ASCII CSV format. The rows of the CSV file contain an instance
corresponding to one voice recording. There are around 200 recordings per
patient, and the subject number of the patient is identified in the first column.
For further information or to pass on comments, please contact Athanasios
Tsanas (tsanasthanasis '@' gmail.com) or Max Little (littlem '@'
physics.ox.ac.uk).

Further details are contained in the following reference -- if you use this
dataset, please cite:
Athanasios Tsanas, Max A. Little, Patrick E. McSharry, Lorraine O. Ramig (2009),
'Accurate telemonitoring of Parkinson's disease progression by non-invasive
speech tests',
IEEE Transactions on Biomedical Engineering (to appear).

Further details about the biomedical voice measures can be found in:
Max A. Little, Patrick E. McSharry, Eric J. Hunter, Lorraine O. Ramig (2009),
'Suitability of dysphonia measurements for telemonitoring of Parkinson's
disease',
IEEE Transactions on Biomedical Engineering, 56(4):1015-1022

===========================================================

ATTRIBUTE INFORMATION:

subject# - Integer that uniquely identifies each subject
age - Subject age
sex - Subject gender: '0' - male, '1' - female
test_time - Time since recruitment into the trial. The integer part is the
    number of days since recruitment.
motor_UPDRS - Clinician's motor UPDRS score, linearly interpolated
total_UPDRS - Clinician's total UPDRS score, linearly interpolated
Jitter(%), Jitter(Abs), Jitter:RAP, Jitter:PPQ5, Jitter:DDP - Several measures of
    variation in fundamental frequency
Shimmer, Shimmer(dB), Shimmer:APQ3, Shimmer:APQ5, Shimmer:APQ11, Shimmer:DDA -
    Several measures of variation in amplitude
NHR, HNR - Two measures of the ratio of noise to tonal components in the voice
RPDE - A nonlinear dynamical complexity measure
DFA - Signal fractal scaling exponent
PPE - A nonlinear measure of fundamental frequency variation

===========================================================

RELEVANT PAPERS:

Little MA, McSharry PE, Hunter EJ, Ramig LO (2009),
'Suitability of dysphonia measurements for telemonitoring of Parkinson's disease',
IEEE Transactions on Biomedical Engineering, 56(4):1015-1022

Little MA, McSharry PE, Roberts SJ, Costello DAE, Moroz IM.
'Exploiting Nonlinear Recurrence and Fractal Scaling Properties for Voice Disorder Detection',
BioMedical Engineering OnLine 2007, 6:23 (26 June 2007)

===========================================================

CITATION REQUEST:

If you use this dataset, please cite the following paper:
A Tsanas, MA Little, PE McSharry, LO Ramig (2009),
'Accurate telemonitoring of Parkinson's disease progression by non-invasive speech tests',
IEEE Transactions on Biomedical Engineering (to appear).
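For reference, a minimal sketch of loading this dataset for the regression scripts above, assuming the accompanying parkinsons_updrs.data file ships with a header row as the UCI distribution does:

    import pandas as pd

    df = pd.read_csv('parkinsons_updrs.data')
    x = df.drop(columns=['subject#', 'motor_UPDRS', 'total_UPDRS'])
    y = df['total_UPDRS']  # one of the two regression targets described above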
wdbc.names (new executable file, 140 lines)

@@ -0,0 +1,140 @@
1. Title: Wisconsin Diagnostic Breast Cancer (WDBC)

2. Source Information

a) Creators:

    Dr. William H. Wolberg, General Surgery Dept., University of
    Wisconsin, Clinical Sciences Center, Madison, WI 53792
    wolberg@eagle.surgery.wisc.edu

    W. Nick Street, Computer Sciences Dept., University of
    Wisconsin, 1210 West Dayton St., Madison, WI 53706
    street@cs.wisc.edu  608-262-6619

    Olvi L. Mangasarian, Computer Sciences Dept., University of
    Wisconsin, 1210 West Dayton St., Madison, WI 53706
    olvi@cs.wisc.edu

b) Donor: Nick Street

c) Date: November 1995

3. Past Usage:

first usage:

    W.N. Street, W.H. Wolberg and O.L. Mangasarian.
    Nuclear feature extraction for breast tumor diagnosis.
    IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science
    and Technology, volume 1905, pages 861-870, San Jose, CA, 1993.

OR literature:

    O.L. Mangasarian, W.N. Street and W.H. Wolberg.
    Breast cancer diagnosis and prognosis via linear programming.
    Operations Research, 43(4), pages 570-577, July-August 1995.

Medical literature:

    W.H. Wolberg, W.N. Street, and O.L. Mangasarian.
    Machine learning techniques to diagnose breast cancer from
    fine-needle aspirates.
    Cancer Letters 77 (1994) 163-171.

    W.H. Wolberg, W.N. Street, and O.L. Mangasarian.
    Image analysis and machine learning applied to breast cancer
    diagnosis and prognosis.
    Analytical and Quantitative Cytology and Histology, Vol. 17
    No. 2, pages 77-87, April 1995.

    W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian.
    Computerized breast cancer diagnosis and prognosis from fine
    needle aspirates.
    Archives of Surgery 1995;130:511-516.

    W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian.
    Computer-derived nuclear features distinguish malignant from
    benign breast cytology.
    Human Pathology, 26:792-796, 1995.

See also:
    http://www.cs.wisc.edu/~olvi/uwmp/mpml.html
    http://www.cs.wisc.edu/~olvi/uwmp/cancer.html

Results:

    - predicting field 2, diagnosis: B = benign, M = malignant
    - sets are linearly separable using all 30 input features
    - best predictive accuracy obtained using one separating plane
      in the 3-D space of Worst Area, Worst Smoothness and
      Mean Texture. Estimated accuracy 97.5% using repeated
      10-fold crossvalidations. Classifier has correctly
      diagnosed 176 consecutive new patients as of November
      1995.

4. Relevant information

    Features are computed from a digitized image of a fine needle
    aspirate (FNA) of a breast mass. They describe
    characteristics of the cell nuclei present in the image.
    A few of the images can be found at
    http://www.cs.wisc.edu/~street/images/

    Separating plane described above was obtained using
    Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree
    Construction Via Linear Programming." Proceedings of the 4th
    Midwest Artificial Intelligence and Cognitive Science Society,
    pp. 97-101, 1992], a classification method which uses linear
    programming to construct a decision tree. Relevant features
    were selected using an exhaustive search in the space of 1-4
    features and 1-3 separating planes.

    The actual linear program used to obtain the separating plane
    in the 3-dimensional space is that described in:
    [K. P. Bennett and O. L. Mangasarian: "Robust Linear
    Programming Discrimination of Two Linearly Inseparable Sets",
    Optimization Methods and Software 1, 1992, 23-34].

    This database is also available through the UW CS ftp server:

    ftp ftp.cs.wisc.edu
    cd math-prog/cpo-dataset/machine-learn/WDBC/

5. Number of instances: 569

6. Number of attributes: 32 (ID, diagnosis, 30 real-valued input features)

7. Attribute information

    1) ID number
    2) Diagnosis (M = malignant, B = benign)
    3-32)

    Ten real-valued features are computed for each cell nucleus:

    a) radius (mean of distances from center to points on the perimeter)
    b) texture (standard deviation of gray-scale values)
    c) perimeter
    d) area
    e) smoothness (local variation in radius lengths)
    f) compactness (perimeter^2 / area - 1.0)
    g) concavity (severity of concave portions of the contour)
    h) concave points (number of concave portions of the contour)
    i) symmetry
    j) fractal dimension ("coastline approximation" - 1)

    Several of the papers listed above contain detailed descriptions of
    how these features are computed.

    The mean, standard error, and "worst" or largest (mean of the three
    largest values) of these features were computed for each image,
    resulting in 30 features. For instance, field 3 is Mean Radius, field
    13 is Radius SE, field 23 is Worst Radius.

    All feature values are recorded with four significant digits.

8. Missing attribute values: none

9. Class distribution: 357 benign, 212 malignant
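For reference, the 30 feature names used by the training scripts above follow directly from this attribute description (ten base features, each with a mean, standard error, and "worst" value); a sketch that generates them in the same order as the scripts' columns list:

    base = ['radius', 'texture', 'perimeter', 'area', 'smoothness',
            'compactness', 'concavity', 'concave_points', 'symmetry',
            'fractal_dimension']
    columns = ['ID', 'Diagnosis'] + [f'{b}_{s}' for s in ('mean', 'se', 'worst') for b in base]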