diff --git a/linear-regression-parkinsons.py b/linear-regression-parkinsons.py
index bc9a196..e2f73e7 100644
--- a/linear-regression-parkinsons.py
+++ b/linear-regression-parkinsons.py
@@ -129,7 +129,7 @@ if __name__ == "__main__":
     df.dropna(inplace=True) # remove null values
     print(f"Rows remaining after drop of the null values: {len(df)}")

-    # sanity checks for data validity
+    # sanity checks for data validity - realistic ranges for Parkinson's data
     df = df[(df['age'] >= 18) & (df['age'] <= 95)]
     df = df[(df['motor_UPDRS'] >= 0) & (df['motor_UPDRS'] <= 100)]
     df = df[(df['total_UPDRS'] >= 0) & (df['total_UPDRS'] <= 100)]
diff --git a/logistic-regression-wdbc.py b/logistic-regression-wdbc.py
index 820006e..601db11 100644
--- a/logistic-regression-wdbc.py
+++ b/logistic-regression-wdbc.py
@@ -1,18 +1,134 @@
 import numpy as np
 import pandas as pd

-'''
-class LogisticRegression:
-    def __init__(self):
+class LogisticRegressionGD:
+    """Binary logistic regression trained with batch gradient descent."""
+    def __init__(self,
+                 learning_rate: float = 0.01,
+                 n_iter: int = 1000,
+                 tolerance: float = 1e-5,
+                 verbose: bool = False):
+        """
+        Parameters
+        ----------
+        learning_rate : float
+            Step size for weight updates.
+        n_iter : int
+            Maximum number of iterations.
+        tolerance : float
+            Stopping criterion: stop when the change in loss falls below this value.
+        verbose : bool
+            If True, print the loss every 100 iterations.
+        """
+        self.lr = learning_rate
+        self.n_iter = n_iter
+        self.tol = tolerance
+        self.verbose = verbose

-    def prepare(self):
+        # placeholders that will be filled during training
+        self.w_ = None           # weights (including bias as w[0])
+        self.loss_history_ = []  # loss at each iteration
+        self.X_ = None           # feature matrix (after standardisation)
+        self.y_ = None           # target vector (0/1)

-    def fit(self):
+    # ------------------------------------------------------------------
+    # 2. Sigmoid helper (vectorised)
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _sigmoid(z: np.ndarray) -> np.ndarray:
+        return 1.0 / (1.0 + np.exp(-z))

-    def predict(self):
+    # ------------------------------------------------------------------
+    # 3. Cost function (cross-entropy)
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _cost(y: np.ndarray, p: np.ndarray) -> float:
+        # avoid log(0) by clipping
+        eps = 1e-15
+        p = np.clip(p, eps, 1 - eps)
+        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
+
+    # ------------------------------------------------------------------
+    # 4. Data preparation - split X / y, scale, add the bias column
+    # ------------------------------------------------------------------
+    def prepare(self, df: pd.DataFrame, target_col: str = 'Diagnosis') -> None:
+        """
+        Splits `df` into X and y, standardises X (mean=0, std=1),
+        and stores the result in the class attributes.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Cleaned data - *already* contains a numeric target in `target_col`.
+        target_col : str
+            Name of the binary target column.
+        """
+        # target must be a 0/1 array
+        self.y_ = df[target_col].values.astype(np.int64)
+
+        # X - all columns except the target
+        X_raw = df.drop(columns=[target_col]).values.astype(np.float64)
+
+        # -----------------------------------------------------------------
+        # 4.1 Feature scaling - the bias goes in the first column
+        # -----------------------------------------------------------------
+        # compute mean / std on the whole training set (no train/val split yet)
+        self.mean_ = X_raw.mean(axis=0)
+        self.std_ = X_raw.std(axis=0)
+        # avoid division by zero
+        self.std_[self.std_ == 0] = 1.0
+
+        X_scaled = (X_raw - self.mean_) / self.std_
+        # add bias column (all ones)
+        X_scaled = np.hstack([np.ones((X_scaled.shape[0], 1)), X_scaled])
+
+        self.X_ = X_scaled
+        self.w_ = np.zeros(X_scaled.shape[1])  # initialise weights
+
+    # ------------------------------------------------------------------
+    # 5. Fit - batch gradient descent
+    # ------------------------------------------------------------------
+    def fit(self) -> None:
+        """Run batch gradient descent for at most `n_iter` iterations."""
+        for i in range(1, self.n_iter + 1):
+            z = np.dot(self.X_, self.w_)  # linear part
+            p = self._sigmoid(z)          # predicted probabilities
+
+            # gradient of the cross-entropy loss (bias included)
+            gradient = np.dot(self.X_.T, (p - self.y_)) / self.y_.size
+
+            # weight update
+            self.w_ -= self.lr * gradient
+
+            # record cost and check stopping criterion
+            loss = self._cost(self.y_, p)
+            self.loss_history_.append(loss)
+
+            if self.verbose and i % 100 == 0:
+                print(f"Iteration {i:4d} - loss: {loss:.6f}")
+
+            if i > 1 and abs(self.loss_history_[-2] - loss) < self.tol:
+                if self.verbose:
+                    print(f"Converged after {i} iterations.")
+                break
+
+    # ------------------------------------------------------------------
+    # 6. Predict - binary class labels
+    # ------------------------------------------------------------------
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """Return 0/1 labels for an X that is already standardised and has the bias column."""
+        z = np.dot(X, self.w_)
+        probs = self._sigmoid(z)
+        return (probs >= 0.5).astype(int)
+
+    # ------------------------------------------------------------------
+    # 7. Score - accuracy on a given (X, y) pair
+    # ------------------------------------------------------------------
+    def score(self, X: np.ndarray, y: np.ndarray) -> float:
+        """Return the classification accuracy."""
+        y_pred = self.predict(X)
+        return np.mean(y_pred == y)

-    def score(self):
-'''

 if __name__ == "__main__":
     columns = [
@@ -43,7 +159,7 @@ if __name__ == "__main__":
     for col in num_cols:
         df = df[df[col] >= 0]

-    # sanity checks for data validity
+    # sanity checks for data validity - plausible upper bounds on tumor measurements
     df = df[(df['radius_mean'] > 0) & (df['radius_mean'] <= 30)]
     df = df[(df['radius_worst'] > 0) & (df['radius_worst'] <= 30)]
     df = df[(df['texture_mean'] >= 0) & (df['texture_mean'] <= 100)]
@@ -57,4 +173,32 @@ if __name__ == "__main__":
     assert df.isna().sum().sum() == 0, "There are still some null values."
     df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0}) # making diagnosis numeric
-    df['Diagnosis'] = df['Diagnosis'].astype('category')
\ No newline at end of file
+    df['Diagnosis'] = df['Diagnosis'].astype('category')
+
+    # -------------------------------------------------
+    # 8. Instantiate and train
+    # -------------------------------------------------
+    model = LogisticRegressionGD(learning_rate=0.05,
+                                 n_iter=5000,
+                                 tolerance=1e-6,
+                                 verbose=True)
+
+    # prepare() splits X / y, standardises X (mean=0, std=1),
+    # prepends the bias column and initialises the weights, so
+    # model.X_, model.y_ and model.w_ need no manual assembly
+    train_df = df.copy()
+    train_df['Diagnosis'] = train_df['Diagnosis'].cat.codes  # back to 0/1 ints
+    model.prepare(train_df, target_col='Diagnosis')
+
+    # Fit the model
+    model.fit()
+
+    # -------------------------------------------------
+    # 9. Evaluate on the same data (a held-out split would be better)
+    # -------------------------------------------------
+    acc = model.score(model.X_, model.y_)
+    print(f"Training accuracy (on the whole cleaned set): {acc:.4f}")
+
+    # Example: predict on the first 10 samples
+    y_hat = model.predict(model.X_[:10])
+    print("First 10 predictions:", y_hat)
diff --git a/mini-batch-sgd-linear-regression-parkinsons.py b/mini-batch-sgd-linear-regression-parkinsons.py
index f46f8df..bf0349f 100644
--- a/mini-batch-sgd-linear-regression-parkinsons.py
+++ b/mini-batch-sgd-linear-regression-parkinsons.py
@@ -135,7 +135,7 @@ if __name__ == "__main__":
     df.dropna(inplace=True) # remove null values
     print(f"Rows remaining after drop of the null values: {len(df)}")

-    # sanity checks for data validity
+    # sanity checks for data validity - realistic ranges for Parkinson's data
     df = df[(df['age'] >= 18) & (df['age'] <= 95)]
     df = df[(df['motor_UPDRS'] >= 0) & (df['motor_UPDRS'] <= 100)]
     df = df[(df['total_UPDRS'] >= 0) & (df['total_UPDRS'] <= 100)]
diff --git a/mini-batch-sgd-logistic-regression-wdbc.py b/mini-batch-sgd-logistic-regression-wdbc.py
index 820006e..16b1f58 100644
--- a/mini-batch-sgd-logistic-regression-wdbc.py
+++ b/mini-batch-sgd-logistic-regression-wdbc.py
@@ -43,7 +43,7 @@ if __name__ == "__main__":
     for col in num_cols:
         df = df[df[col] >= 0]

-    # sanity checks for data validity
+    # sanity checks for data validity - plausible upper bounds on tumor measurements
     df = df[(df['radius_mean'] > 0) & (df['radius_mean'] <= 30)]
     df = df[(df['radius_worst'] > 0) & (df['radius_worst'] <= 30)]
     df = df[(df['texture_mean'] >= 0) & (df['texture_mean'] <= 100)]
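
The training-accuracy check in logistic-regression-wdbc.py evaluates on the same rows the model was fit on. A minimal sketch of a held-out evaluation is below; it is not part of the diff. It assumes `LogisticRegressionGD` and the cleaned WDBC frame `df` (with the categorical 0/1 'Diagnosis' column) are in scope in the same script (the hyphenated filename cannot be imported as a module), and it reuses the `mean_` / `std_` that `prepare()` stores so the test rows are scaled with training statistics only.

```python
import numpy as np

# shuffle row positions and hold out ~20% as a test set
rng = np.random.default_rng(42)
idx = rng.permutation(len(df))
n_test = len(df) // 5
test_idx, train_idx = idx[:n_test], idx[n_test:]

# fit on the training rows only; prepare() learns mean_ / std_ from them
model = LogisticRegressionGD(learning_rate=0.05, n_iter=5000, tolerance=1e-6)
train_df = df.iloc[train_idx].copy()
train_df['Diagnosis'] = train_df['Diagnosis'].cat.codes  # 0/1 ints
model.prepare(train_df, target_col='Diagnosis')
model.fit()

# scale the held-out rows with the *training* statistics, then add the bias column
X_test = df.iloc[test_idx].drop(columns=['Diagnosis']).values.astype(np.float64)
X_test = (X_test - model.mean_) / model.std_
X_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])
y_test = df.iloc[test_idx]['Diagnosis'].cat.codes.values

print(f"Held-out accuracy: {model.score(X_test, y_test):.4f}")
```

Because `prepare()` computes the scaling statistics only from the rows it is given, fitting on the training split and reusing `mean_` / `std_` for the test rows avoids leaking test-set information into the standardisation.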