From 455b48c89b2295c303213b32f84fa69eb552be46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Batuhan=20Berk=20Ba=C5=9Fo=C4=9Flu?= Date: Thu, 18 Sep 2025 20:58:02 -0400 Subject: [PATCH] Finished the implementation of the python code. --- linear-regression-parkinsons.py | 22 +- logistic-regression-wdbc.py | 235 +++++++++--------- ...-batch-sgd-linear-regression-parkinsons.py | 19 +- mini-batch-sgd-logistic-regression-wdbc.py | 176 ++++++++++++- parkinsons_updrs.names | 107 ++++++++ wdbc.names | 140 +++++++++++ 6 files changed, 540 insertions(+), 159 deletions(-) create mode 100755 parkinsons_updrs.names create mode 100755 wdbc.names diff --git a/linear-regression-parkinsons.py b/linear-regression-parkinsons.py index e2f73e7..8759023 100644 --- a/linear-regression-parkinsons.py +++ b/linear-regression-parkinsons.py @@ -27,8 +27,8 @@ class LinearRegression: class LinearRegression: ''' - Constructor for the Linear Regression with analytical. It uses bias. It also - initializes the weight, mean and std. + Constructor for the linear regression with analytical solution. It uses bias. It also + initializes the weight, mean and standard deviation. ''' def __init__(self, add_bias): self.add_bias = add_bias # bias to prepend a column of ones (the intercept term) @@ -60,7 +60,8 @@ class LinearRegression: def fit(self, x: pd.DataFrame, y: pd.Series) -> "LinearRegression": ''' Fit method to fit X and Y datas through pandas and train the linear model by analytical solution. - It uses pandas DataFrame for the X and Series for the Y. + It uses pandas DataFrame for the X and Series for the Y. It uses the linear regression formula + to calculate weight ''' x = self.prepare(x) y = pd.Series(y).astype("float64") @@ -84,7 +85,7 @@ class LinearRegression: def predict(self, x: pd.DataFrame) -> pd.Series: ''' - Predict method is used to test trained data to do X prediction by multiplying X and weight vectors. + Predict method is used to test trained data to do Y prediction by multiplying X and weight vectors. ''' if self.w is None: # if weight is empty, throw error raise RuntimeError("Model is not fitted yet. Call `fit` first.") @@ -95,7 +96,7 @@ class LinearRegression: def score(self, x: pd.DataFrame, y: pd.Series) -> float: ''' This method is used to calculate coefficient of determination to assess the goodness - of fit from a regression model + of fit from the linear regression model ''' y_pred = self.predict(x) # predicts Y value with X predict method. 
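+        # coefficient of determination: R² = 1 - SS_res / SS_tot, with SS_res = Σ(y - ŷ)² and SS_tot = Σ(y - ȳ)²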
y = pd.Series(y).astype('float64') @@ -127,7 +128,7 @@ if __name__ == "__main__": df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values df.dropna(inplace=True) # remove null values - print(f"Rows remaining after drop of the null values: {len(df)}") + print(f"Rows remaining after drop of the null values: {len(df)}\n") # sanity checks for data validity - realistic parkinson data range estimations df = df[(df['age'] >= 18) & (df['age'] <= 95)] @@ -157,12 +158,9 @@ if __name__ == "__main__": # evaluation of the model print("\nR² on training data:", model.score(x_train, y_train)) - print("\nR² on testing data:", model.score(x_test, y_test)) + print("R² on testing data:", model.score(x_test, y_test)) # predict Y values using the trained data preds = model.predict(x_test) - print("\nFirst 5 predictions:") - print(preds.head()) - - print("\nWeights:") - print(model.w.round(4)) + print("\nFirst 10 predictions:") + print(preds.head(10)) diff --git a/logistic-regression-wdbc.py b/logistic-regression-wdbc.py index 601db11..3e6b84f 100644 --- a/logistic-regression-wdbc.py +++ b/logistic-regression-wdbc.py @@ -1,144 +1,126 @@ import numpy as np import pandas as pd -class LogisticRegressionGD: - """Binary logistic regression trained with batch gradient descent.""" - def __init__(self, - learning_rate: float = 0.01, - n_iter: int = 1000, - tolerance: float = 1e-5, - verbose: bool = False): - """ - Parameters - ---------- - learning_rate : float - Step size for weight updates. - n_iter : int - Maximum number of iterations. - tolerance : float - Stopping criterion: if the change in loss is < tolerance, stop. - verbose : bool - If True, prints loss at every 100 iterations. - """ + +class LogisticRegression: + ''' + Constructor for the logistic regression with gradient descent. It uses learning rate, iteration number, + tolerance and verbose. It also initializes the weight, loss, x, y, mean and std. + ''' + + def __init__(self, learning_rate: float, n_iter: int, tolerance: float, verbose: bool) -> None: self.lr = learning_rate self.n_iter = n_iter self.tol = tolerance self.verbose = verbose + self.w: np.ndarray | None = None # weight/coefficient (bias as first element) + self.loss: list[float] = [] # loss per iteration + self.x: np.ndarray | None = None # matrix of inputs after standardisation + self.y: np.ndarray | None = None # target vector + self.mean: np.ndarray | None = None # used for standardisation + self.std: np.ndarray | None = None # standard deviation - # placeholders that will be filled during training - self.w_ = None # weights (including bias as w[0]) - self.loss_history_ = [] # loss at each iteration - self.X_ = None # feature matrix (after standardisation) - self.y_ = None # target vector (0/1) - - # ------------------------------------------------------------------ - # 2. Sigmoid helper (vectorised) - # ------------------------------------------------------------------ @staticmethod - def _sigmoid(z: np.ndarray) -> np.ndarray: - return 1.0 / (1.0 + np.exp(-z)) + def sigmoid(z: np.ndarray) -> np.ndarray: + """Sigmoid method for the logistic regression method.""" + return 1.0 / (1.0 + np.exp(-z)) # 1/(1+exp(-z)) - # ------------------------------------------------------------------ - # 3. 
Cost function (cross‑entropy)
-    # ------------------------------------------------------------------
     @staticmethod
-    def _cost(y: np.ndarray, p: np.ndarray) -> float:
-        # avoid log(0) by clipping
+    def cost(y: np.ndarray, p: np.ndarray) -> float:
+        """Cross‑entropy loss is used for the cost calculation."""
         eps = 1e-15
         p = np.clip(p, eps, 1 - eps)
         return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
 
-    # ------------------------------------------------------------------
-    # 4. Data preparation – this is where we split X / y, scale, etc.
-    # ------------------------------------------------------------------
-    def prepare(self, df: pd.DataFrame, target_col: str = 'Diagnosis') -> None:
+    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
         """
-        Splits `df` into X and y, standardises X (mean=0, std=1),
-        and stores the result in the class attributes.
-        Parameters
-        ----------
-        df : pd.DataFrame
-            Cleaned data – *already* contains a numeric target in `target_col`.
-        target_col : str
-            Name of the binary target column.
+        Preparation method splits df into x and y. It defines the X and Y values from the dataframe and the target column.
+        Then it does standardisation, adds bias and initializes the weight/coefficient.
+
         """
-        # target must be a 0/1 array
-        self.y_ = df[target_col].values.astype(np.int64)
+        if target_col not in df.columns:
+            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")
 
-        # X – all columns except the target
-        X_raw = df.drop(columns=[target_col]).values.astype(np.float64)
+        self.y = df[target_col].values.astype(np.int64)
 
-        # -----------------------------------------------------------------
-        # 3.1 Feature scaling – we put the bias in the first column
-        # -----------------------------------------------------------------
-        # compute mean / std on the whole training set (no train/val split yet)
-        self.mean_ = X_raw.mean(axis=0)
-        self.std_ = X_raw.std(axis=0)
-        # avoid division by zero
-        self.std_[self.std_ == 0] = 1.0
+        x_raw = df.drop(columns=[target_col]).values.astype(np.float64)
 
-        X_scaled = (X_raw - self.mean_) / self.std_
-        # add bias column (all ones)
-        X_scaled = np.hstack([np.ones((X_scaled.shape[0], 1)), X_scaled])
+        # standardisation
+        self.mean = x_raw.mean(axis=0)
+        self.std = x_raw.std(axis=0)
+        self.std[self.std == 0] = 1.0
 
-        self.X_ = X_scaled
-        self.w_ = np.zeros(X_scaled.shape[1])   # initialise weights
+        x_scaled = (x_raw - self.mean) / self.std # standardisation formula
+
+
+        bias = np.ones((x_scaled.shape[0], 1), dtype=np.float64) # adding bias
+        self.x = np.hstack((bias, x_scaled))
+
+        self.w = np.zeros(self.x.shape[1], dtype=np.float64) # initialize weight as zero
 
-    # ------------------------------------------------------------------
-    # 4. Fit – batch gradient descent
-    # ------------------------------------------------------------------
     def fit(self) -> None:
-        """Runs batch gradient descent for `n_iter` epochs."""
+        """
+
+        Fit method to fit the X and Y data and train the logistic model by batch gradient descent.
+        For each of the n iterations, it computes probabilities through the sigmoid of the linear prediction,
+        takes a gradient step on the weights and records the loss.
+
+        """
+        if self.x is None or self.y is None: # if x or y are empty, throw error
+            raise RuntimeError("Model is not fitted yet. Call `prepare` first.")
+
         for i in range(1, self.n_iter + 1):
-            z = np.dot(self.X_, self.w_)          # linear part
-            p = self._sigmoid(z)                  # predicted probabilities
+            z = self.x.dot(self.w) # linear prediction
+            p = self.sigmoid(z) # probabilities of the model predictions
 
-            # gradient of the log‑likelihood (including bias)
-            gradient = np.dot(self.X_.T, (p - self.y_)) / self.y_.size
+            gradient = self.x.T.dot(p - self.y) / self.y.size # gradient calculation formula
 
-            # weight update
-            self.w_ -= self.lr * gradient
+            self.w -= self.lr * gradient # the gradient scaled by the learning rate is subtracted from the weights
 
-            # record cost and check stopping criterion
-            loss = self._cost(self.y_, p)
-            self.loss_history_.append(loss)
+            loss = self.cost(self.y, p) # the cross‑entropy cost is calculated and recorded for the current iteration
+            self.loss.append(loss)
 
+            # if verbose, print the loss every 100 iterations
             if self.verbose and i % 100 == 0:
-                print(f"Iteration {i:4d} – loss: {loss:.6f}")
+                print(f"Iter {i:4d} – loss: {loss:.6f}")
 
-            if i > 1 and abs(self.loss_history_[-2] - loss) < self.tol:
+            # tests whether the absolute change in loss is smaller than the tolerance
+            if i > 1 and abs(self.loss[-2] - loss) < self.tol:
                 if self.verbose:
                     print(f"Converged after {i} iterations.")
-                break
+                break # the loss has stopped improving, so further training is unnecessary
 
-    # ------------------------------------------------------------------
-    # 5. Predict – binary class labels
-    # ------------------------------------------------------------------
-    def predict(self, X: np.ndarray) -> np.ndarray:
-        """Return 0/1 predictions for a new X matrix (already scaled)."""
-        z = np.dot(X, self.w_)
-        probs = self._sigmoid(z)
-        return (probs >= 0.5).astype(int)
-
-    # ------------------------------------------------------------------
-    # 6. Score – accuracy on a given (X, y) pair
-    # ------------------------------------------------------------------
-    def score(self, X: np.ndarray, y: np.ndarray) -> float:
-        """Return the classification accuracy."""
-        y_pred = self.predict(X)
-        return np.mean(y_pred == y)
+    def predict(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
+        """
+        Predict method multiplies X by the weight vector, applies the sigmoid function to get the model
+        probabilities and thresholds them to produce the Y predictions.
+        """
+        if isinstance(x, pd.DataFrame): # verifies value type
+            x = x.values.astype(np.float64)
+        if x.ndim == 1:
+            x = x.reshape(1, -1)
+        z = x.dot(self.w)
+        probs = self.sigmoid(z) # probability calculation through sigmoid method
+        return (probs >= 0.5).astype(int) # probabilities of at least 0.5 are classified as positive
+
+    def score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        This method calculates the mean accuracy by comparing the predicted and the actual Y values.
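+        Accuracy is the fraction of samples whose predicted label matches the true label.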
+ """ + y_pred = self.predict(x) + y_true = np.asarray(y).astype(int) + return np.mean(y_pred == y_true) # mean is calculated if Y values match if __name__ == "__main__": columns = [ 'ID', 'Diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', - 'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean', + 'compactness_mean', 'concavitymean', 'concave_points_mean', 'symmetrymean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', - 'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se', + 'compactness_se', 'concavityse', 'concave_points_se', 'symmetryse', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', - 'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst' + 'compactness_worst', 'concavityworst', 'concave_points_worst', 'symmetryworst', 'fractal_dimension_worst' ] df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str) @@ -155,7 +137,7 @@ if __name__ == "__main__": df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values df.dropna(inplace=True) # remove null values - print(f"Rows remaining after drop of the null values: {len(df)}") + print(f"Rows remaining after drop of the null values: {len(df)}\n") for col in num_cols: df = df[df[col] >= 0] @@ -172,33 +154,40 @@ if __name__ == "__main__": # check if there are still null values assert df.isna().sum().sum() == 0, "There are still some null values." - df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0}) # making diagnosis numeric - df['Diagnosis'] = df['Diagnosis'].astype('category') + # making diagnosis numeric + df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category") - # ---- 7.2 Instantiate and train ------------------------------------ - model = LogisticRegressionGD(learning_rate=0.05, - n_iter=5000, - tolerance=1e-6, - verbose=True) + rng = np.random.default_rng(seed=42) + n_train = len(df) + indices = rng.permutation(n_train) + train_size = int(0.8 * n_train) - # we need to split X / y here - X = df.drop(columns=['Diagnosis']) - y = df['Diagnosis'].cat.codes.values # 0/1 array + train_idx = indices[:train_size] + test_idx = indices[train_size:] - # Standardise X inside the model for us – we’ll do it in `prepare` - model.X_ = (X - X.mean()) / X.std() # bias‑column will be added later - model.X_ = np.hstack([np.ones((model.X_.shape[0], 1)), model.X_]) # add bias - model.y_ = y + df_train = df.iloc[train_idx].reset_index(drop=True) + df_test = df.iloc[test_idx].reset_index(drop=True) - # Fit the model + # training of the model + model = LogisticRegression(learning_rate=0.00005, n_iter=5000, tolerance=1e-6, verbose=True) + # other values could be used, for example (lr=0.01, n_iter=2000, tolerance=1e-3, verbose=False) + model.prepare(df_train, target_col="Diagnosis") model.fit() - # ------------------------------------------------- - # 8. 
Evaluate on the same data (you could split) - # ------------------------------------------------- - acc = model.score(model.X_, model.y_) - print(f"Training accuracy (on the whole cleaned set): {acc:.4f}") + # evaluation of the model + train_acc = model.score(model.x, model.y) + print(f"\nMean accuracy on training data: {train_acc:.4f}") - # Example: predict on the first 10 samples - y_hat = model.predict(model.X_[:10]) - print("First 10 predictions:", y_hat) + # copied prepare method for building test X data + x_test_raw = df_test.drop(columns=['Diagnosis']).values.astype(np.float64) + x_test_scaled = (x_test_raw - model.mean) / model.std + bias_test = np.ones((x_test_scaled.shape[0], 1), dtype=np.float64) + X_test = np.hstack((bias_test, x_test_scaled)) + y_test = df_test['Diagnosis'].values.astype(int) + test_acc = model.score(X_test, y_test) + print(f"Mean accuracy on testing data: {test_acc:.4f}") + + # predict Y values using the trained data + first_10 = X_test[:10] + y_hat = model.predict(first_10) + print("\nFirst 10 predictions:", y_hat.ravel()) \ No newline at end of file diff --git a/mini-batch-sgd-linear-regression-parkinsons.py b/mini-batch-sgd-linear-regression-parkinsons.py index bf0349f..66f3e7e 100644 --- a/mini-batch-sgd-linear-regression-parkinsons.py +++ b/mini-batch-sgd-linear-regression-parkinsons.py @@ -3,8 +3,8 @@ import pandas as pd class LinearRegression: ''' - Constructor for the Linear Regression with mini‑batch stochastic gradient descent. It uses learning rate, - iteration number, batch size, bias and verbose. It also initializes the weight, mean and std. + Constructor for the linear regression with mini‑batch stochastic gradient descent. It uses learning rate, + iteration number, batch size, bias and verbose. It also initializes the weight, mean and standard deviation. ''' def __init__(self, lr, n_iter, batch_size, add_bias, verbose): self.lr = lr # learning rate @@ -90,7 +90,7 @@ class LinearRegression: def predict(self, x: pd.DataFrame) -> pd.Series: ''' - Predict method makes X prediction by multiplying X and weight vectors. + Predict method is used to test trained data to do Y prediction by multiplying X and weight vectors. ''' if self.w is None: # if weight is empty, throw error raise RuntimeError("Model is not fitted yet. Call `fit` first.") @@ -101,7 +101,7 @@ class LinearRegression: def score(self, x: pd.DataFrame, y: pd.Series) -> float: ''' This method is used to calculate coefficient of determination to assess the goodness - of fit from a regression model + of fit from the linear regression model ''' y_pred = self.predict(x) # predicts Y value with X predict method. 
y = pd.Series(y).astype('float64')
 
@@ -133,7 +133,7 @@ if __name__ == "__main__":
             df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values
 
     df.dropna(inplace=True) # remove null values
-    print(f"Rows remaining after drop of the null values: {len(df)}")
+    print(f"Rows remaining after drop of the null values: {len(df)}\n")
 
     # sanity checks for data validity - realistic parkinson data range estimations
     df = df[(df['age'] >= 18) & (df['age'] <= 95)]
@@ -164,12 +164,9 @@ if __name__ == "__main__":
 
     # evaluation of the model
     print("\nR² on training data:", model.score(x_train, y_train))
-    print("\nR² on testing data:", model.score(x_test, y_test))
+    print("R² on testing data:", model.score(x_test, y_test))
 
     # predict Y values using the trained data
     preds = model.predict(x_test)
-    print("\nFirst 5 predictions:")
-    print(preds.head())
-
-    print("\nWeights:")
-    print(model.w.round(4))
\ No newline at end of file
+    print("\nFirst 10 predictions:")
+    print(preds.head(10))
\ No newline at end of file
diff --git a/mini-batch-sgd-logistic-regression-wdbc.py b/mini-batch-sgd-logistic-regression-wdbc.py
index 16b1f58..d2b238f 100644
--- a/mini-batch-sgd-logistic-regression-wdbc.py
+++ b/mini-batch-sgd-logistic-regression-wdbc.py
@@ -1,28 +1,143 @@
 import numpy as np
 import pandas as pd
 
-'''
+
 class LogisticRegression:
-    def __init__(self):
+    '''
+    Constructor for the logistic regression with mini‑batch stochastic gradient descent. It uses learning rate,
+    iteration number, batch size, tolerance and verbose. It also initializes the weight, loss, x, y, mean and std.
+    '''
 
-    def prepare(self):
+    def __init__(self, learning_rate: float, n_iter: int, batch_size: int, tolerance: float, verbose: bool) -> None:
+        self.lr = learning_rate
+        self.n_iter = n_iter
+        self.batch_size = batch_size
+        self.tol = tolerance
+        self.verbose = verbose
+        self.w: np.ndarray | None = None # weight/coefficient (bias as first element)
+        self.loss: list[float] = [] # loss per iteration
+        self.x: np.ndarray | None = None # matrix of inputs after standardisation
+        self.y: np.ndarray | None = None # target vector
+        self.mean: np.ndarray | None = None # used for standardisation
+        self.std: np.ndarray | None = None # standard deviation
 
-    def fit(self):
+    @staticmethod
+    def sigmoid(z: np.ndarray) -> np.ndarray:
+        """Sigmoid method for the logistic regression method."""
+        return 1.0 / (1.0 + np.exp(-z)) # 1/(1+exp(-z))
 
-    def predict(self):
+    @staticmethod
+    def cost(y: np.ndarray, p: np.ndarray) -> float:
+        """Cross‑entropy loss is used for the cost calculation."""
+        eps = 1e-15
+        p = np.clip(p, eps, 1 - eps)
+        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
 
-    def score(self):
-'''
+    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
+        """
+
+        Preparation method splits df into x and y. It defines the X and Y values from the dataframe and the target column.
+        Then it does standardisation, adds bias and initializes the weight/coefficient.
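+        Standardisation uses (x - mean) / std for every feature column; any zero standard deviation is replaced by 1 to avoid division by zero.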
+
+        """
+        if target_col not in df.columns:
+            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")
+
+        self.y = df[target_col].values.astype(np.int64)
+
+        x_raw = df.drop(columns=[target_col]).values.astype(np.float64)
+
+        # standardisation
+        self.mean = x_raw.mean(axis=0)
+        self.std = x_raw.std(axis=0)
+        self.std[self.std == 0] = 1.0
+
+        x_scaled = (x_raw - self.mean) / self.std # standardisation formula
+
+
+        bias = np.ones((x_scaled.shape[0], 1), dtype=np.float64) # adding bias
+        self.x = np.hstack((bias, x_scaled))
+
+        self.w = np.zeros(self.x.shape[1], dtype=np.float64) # initialize weight as zero
+
+    def fit(self) -> None:
+        """
+
+        Fit method to fit the X and Y data and train the logistic model by mini‑batch stochastic gradient descent.
+        For each epoch, it shuffles the data, computes probabilities through the sigmoid of the linear prediction
+        for every mini‑batch, updates the weights with the gradient and records the loss.
+
+        """
+        if self.x is None or self.y is None: # if x or y are empty, throw error
+            raise RuntimeError("Model is not fitted yet. Call `prepare` first.")
+
+        n_samples = self.x.shape[0]
+        batch_size = self.batch_size or n_samples
+
+        for epoch in range(1, self.n_iter + 1):
+            shuffled_idx = np.random.permutation(n_samples) # random permutation of the indices
+            x_shuffled = self.x[shuffled_idx]
+            y_shuffled = self.y[shuffled_idx]
+
+            # process each mini‑batch of the already shuffled data
+            for start in range(0, n_samples, batch_size):
+                end = start + batch_size
+
+                # slice the shuffled arrays directly for this batch
+                x_batch = x_shuffled[start:end]
+                y_batch = y_shuffled[start:end]
+
+                z = x_batch.dot(self.w)
+                p = self.sigmoid(z)
+
+                grad = x_batch.T.dot(p - y_batch) / y_batch.size # gradient calculation formula
+                self.w -= self.lr * grad # the gradient scaled by the learning rate is subtracted from the weights
+
+            # the cross‑entropy cost is calculated and recorded for the current epoch
+            loss = self.cost(self.y, self.sigmoid(self.x.dot(self.w)))
+            self.loss.append(loss)
+
+            # if verbose, print the loss every 100 epochs
+            if self.verbose and epoch % 100 == 0:
+                print(f"Iter {epoch:4d} – loss: {loss:.6f}")
+
+            # tests whether the absolute change in loss is smaller than the tolerance
+            if epoch > 1 and abs(self.loss[-2] - loss) < self.tol:
+                if self.verbose:
+                    print(f"Converged after {epoch} iterations.")
+                break
+
+    def predict(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
+        """
+        Predict method multiplies X by the weight vector, applies the sigmoid function to get the model
+        probabilities and thresholds them to produce the Y predictions.
+        """
+        if isinstance(x, pd.DataFrame): # verifies value type
+            x = x.values.astype(np.float64)
+        if x.ndim == 1:
+            x = x.reshape(1, -1)
+        z = x.dot(self.w)
+        probs = self.sigmoid(z) # probability calculation through sigmoid method
+        return (probs >= 0.5).astype(int) # probabilities of at least 0.5 are classified as positive
+
+    def score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        This method calculates the mean accuracy by comparing the predicted and the actual Y values.
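+        Accuracy = (number of correct predictions) / (total number of samples).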
+ """ + y_pred = self.predict(x) + y_true = np.asarray(y).astype(int) + return np.mean(y_pred == y_true) # mean is calculated if Y values match if __name__ == "__main__": columns = [ 'ID', 'Diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', - 'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean', + 'compactness_mean', 'concavitymean', 'concave_points_mean', 'symmetrymean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', - 'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se', + 'compactness_se', 'concavityse', 'concave_points_se', 'symmetryse', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', - 'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst' + 'compactness_worst', 'concavityworst', 'concave_points_worst', 'symmetryworst', 'fractal_dimension_worst' ] df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str) @@ -39,7 +154,7 @@ if __name__ == "__main__": df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values df.dropna(inplace=True) # remove null values - print(f"Rows remaining after drop of the null values: {len(df)}") + print(f"Rows remaining after drop of the null values: {len(df)}\n") for col in num_cols: df = df[df[col] >= 0] @@ -56,5 +171,40 @@ if __name__ == "__main__": # check if there are still null values assert df.isna().sum().sum() == 0, "There are still some null values." - df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0}) # making diagnosis numeric - df['Diagnosis'] = df['Diagnosis'].astype('category') \ No newline at end of file + # making diagnosis numeric + df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category") + + rng = np.random.default_rng(seed=42) + n_samples = len(df) + indices = rng.permutation(n_samples) + train_size = int(0.8 * n_samples) + + train_idx = indices[:train_size] + test_idx = indices[train_size:] + + df_train = df.iloc[train_idx].reset_index(drop=True) + df_test = df.iloc[test_idx].reset_index(drop=True) + + # training of the model + model = LogisticRegression(learning_rate=0.00005, n_iter=5000, batch_size=64, tolerance=1e-6, verbose=True) + # other values could be used, for example (lr=0.01, n_iter=2000, tolerance=1e-3, verbose=False) + model.prepare(df_train, target_col="Diagnosis") + model.fit() + + # evaluation of the model + train_acc = model.score(model.x, model.y) + print(f"\nMean accuracy on training data: {train_acc:.4f}") + + # copied prepare method for building test X data + x_test_raw = df_test.drop(columns=['Diagnosis']).values.astype(np.float64) + x_test_scaled = (x_test_raw - model.mean) / model.std + bias_test = np.ones((x_test_scaled.shape[0], 1), dtype=np.float64) + X_test = np.hstack((bias_test, x_test_scaled)) + y_test = df_test['Diagnosis'].values.astype(int) + test_acc = model.score(X_test, y_test) + print(f"Mean accuracy on testing data: {test_acc:.4f}") + + # predict Y values using the trained data + first_10 = X_test[:10] + y_hat = model.predict(first_10) + print("\nFirst 10 predictions:", y_hat.ravel()) diff --git a/parkinsons_updrs.names b/parkinsons_updrs.names new file mode 100755 index 0000000..c769d3b --- /dev/null +++ b/parkinsons_updrs.names @@ -0,0 +1,107 @@ +Parkinsons Telemonitoring Data Set + +Abstract: Oxford Parkinson's Disease Telemonitoring Dataset + 
+============================================================ + +Data Set Characteristics: Multivariate +Attribute Characteristics: Integer, Real +Associated Tasks: Regression +Number of Instances: 5875 +Number of Attributes: 26 +Area: Life +Date Donated: 2009-10-29 + +============================================================ + +SOURCE: + +The dataset was created by Athanasios Tsanas (tsanasthanasis '@' gmail.com) +and Max Little (littlem '@' physics.ox.ac.uk) of the University of Oxford, in +collaboration with 10 medical centers in the US and Intel Corporation who +developed the telemonitoring device to record the speech signals. The +original study used a range of linear and nonlinear regression methods to +predict the clinician's Parkinson's disease symptom score on the UPDRS scale. + + +============================================================ + +DATA SET INFORMATION: + +This dataset is composed of a range of biomedical voice measurements from 42 +people with early-stage Parkinson's disease recruited to a six-month trial of +a telemonitoring device for remote symptom progression monitoring. The +recordings were automatically captured in the patient's homes. + +Columns in the table contain subject number, subject age, subject gender, +time interval from baseline recruitment date, motor UPDRS, total UPDRS, and +16 biomedical voice measures. Each row corresponds to one of 5,875 voice +recording from these individuals. The main aim of the data is to predict the +motor and total UPDRS scores ('motor_UPDRS' and 'total_UPDRS') from the 16 +voice measures. + +The data is in ASCII CSV format. The rows of the CSV file contain an instance +corresponding to one voice recording. There are around 200 recordings per +patient, the subject number of the patient is identified in the first column. +For further information or to pass on comments, please contact Athanasios +Tsanas (tsanasthanasis '@' gmail.com) or Max Little (littlem '@' +physics.ox.ac.uk). + +Further details are contained in the following reference -- if you use this +dataset, please cite: +Athanasios Tsanas, Max A. Little, Patrick E. McSharry, Lorraine O. Ramig (2009), +'Accurate telemonitoring of Parkinson.s disease progression by non-invasive +speech tests', +IEEE Transactions on Biomedical Engineering (to appear). + +Further details about the biomedical voice measures can be found in: +Max A. Little, Patrick E. McSharry, Eric J. Hunter, Lorraine O. Ramig (2009), +'Suitability of dysphonia measurements for telemonitoring of Parkinson's +disease', +IEEE Transactions on Biomedical Engineering, 56(4):1015-1022 + + +=========================================================== + +ATTRIBUTE INFORMATION: + +subject# - Integer that uniquely identifies each subject +age - Subject age +sex - Subject gender '0' - male, '1' - female +test_time - Time since recruitment into the trial. The integer part is the +number of days since recruitment. 
+motor_UPDRS - Clinician's motor UPDRS score, linearly interpolated +total_UPDRS - Clinician's total UPDRS score, linearly interpolated +Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP - Several measures of +variation in fundamental frequency +Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA - +Several measures of variation in amplitude +NHR,HNR - Two measures of ratio of noise to tonal components in the voice +RPDE - A nonlinear dynamical complexity measure +DFA - Signal fractal scaling exponent +PPE - A nonlinear measure of fundamental frequency variation + + +=========================================================== + +RELEVANT PAPERS: + +Little MA, McSharry PE, Hunter EJ, Ramig LO (2009), +'Suitability of dysphonia measurements for telemonitoring of Parkinson's +disease', +IEEE Transactions on Biomedical Engineering, 56(4):1015-1022 + +Little MA, McSharry PE, Roberts SJ, Costello DAE, Moroz IM. +'Exploiting Nonlinear Recurrence and Fractal Scaling Properties for Voice +Disorder Detection', +BioMedical Engineering OnLine 2007, 6:23 (26 June 2007) + +=========================================================== + +CITATION REQUEST: + +If you use this dataset, please cite the following paper: +A Tsanas, MA Little, PE McSharry, LO Ramig (2009) +'Accurate telemonitoring of Parkinson.s disease progression by non-invasive +speech tests', +IEEE Transactions on Biomedical Engineering (to appear). diff --git a/wdbc.names b/wdbc.names new file mode 100755 index 0000000..3af8990 --- /dev/null +++ b/wdbc.names @@ -0,0 +1,140 @@ +1. Title: Wisconsin Diagnostic Breast Cancer (WDBC) + +2. Source Information + +a) Creators: + + Dr. William H. Wolberg, General Surgery Dept., University of + Wisconsin, Clinical Sciences Center, Madison, WI 53792 + wolberg@eagle.surgery.wisc.edu + + W. Nick Street, Computer Sciences Dept., University of + Wisconsin, 1210 West Dayton St., Madison, WI 53706 + street@cs.wisc.edu 608-262-6619 + + Olvi L. Mangasarian, Computer Sciences Dept., University of + Wisconsin, 1210 West Dayton St., Madison, WI 53706 + olvi@cs.wisc.edu + +b) Donor: Nick Street + +c) Date: November 1995 + +3. Past Usage: + +first usage: + + W.N. Street, W.H. Wolberg and O.L. Mangasarian + Nuclear feature extraction for breast tumor diagnosis. + IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science + and Technology, volume 1905, pages 861-870, San Jose, CA, 1993. + +OR literature: + + O.L. Mangasarian, W.N. Street and W.H. Wolberg. + Breast cancer diagnosis and prognosis via linear programming. + Operations Research, 43(4), pages 570-577, July-August 1995. + +Medical literature: + + W.H. Wolberg, W.N. Street, and O.L. Mangasarian. + Machine learning techniques to diagnose breast cancer from + fine-needle aspirates. + Cancer Letters 77 (1994) 163-171. + + W.H. Wolberg, W.N. Street, and O.L. Mangasarian. + Image analysis and machine learning applied to breast cancer + diagnosis and prognosis. + Analytical and Quantitative Cytology and Histology, Vol. 17 + No. 2, pages 77-87, April 1995. + + W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian. + Computerized breast cancer diagnosis and prognosis from fine + needle aspirates. + Archives of Surgery 1995;130:511-516. + + W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian. + Computer-derived nuclear features distinguish malignant from + benign breast cytology. + Human Pathology, 26:792--796, 1995. 
+ +See also: + http://www.cs.wisc.edu/~olvi/uwmp/mpml.html + http://www.cs.wisc.edu/~olvi/uwmp/cancer.html + +Results: + + - predicting field 2, diagnosis: B = benign, M = malignant + - sets are linearly separable using all 30 input features + - best predictive accuracy obtained using one separating plane + in the 3-D space of Worst Area, Worst Smoothness and + Mean Texture. Estimated accuracy 97.5% using repeated + 10-fold crossvalidations. Classifier has correctly + diagnosed 176 consecutive new patients as of November + 1995. + +4. Relevant information + + Features are computed from a digitized image of a fine needle + aspirate (FNA) of a breast mass. They describe + characteristics of the cell nuclei present in the image. + A few of the images can be found at + http://www.cs.wisc.edu/~street/images/ + + Separating plane described above was obtained using + Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree + Construction Via Linear Programming." Proceedings of the 4th + Midwest Artificial Intelligence and Cognitive Science Society, + pp. 97-101, 1992], a classification method which uses linear + programming to construct a decision tree. Relevant features + were selected using an exhaustive search in the space of 1-4 + features and 1-3 separating planes. + + The actual linear program used to obtain the separating plane + in the 3-dimensional space is that described in: + [K. P. Bennett and O. L. Mangasarian: "Robust Linear + Programming Discrimination of Two Linearly Inseparable Sets", + Optimization Methods and Software 1, 1992, 23-34]. + + + This database is also available through the UW CS ftp server: + + ftp ftp.cs.wisc.edu + cd math-prog/cpo-dataset/machine-learn/WDBC/ + +5. Number of instances: 569 + +6. Number of attributes: 32 (ID, diagnosis, 30 real-valued input features) + +7. Attribute information + +1) ID number +2) Diagnosis (M = malignant, B = benign) +3-32) + +Ten real-valued features are computed for each cell nucleus: + + a) radius (mean of distances from center to points on the perimeter) + b) texture (standard deviation of gray-scale values) + c) perimeter + d) area + e) smoothness (local variation in radius lengths) + f) compactness (perimeter^2 / area - 1.0) + g) concavity (severity of concave portions of the contour) + h) concave points (number of concave portions of the contour) + i) symmetry + j) fractal dimension ("coastline approximation" - 1) + +Several of the papers listed above contain detailed descriptions of +how these features are computed. + +The mean, standard error, and "worst" or largest (mean of the three +largest values) of these features were computed for each image, +resulting in 30 features. For instance, field 3 is Mean Radius, field +13 is Radius SE, field 23 is Worst Radius. + +All feature values are recoded with four significant digits. + +8. Missing attribute values: none + +9. Class distribution: 357 benign, 212 malignant \ No newline at end of file
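
As a quick cross-check of the attribute layout above against the column lists hard-coded in the Python scripts, the 30 feature names can be rebuilt from the ten base measurements and the mean/se/worst statistics. A minimal sketch follows; the identifiers below are illustrative only and not part of the data files ('concave_points' is written with an underscore to match the scripts):

    # rebuild the 32-column layout of wdbc.data: ID, Diagnosis, then 10 features x 3 statistics
    base_features = [
        'radius', 'texture', 'perimeter', 'area', 'smoothness',
        'compactness', 'concavity', 'concave_points', 'symmetry', 'fractal_dimension',
    ]
    columns = ['ID', 'Diagnosis'] + [
        f"{name}_{stat}" for stat in ('mean', 'se', 'worst') for name in base_features
    ]
    assert len(columns) == 32  # field 3 is radius_mean, field 13 is radius_se, field 23 is radius_worst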