From 7734802cd17fba2ad8012348677b1264d530a9b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Batuhan=20Berk=20Ba=C5=9Fo=C4=9Flu?=
Date: Mon, 15 Sep 2025 21:48:45 -0400
Subject: [PATCH] Added the Linear Regression implementations.

---
 .idea/.gitignore                                |   3 +
 .idea/Parkinsons-data.iml                      |   8 +
 .../inspectionProfiles/profiles_settings.xml   |   6 +
 .idea/misc.xml                                 |   7 +
 .idea/modules.xml                              |   8 +
 .idea/vcs.xml                                  |   6 +
 linear-regression-parkinsons.py                | 133 ++++++++++++++
 logistic-regression-wdbc.py                    |  48 +++++
 ...-batch-sgd-linear-regression-parkinsons.py  | 166 ++++++++++++++++++
 mini-batch-sgd-logistic-regression-wdbc.py     |  49 ++++++
 mini-batch-stochastic-gradient-descent.py      |   1 -
 11 files changed, 434 insertions(+), 1 deletion(-)
 create mode 100644 .idea/.gitignore
 create mode 100644 .idea/Parkinsons-data.iml
 create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 .idea/vcs.xml
 create mode 100644 mini-batch-sgd-linear-regression-parkinsons.py
 create mode 100644 mini-batch-sgd-logistic-regression-wdbc.py
 delete mode 100644 mini-batch-stochastic-gradient-descent.py

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/Parkinsons-data.iml b/.idea/Parkinsons-data.iml
new file mode 100644
index 0000000..909438d
--- /dev/null
+++ b/.idea/Parkinsons-data.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..a6218fe
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..7b6d2d8
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/linear-regression-parkinsons.py b/linear-regression-parkinsons.py
index bcb6f7e..536eb1c 100644
--- a/linear-regression-parkinsons.py
+++ b/linear-regression-parkinsons.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 
@@ -24,3 +25,135 @@ class LinearRegression:
+class LinearRegression:
+    '''
+    Linear Regression solved with the analytical (closed-form) solution. It can
+    optionally add a bias column and initialises the weights, mean and std.
+    '''
+    def __init__(self, add_bias):
+        self.add_bias = add_bias  # prepend a column of ones (the intercept term)
+        self.w = None             # weights/coefficients
+        self.mean = None          # feature means, used for standardisation
+        self.std = None           # feature standard deviations
+
+    def prepare(self, x: pd.DataFrame) -> pd.DataFrame:
+        '''
+        Preparation method: ensures X is a float DataFrame, standardises it and,
+        if add_bias is true, appends a bias column.
+        '''
+        x = x.copy()
+        x = x.astype('float64')
+
+        if self.mean is None:  # compute the standardisation statistics once
+            self.mean = x.mean()
+            self.std = x.std(ddof=0)
+            self.std.replace(0, 1, inplace=True)  # guard against division by zero
+
+        x = (x - self.mean) / self.std  # standardisation formula
+
+        if self.add_bias:  # adding bias
+            x['bias'] = 1.0
+
+        return x
+
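+    # `prepare` caches mean/std on its first call (the training data), so later
+    # calls standardise new data with the *training* statistics. A minimal sketch
+    # of the intended behaviour (toy values, illustrative only):
+    #
+    #   train = pd.DataFrame({'f': [1.0, 2.0, 3.0]})  # mean 2.0, std ~0.8165
+    #   model.prepare(train)                          # caches mean/std
+    #   model.prepare(pd.DataFrame({'f': [2.0]}))     # -> (2.0 - 2.0) / 0.8165 = 0.0
+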
+    def fit(self, x: pd.DataFrame, y: pd.Series) -> "LinearRegression":
+        '''
+        Fit method: fits X (a pandas DataFrame) and Y (a pandas Series) and trains
+        the linear model with the analytical solution.
+        '''
+        x = self.prepare(x)
+        y = pd.Series(y).astype('float64')
+
+        # convert to numpy for speed
+        x_np = x.to_numpy()           # (n_samples, n_features)
+        y_np = y.to_numpy()[:, None]  # (n_samples, 1)
+
+        # w = (X^T X)^-1 X^T y
+        xt_x = x_np.T.dot(x_np)
+        xt_y = x_np.T.dot(y_np)
+        w_np = np.linalg.pinv(xt_x).dot(xt_y)  # (n_features, 1)
+
+        # store the weights back as a pandas Series
+        self.w = pd.Series(
+            w_np.ravel(),  # flattens the array into a 1-D array
+            index=x.columns
+        )
+        return self
+
+    def predict(self, x: pd.DataFrame) -> pd.Series:
+        '''
+        Predict method: predicts Y for X by multiplying X with the weight vector.
+        '''
+        if self.w is None:  # if the weights are empty, throw an error
+            raise RuntimeError("Model is not fitted yet. Call `fit` first.")
+
+        x = self.prepare(x)  # standardisation and bias through the prepare method
+        return x.dot(self.w)
+
+    def score(self, x: pd.DataFrame, y: pd.Series) -> float:
+        '''
+        Calculates the coefficient of determination (R^2) to assess the goodness
+        of fit of the regression model.
+        '''
+        y_pred = self.predict(x)  # predict Y values with the predict method
+        y = pd.Series(y).astype('float64')
+        ss_res = ((y - y_pred) ** 2).sum()
+        # sum of squared residuals; residuals are the differences between Y and the Y predictions
+        ss_tot = ((y - y.mean()) ** 2).sum()
+        # total sum of squares; uses the differences between Y and the Y mean
+        return 1.0 - ss_res / ss_tot
+
+
+if __name__ == "__main__":
+    df = pd.read_csv('parkinsons_updrs.data', dtype=str)
+
+    df.drop(columns=['subject#'], inplace=True)  # drop the subject# column
+
+    missing_rows = df[df.isin(['?', 'NA', 'na', '']).any(axis=1)]  # check for null values
+    print(f"Rows with null values: {len(missing_rows)}")
+
+    df.replace(['?', 'NA', 'na', ''], pd.NA, inplace=True)  # replace null markers with the NA identifier
+
+    num_cols = [
+        'age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS',
+        'Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP',
+        'Shimmer', 'Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
+        'Shimmer:APQ11', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE'
+    ]
+
+    for col in num_cols:
+        df[col] = pd.to_numeric(df[col], errors='coerce')  # convert the columns to numeric values
+
+    df.dropna(inplace=True)  # remove null values
+    print(f"Rows remaining after dropping the null values: {len(df)}")
+
+    # check whether there are still null values
+    assert df.isna().sum().sum() == 0, "There are still some null values."
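+
+    # The split below is chronological (first 80% of the rows). A shuffled split
+    # is a common alternative; a minimal sketch (fixed seed, illustrative only):
+    #
+    #   rng = np.random.default_rng(0)
+    #   df = df.iloc[rng.permutation(len(df))].reset_index(drop=True)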
+
+    # split the X and Y values
+    target = 'total_UPDRS'
+    x = df.drop(columns=[target])
+    y = df[target]
+
+    # train / test split (80 / 20)
+    n_train = int(0.8 * len(x))
+    x_train, x_test = x.iloc[:n_train], x.iloc[n_train:]
+    y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]
+
+    # training of the model
+    model = LinearRegression(add_bias=True)
+    model.fit(x_train, y_train)
+
+    # evaluation of the model
+    print("\nR² on training data:", model.score(x_train, y_train))
+    print("\nR² on testing data:", model.score(x_test, y_test))
+
+    # predict Y values using the trained model
+    preds = model.predict(x_test)
+    print("\nFirst 5 predictions:")
+    print(preds.head())
+
+    print("\nWeights:")
+    print(model.w.round(4))
diff --git a/logistic-regression-wdbc.py b/logistic-regression-wdbc.py
index ffd919b..d6ad1e6 100644
--- a/logistic-regression-wdbc.py
+++ b/logistic-regression-wdbc.py
@@ -1 +1,49 @@
+import numpy as np
 import pandas as pd
+
+'''
+class LogisticRegression:
+    def __init__(self):
+
+    def prepare(self):
+
+    def fit(self):
+
+    def predict(self):
+
+    def score(self):
+'''
+
+if __name__ == "__main__":
+    columns = [
+        'ID', 'Diagnosis',
+        'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
+        'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
+        'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
+        'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
+        'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
+        'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
+    ]
+
+    df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
+
+    df.drop(columns=['ID'], inplace=True)  # drop the ID column
+
+    missing_rows = df[df.isin(['?', 'NA', 'na', '']).any(axis=1)]  # check for null values
+    print(f"Rows with null values: {len(missing_rows)}")
+
+    df.replace(['?', 'NA', 'na', ''], pd.NA, inplace=True)  # replace null markers with the NA identifier
+
+    num_cols = df.columns.difference(['Diagnosis'])
+    for col in num_cols:
+        df[col] = pd.to_numeric(df[col], errors='coerce')  # convert the columns to numeric values
+
+    df.dropna(inplace=True)  # remove null values
+    print(f"Rows remaining after dropping the null values: {len(df)}")
+
+    # drop physically impossible negative measurements
+    for col in num_cols:
+        df = df[df[col] >= 0]
+
+    # check whether there are still null values
+    assert df.isna().sum().sum() == 0, "There are still some null values."
+
+    df['Diagnosis'] = df['Diagnosis'].astype('category')
\ No newline at end of file
diff --git a/mini-batch-sgd-linear-regression-parkinsons.py b/mini-batch-sgd-linear-regression-parkinsons.py
new file mode 100644
index 0000000..e1f3adf
--- /dev/null
+++ b/mini-batch-sgd-linear-regression-parkinsons.py
@@ -0,0 +1,166 @@
+import numpy as np
+import pandas as pd
+
+class LinearRegression:
+    '''
+    Linear Regression trained with mini-batch stochastic gradient descent. It takes
+    a learning rate, an epoch count, a batch size, a bias flag and a verbose flag,
+    and initialises the weights, mean and std.
+    '''
+    def __init__(self, lr, n_iter, batch_size, add_bias, verbose):
+        self.lr = lr                  # learning rate
+        self.n_iter = n_iter          # number of gradient-descent epochs
+        self.batch_size = batch_size  # number of rows for each gradient step
+        self.add_bias = add_bias      # prepend a column of ones (the intercept term)
+        self.verbose = verbose        # if true, prints the mean squared error every 100 epochs
+        self.w = None                 # weights/coefficients
+        self.mean = None              # feature means, used for standardisation
+        self.std = None               # feature standard deviations
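+
+    # With n_samples = N and batch_size = B, `fit` performs ceil(N / B) gradient
+    # steps per epoch; e.g. N = 4700 training rows and B = 64 give 74 updates per
+    # epoch (illustrative arithmetic only).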
+
+    def prepare(self, x: pd.DataFrame) -> pd.DataFrame:
+        '''
+        Preparation method: ensures X is a float DataFrame, standardises it and,
+        if add_bias is true, appends a bias column.
+        '''
+        x = x.copy()
+        x = x.astype('float64')
+
+        if self.mean is None:  # compute the standardisation statistics once
+            self.mean = x.mean()
+            self.std = x.std(ddof=0)
+            self.std.replace(0, 1, inplace=True)  # guard against division by zero
+
+        x = (x - self.mean) / self.std  # standardisation formula
+
+        if self.add_bias:  # adding bias
+            x['bias'] = 1.0
+
+        return x
+
+    def fit(self, x: pd.DataFrame, y: pd.Series) -> "LinearRegression":
+        '''
+        Fit method: fits X (a pandas DataFrame) and Y (a pandas Series) and trains the
+        linear model by gradient descent. For each of the n_iter epochs it draws X and Y
+        batches from a random permutation of the indices, computes the gradient from the
+        differences between the predicted and the batch Y values, and updates the
+        weights. If verbose, it prints the mean squared error every 100 epochs.
+        '''
+        x = self.prepare(x)  # standardisation and bias through the prepare method
+        y = pd.Series(y).astype('float64')  # make sure Y is a Series
+
+        x_np = x.to_numpy()
+        y_np = y.to_numpy()
+
+        n_samples, n_features = x_np.shape
+        w_np = np.zeros(n_features)  # initialise the weights to zero
+        batch_size = self.batch_size
+        # fall back to full-batch gradient descent if the batch size is None or larger than n_samples
+        if batch_size is None or batch_size >= n_samples:
+            batch_size = n_samples
+
+        # number of batches per epoch
+        n_batches = int(np.ceil(n_samples / batch_size))
+
+        for epoch in range(1, self.n_iter + 1):
+            shuffled_idx = np.random.permutation(n_samples)  # random permutation of the indices
+            for b in range(n_batches):
+                start = b * batch_size
+                end = start + batch_size
+                idx = shuffled_idx[start:end]
+
+                # take the X and Y batch values at the permuted indices from start to end
+                x_batch = x_np[idx]
+                y_batch = y_np[idx]
+
+                # predict Y for the X batch by multiplying X with the weight vector
+                y_pred = x_batch.dot(w_np)
+
+                error = y_batch - y_pred  # the error is the difference between the Y batch and the Y prediction
+                # the gradient is -2/batch_size times the transposed X batch multiplied by the error
+                grad = -2 * x_batch.T.dot(error) / batch_size
+
+                w_np -= self.lr * grad  # move the weights against the gradient, scaled by the learning rate
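+
+                # Worked micro-example of one update (illustrative values only):
+                # with w = [0, 0], a single sample x = [1, 2], y = 3 and lr = 1e-4:
+                # y_pred = 0, error = 3, grad = -2 * [1, 2] * 3 / 1 = [-6, -12],
+                # so w <- w - lr * grad = [0.0006, 0.0012].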
+
+            # if verbose, calculate and display the mean squared error every 100 epochs
+            if self.verbose and epoch % 100 == 0:
+                y_full_pred = x_np.dot(w_np)
+                mse = ((y_np - y_full_pred) ** 2).mean()
+                print(f"Epoch {epoch:5d} | MSE: {mse:.6f}")
+
+        self.w = pd.Series(w_np, index=x.columns)  # store the weights back as a pandas Series
+        return self
+
+    def predict(self, x: pd.DataFrame) -> pd.Series:
+        '''
+        Predict method: predicts Y for X by multiplying X with the weight vector.
+        '''
+        if self.w is None:  # if the weights are empty, throw an error
+            raise RuntimeError("Model is not fitted yet. Call `fit` first.")
+
+        x = self.prepare(x)  # standardisation and bias through the prepare method
+        return x.dot(self.w)
+
+    def score(self, x: pd.DataFrame, y: pd.Series) -> float:
+        '''
+        Calculates the coefficient of determination (R^2) to assess the goodness
+        of fit of the regression model.
+        '''
+        y_pred = self.predict(x)  # predict Y values with the predict method
+        y = pd.Series(y).astype('float64')
+        ss_res = ((y - y_pred) ** 2).sum()
+        # sum of squared residuals; residuals are the differences between Y and the Y predictions
+        ss_tot = ((y - y.mean()) ** 2).sum()
+        # total sum of squares; uses the differences between Y and the Y mean
+        return 1.0 - ss_res / ss_tot
+
+
+if __name__ == "__main__":
+    df = pd.read_csv('parkinsons_updrs.data', dtype=str)
+
+    df.drop(columns=['subject#'], inplace=True)  # drop the subject# column
+
+    missing_rows = df[df.isin(['?', 'NA', 'na', '']).any(axis=1)]  # check for null values
+    print(f"Rows with null values: {len(missing_rows)}")
+
+    df.replace(['?', 'NA', 'na', ''], pd.NA, inplace=True)  # replace null markers with the NA identifier
+
+    num_cols = [
+        'age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS',
+        'Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP',
+        'Shimmer', 'Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
+        'Shimmer:APQ11', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE'
+    ]
+
+    for col in num_cols:
+        df[col] = pd.to_numeric(df[col], errors='coerce')  # convert the columns to numeric values
+
+    df.dropna(inplace=True)  # remove null values
+    print(f"Rows remaining after dropping the null values: {len(df)}")
+
+    # check whether there are still null values
+    assert df.isna().sum().sum() == 0, "There are still some null values."
+
+    # split the X and Y values
+    target = 'total_UPDRS'
+    x = df.drop(columns=[target])
+    y = df[target]
+
+    # train / test split (80 / 20)
+    n_train = int(0.8 * len(x))
+    x_train, x_test = x.iloc[:n_train], x.iloc[n_train:]
+    y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]
+
+    # training of the model
+    model = LinearRegression(lr=0.0001, n_iter=5000, batch_size=64, add_bias=True, verbose=True)
+    # other values could be used, for example (lr=0.01, n_iter=2000, batch_size=None, add_bias=True, verbose=False)
+    model.fit(x_train, y_train)
+
+    # evaluation of the model
+    print("\nR² on training data:", model.score(x_train, y_train))
+    print("\nR² on testing data:", model.score(x_test, y_test))
+
+    # predict Y values using the trained model
+    preds = model.predict(x_test)
+    print("\nFirst 5 predictions:")
+    print(preds.head())
+
+    print("\nWeights:")
+    print(model.w.round(4))
\ No newline at end of file
diff --git a/mini-batch-sgd-logistic-regression-wdbc.py b/mini-batch-sgd-logistic-regression-wdbc.py
new file mode 100644
index 0000000..d6ad1e6
--- /dev/null
+++ b/mini-batch-sgd-logistic-regression-wdbc.py
@@ -0,0 +1,49 @@
+import numpy as np
+import pandas as pd
+
+'''
+class LogisticRegression:
+    def __init__(self):
+
+    def prepare(self):
+
+    def fit(self):
+
+    def predict(self):
+
+    def score(self):
+'''
+
+if __name__ == "__main__":
+    columns = [
+        'ID', 'Diagnosis',
+        'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
+        'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
+        'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
+        'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
+        'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
+        'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
+    ]
+
+    df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
+
+    df.drop(columns=['ID'], inplace=True)  # drop the ID column
+
+    missing_rows = df[df.isin(['?', 'NA', 'na', '']).any(axis=1)]  # check for null values
+    print(f"Rows with null values: {len(missing_rows)}")
+
+    df.replace(['?', 'NA', 'na', ''], pd.NA, inplace=True)  # replace null markers with the NA identifier
+
+    num_cols = df.columns.difference(['Diagnosis'])
+    for col in num_cols:
+        df[col] = pd.to_numeric(df[col], errors='coerce')  # convert the columns to numeric values
+
+    df.dropna(inplace=True)  # remove null values
+    print(f"Rows remaining after dropping the null values: {len(df)}")
+
+    # drop physically impossible negative measurements
+    for col in num_cols:
+        df = df[df[col] >= 0]
+
+    # check whether there are still null values
+    assert df.isna().sum().sum() == 0, "There are still some null values."
+
+    df['Diagnosis'] = df['Diagnosis'].astype('category')
\ No newline at end of file
diff --git a/mini-batch-stochastic-gradient-descent.py b/mini-batch-stochastic-gradient-descent.py
deleted file mode 100644
index 8b13789..0000000
--- a/mini-batch-stochastic-gradient-descent.py
+++ /dev/null
@@ -1 +0,0 @@
-