Finished the implementation of the Python code.

This commit is contained in:
Batuhan Berk Başoğlu 2025-09-18 20:58:02 -04:00
parent 5702c3c1b8
commit 455b48c89b
Signed by: batuhan-basoglu
SSH key fingerprint: SHA256:kEsnuHX+qbwhxSAXPUQ4ox535wFHu/hIRaa53FzxRpo
6 changed files with 540 additions and 159 deletions

View file

@@ -27,8 +27,8 @@ class LinearRegression:
class LinearRegression:
    '''
    Constructor for the linear regression with analytical solution. It uses bias. It also
    initializes the weight, mean and standard deviation.
    '''
    def __init__(self, add_bias):
        self.add_bias = add_bias  # bias to prepend a column of ones (the intercept term)
@@ -60,7 +60,8 @@ class LinearRegression:
    def fit(self, x: pd.DataFrame, y: pd.Series) -> "LinearRegression":
        '''
        Fit method to fit the X and Y data through pandas and train the linear model by the analytical solution.
        It uses a pandas DataFrame for X and a Series for Y, and applies the closed-form linear regression
        formula to calculate the weights.
        '''
        x = self.prepare(x)
        y = pd.Series(y).astype("float64")
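The closed-form solution the docstring refers to is, in the standard formulation, the normal equation w = (X^T X)^(-1) X^T y. The line that computes it falls outside this hunk, so the following is only a minimal NumPy sketch of that formula, not the file's own implementation:

import numpy as np

def normal_equation(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    # solves w = (X^T X)^(-1) X^T y, assuming x already includes the bias column of ones;
    # np.linalg.lstsq is used instead of an explicit inverse for numerical stability
    return np.linalg.lstsq(x, y, rcond=None)[0]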
@@ -84,7 +85,7 @@ class LinearRegression:
    def predict(self, x: pd.DataFrame) -> pd.Series:
        '''
        Predict method is used to test trained data to do Y prediction by multiplying X and weight vectors.
        '''
        if self.w is None:  # if weight is empty, throw error
            raise RuntimeError("Model is not fitted yet. Call `fit` first.")
@@ -95,7 +96,7 @@ class LinearRegression:
    def score(self, x: pd.DataFrame, y: pd.Series) -> float:
        '''
        This method is used to calculate the coefficient of determination to assess the goodness
        of fit of the linear regression model.
        '''
        y_pred = self.predict(x)  # predict Y values with the predict method
        y = pd.Series(y).astype('float64')
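The coefficient of determination computed by score is presumably the usual R² = 1 - SS_res / SS_tot; its body is outside this hunk, so here is a minimal pandas sketch of that formula rather than the file's own code:

import pandas as pd

def r_squared(y: pd.Series, y_pred: pd.Series) -> float:
    # R^2 = 1 - sum((y - y_pred)^2) / sum((y - mean(y))^2)
    ss_res = ((y - y_pred) ** 2).sum()
    ss_tot = ((y - y.mean()) ** 2).sum()
    return 1.0 - ss_res / ss_tot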
@@ -127,7 +128,7 @@ if __name__ == "__main__":
        df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values
    df.dropna(inplace=True)  # remove null values
    print(f"Rows remaining after drop of the null values: {len(df)}\n")

    # sanity checks for data validity - realistic Parkinson's data range estimations
    df = df[(df['age'] >= 18) & (df['age'] <= 95)]
@@ -157,12 +158,9 @@ if __name__ == "__main__":
    # evaluation of the model
    print("\nR² on training data:", model.score(x_train, y_train))
    print("R² on testing data:", model.score(x_test, y_test))

    # predict Y values using the trained data
    preds = model.predict(x_test)
    print("\nFirst 10 predictions:")
    print(preds.head(10))

View file

@@ -1,144 +1,126 @@
import numpy as np
import pandas as pd


class LogisticRegression:
    '''
    Constructor for the logistic regression with gradient descent. It uses the learning rate, iteration number,
    tolerance and verbose flag. It also initializes the weight, loss, x, y, mean and standard deviation.
    '''
    def __init__(self, learning_rate: float, n_iter: int, tolerance: float, verbose: bool) -> None:
        self.lr = learning_rate
        self.n_iter = n_iter
        self.tol = tolerance
        self.verbose = verbose
        self.w: np.ndarray | None = None     # weight/coefficient vector (bias as first element)
        self.loss: list[float] = []          # loss per iteration
        self.x: np.ndarray | None = None     # matrix of inputs after standardisation
        self.y: np.ndarray | None = None     # target vector
        self.mean: np.ndarray | None = None  # used for standardisation
        self.std: np.ndarray | None = None   # standard deviation

    @staticmethod
    def sigmoid(z: np.ndarray) -> np.ndarray:
        """Sigmoid activation for the logistic regression model."""
        return 1.0 / (1.0 + np.exp(-z))  # 1 / (1 + exp(-z))

    @staticmethod
    def cost(y: np.ndarray, p: np.ndarray) -> float:
        """Cross-entropy loss is used for the cost calculation."""
        eps = 1e-15
        p = np.clip(p, eps, 1 - eps)  # avoid log(0) by clipping
        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
        """
        Preparation method splits df into X and y using the given target column.
        It then standardises X, adds a bias column and initializes the weight/coefficient vector.
        """
        if target_col not in df.columns:
            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")
        self.y = df[target_col].values.astype(np.int64)
        x_raw = df.drop(columns=[target_col]).values.astype(np.float64)

        # standardisation
        self.mean = x_raw.mean(axis=0)
        self.std = x_raw.std(axis=0)
        self.std[self.std == 0] = 1.0  # avoid division by zero
        x_scaled = (x_raw - self.mean) / self.std  # standardisation formula

        bias = np.ones((x_scaled.shape[0], 1), dtype=np.float64)  # the bias goes in the first column
        self.x = np.hstack((bias, x_scaled))
        self.w = np.zeros(self.x.shape[1], dtype=np.float64)  # initialize the weights to zero
    def fit(self) -> None:
        """
        Fit method trains the logistic model by batch gradient descent on the prepared X and Y data.
        For each of the n iterations it computes probabilities through the sigmoid of the linear
        prediction, takes a gradient step and records the loss.
        """
        if self.x is None or self.y is None:  # if x or y are empty, throw an error
            raise RuntimeError("Data is not prepared yet. Call `prepare` first.")
        for i in range(1, self.n_iter + 1):
            z = self.x.dot(self.w)  # linear prediction
            p = self.sigmoid(z)     # probabilities of the model predictions
            gradient = self.x.T.dot(p - self.y) / self.y.size  # gradient calculation formula

            self.w -= self.lr * gradient  # the gradient scaled by the learning rate is subtracted from the weights

            loss = self.cost(self.y, p)  # cost is calculated through cross-entropy for the current iteration
            self.loss.append(loss)

            # if verbose, show the loss every 100 iterations
            if self.verbose and i % 100 == 0:
                print(f"Iter {i:4d} loss: {loss:.6f}")
            # test whether the absolute change in loss is smaller than the tolerance
            if i > 1 and abs(self.loss[-2] - loss) < self.tol:
                if self.verbose:
                    print(f"Converged after {i} iterations.")
                break  # the loss has stopped improving, so further training would be unnecessary
    def predict(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
        """
        Predict method multiplies X by the weight vector, applies the sigmoid function to get the
        model probabilities and thresholds them into 0/1 class labels.
        """
        if isinstance(x, pd.DataFrame):  # verify the input type
            x = x.values.astype(np.float64)
        if x.ndim == 1:
            x = x.reshape(1, -1)
        z = x.dot(self.w)
        probs = self.sigmoid(z)            # probability calculation through the sigmoid method
        return (probs >= 0.5).astype(int)  # 0.5 is the usual threshold for the positive class

    def score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """
        This method calculates the mean accuracy between the predicted and the actual Y values.
        """
        y_pred = self.predict(x)
        y_true = np.asarray(y).astype(int)
        return np.mean(y_pred == y_true)  # fraction of matching Y values
if __name__ == "__main__":
    columns = [
        'ID', 'Diagnosis',
        'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
        'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
        'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
        'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
        'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
        'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
    ]

    df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
@@ -155,7 +137,7 @@ if __name__ == "__main__":
        df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values
    df.dropna(inplace=True)  # remove null values
    print(f"Rows remaining after drop of the null values: {len(df)}\n")

    for col in num_cols:
        df = df[df[col] >= 0]
@@ -172,33 +154,40 @@ if __name__ == "__main__":
    # check if there are still null values
    assert df.isna().sum().sum() == 0, "There are still some null values."

    # making diagnosis numeric
    df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")

    rng = np.random.default_rng(seed=42)
    n_train = len(df)
    indices = rng.permutation(n_train)
    train_size = int(0.8 * n_train)

    train_idx = indices[:train_size]
    test_idx = indices[train_size:]

    df_train = df.iloc[train_idx].reset_index(drop=True)
    df_test = df.iloc[test_idx].reset_index(drop=True)

    # training of the model
    model = LogisticRegression(learning_rate=0.00005, n_iter=5000, tolerance=1e-6, verbose=True)
    # other values could be used, for example (lr=0.01, n_iter=2000, tolerance=1e-3, verbose=False)
    model.prepare(df_train, target_col="Diagnosis")
    model.fit()

    # evaluation of the model
    train_acc = model.score(model.x, model.y)
    print(f"\nMean accuracy on training data: {train_acc:.4f}")

    # the scaling from the prepare method is replicated here to build the test X data
    x_test_raw = df_test.drop(columns=['Diagnosis']).values.astype(np.float64)
    x_test_scaled = (x_test_raw - model.mean) / model.std
    bias_test = np.ones((x_test_scaled.shape[0], 1), dtype=np.float64)
    X_test = np.hstack((bias_test, x_test_scaled))
    y_test = df_test['Diagnosis'].values.astype(int)

    test_acc = model.score(X_test, y_test)
    print(f"Mean accuracy on testing data: {test_acc:.4f}")

    # predict Y values using the trained data
    first_10 = X_test[:10]
    y_hat = model.predict(first_10)
    print("\nFirst 10 predictions:", y_hat.ravel())

View file

@@ -3,8 +3,8 @@ import pandas as pd
class LinearRegression:
    '''
    Constructor for the linear regression with minibatch stochastic gradient descent. It uses learning rate,
    iteration number, batch size, bias and verbose. It also initializes the weight, mean and standard deviation.
    '''
    def __init__(self, lr, n_iter, batch_size, add_bias, verbose):
        self.lr = lr  # learning rate
@@ -90,7 +90,7 @@ class LinearRegression:
    def predict(self, x: pd.DataFrame) -> pd.Series:
        '''
        Predict method is used to test trained data to do Y prediction by multiplying X and weight vectors.
        '''
        if self.w is None:  # if weight is empty, throw error
            raise RuntimeError("Model is not fitted yet. Call `fit` first.")
@@ -101,7 +101,7 @@ class LinearRegression:
    def score(self, x: pd.DataFrame, y: pd.Series) -> float:
        '''
        This method is used to calculate the coefficient of determination to assess the goodness
        of fit of the linear regression model.
        '''
        y_pred = self.predict(x)  # predict Y values with the predict method
        y = pd.Series(y).astype('float64')
@@ -133,7 +133,7 @@ if __name__ == "__main__":
        df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values
    df.dropna(inplace=True)  # remove null values
    print(f"Rows remaining after drop of the null values: {len(df)}\n")

    # sanity checks for data validity - realistic Parkinson's data range estimations
    df = df[(df['age'] >= 18) & (df['age'] <= 95)]
@@ -164,12 +164,9 @@ if __name__ == "__main__":
    # evaluation of the model
    print("\nR² on training data:", model.score(x_train, y_train))
    print("R² on testing data:", model.score(x_test, y_test))

    # predict Y values using the trained data
    preds = model.predict(x_test)
    print("\nFirst 10 predictions:")
    print(preds.head(10))
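The fit method itself is unchanged by these hunks, so it is not shown here. For reference, one epoch of minibatch stochastic gradient descent for linear regression typically looks like the sketch below; it is an illustration under the stated assumptions (x already contains the bias column, halved squared-error loss), not the file's own implementation:

import numpy as np

def sgd_epoch(x: np.ndarray, y: np.ndarray, w: np.ndarray, lr: float, batch_size: int) -> np.ndarray:
    # one shuffled pass over the data; the gradient of the (halved) mean squared error on a
    # minibatch is x_batch.T (x_batch w - y_batch) / len(batch)
    order = np.random.permutation(len(y))
    for start in range(0, len(y), batch_size):
        idx = order[start:start + batch_size]
        x_batch, y_batch = x[idx], y[idx]
        gradient = x_batch.T.dot(x_batch.dot(w) - y_batch) / len(idx)
        w = w - lr * gradient
    return w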

View file

@@ -1,28 +1,143 @@
import numpy as np
import pandas as pd


class LogisticRegression:
    '''
    Constructor for the logistic regression with minibatch stochastic gradient descent. It uses the learning
    rate, iteration number, batch size, tolerance and verbose flag. It also initializes the weight, loss, x, y,
    mean and standard deviation.
    '''
    def __init__(self, learning_rate: float, n_iter: int, batch_size: int, tolerance: float, verbose: bool) -> None:
        self.lr = learning_rate
        self.n_iter = n_iter
        self.batch_size = batch_size
        self.tol = tolerance
        self.verbose = verbose
        self.w: np.ndarray | None = None     # weight/coefficient vector (bias as first element)
        self.loss: list[float] = []          # loss per epoch
        self.x: np.ndarray | None = None     # matrix of inputs after standardisation
        self.y: np.ndarray | None = None     # target vector
        self.mean: np.ndarray | None = None  # used for standardisation
        self.std: np.ndarray | None = None   # standard deviation

    @staticmethod
    def sigmoid(z: np.ndarray) -> np.ndarray:
        """Sigmoid activation for the logistic regression model."""
        return 1.0 / (1.0 + np.exp(-z))  # 1 / (1 + exp(-z))

    @staticmethod
    def cost(y: np.ndarray, p: np.ndarray) -> float:
        """Cross-entropy loss is used for the cost calculation."""
        eps = 1e-15
        p = np.clip(p, eps, 1 - eps)  # avoid log(0) by clipping
        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
        """
        Preparation method splits df into X and y using the given target column.
        It then standardises X, adds a bias column and initializes the weight/coefficient vector.
        """
        if target_col not in df.columns:
            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")
        self.y = df[target_col].values.astype(np.int64)
        x_raw = df.drop(columns=[target_col]).values.astype(np.float64)

        # standardisation
        self.mean = x_raw.mean(axis=0)
        self.std = x_raw.std(axis=0)
        self.std[self.std == 0] = 1.0  # avoid division by zero
        x_scaled = (x_raw - self.mean) / self.std  # standardisation formula

        bias = np.ones((x_scaled.shape[0], 1), dtype=np.float64)  # the bias goes in the first column
        self.x = np.hstack((bias, x_scaled))
        self.w = np.zeros(self.x.shape[1], dtype=np.float64)  # initialize the weights to zero
    def fit(self) -> None:
        """
        Fit method trains the logistic model by minibatch stochastic gradient descent on the prepared
        X and Y data. For each epoch it shuffles the data, takes a gradient step per minibatch using
        the sigmoid of the linear prediction, and records the loss on the full dataset.
        """
        if self.x is None or self.y is None:  # if x or y are empty, throw an error
            raise RuntimeError("Data is not prepared yet. Call `prepare` first.")

        n_samples = self.x.shape[0]
        batch_size = self.batch_size or n_samples

        for epoch in range(1, self.n_iter + 1):
            shuffled_idx = np.random.permutation(n_samples)  # random permutation of the indices
            x_shuffled = self.x[shuffled_idx]
            y_shuffled = self.y[shuffled_idx]

            # process each minibatch of the shuffled data
            for start in range(0, n_samples, batch_size):
                end = start + batch_size
                x_batch = x_shuffled[start:end]
                y_batch = y_shuffled[start:end]

                z = x_batch.dot(self.w)
                p = self.sigmoid(z)
                grad = x_batch.T.dot(p - y_batch) / y_batch.size  # gradient calculation formula
                self.w -= self.lr * grad  # the gradient scaled by the learning rate is subtracted from the weights

            # cost is calculated through cross-entropy on the whole dataset for the current epoch
            loss = self.cost(self.y, self.sigmoid(self.x.dot(self.w)))
            self.loss.append(loss)

            # if verbose, show the loss every 100 epochs
            if self.verbose and epoch % 100 == 0:
                print(f"Iter {epoch:4d} loss: {loss:.6f}")
            # test whether the absolute change in loss is smaller than the tolerance
            if epoch > 1 and abs(self.loss[-2] - loss) < self.tol:
                if self.verbose:
                    print(f"Converged after {epoch} iterations.")
                break
    def predict(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
        """
        Predict method multiplies X by the weight vector, applies the sigmoid function to get the
        model probabilities and thresholds them into 0/1 class labels.
        """
        if isinstance(x, pd.DataFrame):  # verify the input type
            x = x.values.astype(np.float64)
        if x.ndim == 1:
            x = x.reshape(1, -1)
        z = x.dot(self.w)
        probs = self.sigmoid(z)            # probability calculation through the sigmoid method
        return (probs >= 0.5).astype(int)  # 0.5 is the usual threshold for the positive class

    def score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """
        This method calculates the mean accuracy between the predicted and the actual Y values.
        """
        y_pred = self.predict(x)
        y_true = np.asarray(y).astype(int)
        return np.mean(y_pred == y_true)  # fraction of matching Y values
if __name__ == "__main__":
    columns = [
        'ID', 'Diagnosis',
        'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
        'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
        'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
        'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
        'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
        'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
    ]

    df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
@@ -39,7 +154,7 @@ if __name__ == "__main__":
        df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values
    df.dropna(inplace=True)  # remove null values
    print(f"Rows remaining after drop of the null values: {len(df)}\n")

    for col in num_cols:
        df = df[df[col] >= 0]
@@ -56,5 +171,40 @@ if __name__ == "__main__":
    # check if there are still null values
    assert df.isna().sum().sum() == 0, "There are still some null values."

    # making diagnosis numeric
    df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")

    rng = np.random.default_rng(seed=42)
    n_samples = len(df)
    indices = rng.permutation(n_samples)
    train_size = int(0.8 * n_samples)

    train_idx = indices[:train_size]
    test_idx = indices[train_size:]

    df_train = df.iloc[train_idx].reset_index(drop=True)
    df_test = df.iloc[test_idx].reset_index(drop=True)

    # training of the model
    model = LogisticRegression(learning_rate=0.00005, n_iter=5000, batch_size=64, tolerance=1e-6, verbose=True)
    # other values could be used, for example (lr=0.01, n_iter=2000, tolerance=1e-3, verbose=False)
    model.prepare(df_train, target_col="Diagnosis")
    model.fit()

    # evaluation of the model
    train_acc = model.score(model.x, model.y)
    print(f"\nMean accuracy on training data: {train_acc:.4f}")

    # the scaling from the prepare method is replicated here to build the test X data
    x_test_raw = df_test.drop(columns=['Diagnosis']).values.astype(np.float64)
    x_test_scaled = (x_test_raw - model.mean) / model.std
    bias_test = np.ones((x_test_scaled.shape[0], 1), dtype=np.float64)
    X_test = np.hstack((bias_test, x_test_scaled))
    y_test = df_test['Diagnosis'].values.astype(int)

    test_acc = model.score(X_test, y_test)
    print(f"Mean accuracy on testing data: {test_acc:.4f}")

    # predict Y values using the trained data
    first_10 = X_test[:10]
    y_hat = model.predict(first_10)
    print("\nFirst 10 predictions:", y_hat.ravel())

parkinsons_updrs.names (new executable file, 107 lines)
View file

@@ -0,0 +1,107 @@
Parkinsons Telemonitoring Data Set
Abstract: Oxford Parkinson's Disease Telemonitoring Dataset
============================================================
Data Set Characteristics: Multivariate
Attribute Characteristics: Integer, Real
Associated Tasks: Regression
Number of Instances: 5875
Number of Attributes: 26
Area: Life
Date Donated: 2009-10-29
============================================================
SOURCE:
The dataset was created by Athanasios Tsanas (tsanasthanasis '@' gmail.com)
and Max Little (littlem '@' physics.ox.ac.uk) of the University of Oxford, in
collaboration with 10 medical centers in the US and Intel Corporation who
developed the telemonitoring device to record the speech signals. The
original study used a range of linear and nonlinear regression methods to
predict the clinician's Parkinson's disease symptom score on the UPDRS scale.
============================================================
DATA SET INFORMATION:
This dataset is composed of a range of biomedical voice measurements from 42
people with early-stage Parkinson's disease recruited to a six-month trial of
a telemonitoring device for remote symptom progression monitoring. The
recordings were automatically captured in the patient's homes.
Columns in the table contain subject number, subject age, subject gender,
time interval from baseline recruitment date, motor UPDRS, total UPDRS, and
16 biomedical voice measures. Each row corresponds to one of 5,875 voice
recordings from these individuals. The main aim of the data is to predict the
motor and total UPDRS scores ('motor_UPDRS' and 'total_UPDRS') from the 16
voice measures.
The data is in ASCII CSV format. The rows of the CSV file contain an instance
corresponding to one voice recording. There are around 200 recordings per
patient, and the subject number of the patient is identified in the first column.
For further information or to pass on comments, please contact Athanasios
Tsanas (tsanasthanasis '@' gmail.com) or Max Little (littlem '@'
physics.ox.ac.uk).
Further details are contained in the following reference -- if you use this
dataset, please cite:
Athanasios Tsanas, Max A. Little, Patrick E. McSharry, Lorraine O. Ramig (2009),
'Accurate telemonitoring of Parkinson's disease progression by non-invasive
speech tests',
IEEE Transactions on Biomedical Engineering (to appear).
Further details about the biomedical voice measures can be found in:
Max A. Little, Patrick E. McSharry, Eric J. Hunter, Lorraine O. Ramig (2009),
'Suitability of dysphonia measurements for telemonitoring of Parkinson's
disease',
IEEE Transactions on Biomedical Engineering, 56(4):1015-1022
===========================================================
ATTRIBUTE INFORMATION:
subject# - Integer that uniquely identifies each subject
age - Subject age
sex - Subject gender '0' - male, '1' - female
test_time - Time since recruitment into the trial. The integer part is the
number of days since recruitment.
motor_UPDRS - Clinician's motor UPDRS score, linearly interpolated
total_UPDRS - Clinician's total UPDRS score, linearly interpolated
Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP - Several measures of
variation in fundamental frequency
Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA -
Several measures of variation in amplitude
NHR,HNR - Two measures of ratio of noise to tonal components in the voice
RPDE - A nonlinear dynamical complexity measure
DFA - Signal fractal scaling exponent
PPE - A nonlinear measure of fundamental frequency variation
===========================================================
RELEVANT PAPERS:
Little MA, McSharry PE, Hunter EJ, Ramig LO (2009),
'Suitability of dysphonia measurements for telemonitoring of Parkinson's
disease',
IEEE Transactions on Biomedical Engineering, 56(4):1015-1022
Little MA, McSharry PE, Roberts SJ, Costello DAE, Moroz IM.
'Exploiting Nonlinear Recurrence and Fractal Scaling Properties for Voice
Disorder Detection',
BioMedical Engineering OnLine 2007, 6:23 (26 June 2007)
===========================================================
CITATION REQUEST:
If you use this dataset, please cite the following paper:
A Tsanas, MA Little, PE McSharry, LO Ramig (2009)
'Accurate telemonitoring of Parkinson's disease progression by non-invasive
speech tests',
IEEE Transactions on Biomedical Engineering (to appear).
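As a quick way to confirm the figures quoted above (5,875 recordings from 42 subjects), the accompanying data file can be loaded with pandas. A minimal sketch, assuming the data file is named parkinsons_updrs.data, sits next to this file, and starts with a header row matching the attribute list:

import pandas as pd

df = pd.read_csv('parkinsons_updrs.data')             # assumed filename and header row
print(len(df))                                        # expected: 5875 voice recordings
print(df['subject#'].nunique())                       # expected: 42 subjects
print(df[['motor_UPDRS', 'total_UPDRS']].describe())  # the two regression targets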

wdbc.names (new executable file, 140 lines)
View file

@@ -0,0 +1,140 @@
1. Title: Wisconsin Diagnostic Breast Cancer (WDBC)
2. Source Information
a) Creators:
Dr. William H. Wolberg, General Surgery Dept., University of
Wisconsin, Clinical Sciences Center, Madison, WI 53792
wolberg@eagle.surgery.wisc.edu
W. Nick Street, Computer Sciences Dept., University of
Wisconsin, 1210 West Dayton St., Madison, WI 53706
street@cs.wisc.edu 608-262-6619
Olvi L. Mangasarian, Computer Sciences Dept., University of
Wisconsin, 1210 West Dayton St., Madison, WI 53706
olvi@cs.wisc.edu
b) Donor: Nick Street
c) Date: November 1995
3. Past Usage:
first usage:
W.N. Street, W.H. Wolberg and O.L. Mangasarian
Nuclear feature extraction for breast tumor diagnosis.
IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science
and Technology, volume 1905, pages 861-870, San Jose, CA, 1993.
OR literature:
O.L. Mangasarian, W.N. Street and W.H. Wolberg.
Breast cancer diagnosis and prognosis via linear programming.
Operations Research, 43(4), pages 570-577, July-August 1995.
Medical literature:
W.H. Wolberg, W.N. Street, and O.L. Mangasarian.
Machine learning techniques to diagnose breast cancer from
fine-needle aspirates.
Cancer Letters 77 (1994) 163-171.
W.H. Wolberg, W.N. Street, and O.L. Mangasarian.
Image analysis and machine learning applied to breast cancer
diagnosis and prognosis.
Analytical and Quantitative Cytology and Histology, Vol. 17
No. 2, pages 77-87, April 1995.
W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian.
Computerized breast cancer diagnosis and prognosis from fine
needle aspirates.
Archives of Surgery 1995;130:511-516.
W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian.
Computer-derived nuclear features distinguish malignant from
benign breast cytology.
Human Pathology, 26:792--796, 1995.
See also:
http://www.cs.wisc.edu/~olvi/uwmp/mpml.html
http://www.cs.wisc.edu/~olvi/uwmp/cancer.html
Results:
- predicting field 2, diagnosis: B = benign, M = malignant
- sets are linearly separable using all 30 input features
- best predictive accuracy obtained using one separating plane
in the 3-D space of Worst Area, Worst Smoothness and
Mean Texture. Estimated accuracy 97.5% using repeated
10-fold cross-validations.  Classifier has correctly
diagnosed 176 consecutive new patients as of November
1995.
4. Relevant information
Features are computed from a digitized image of a fine needle
aspirate (FNA) of a breast mass. They describe
characteristics of the cell nuclei present in the image.
A few of the images can be found at
http://www.cs.wisc.edu/~street/images/
Separating plane described above was obtained using
Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree
Construction Via Linear Programming." Proceedings of the 4th
Midwest Artificial Intelligence and Cognitive Science Society,
pp. 97-101, 1992], a classification method which uses linear
programming to construct a decision tree. Relevant features
were selected using an exhaustive search in the space of 1-4
features and 1-3 separating planes.
The actual linear program used to obtain the separating plane
in the 3-dimensional space is that described in:
[K. P. Bennett and O. L. Mangasarian: "Robust Linear
Programming Discrimination of Two Linearly Inseparable Sets",
Optimization Methods and Software 1, 1992, 23-34].
This database is also available through the UW CS ftp server:
ftp ftp.cs.wisc.edu
cd math-prog/cpo-dataset/machine-learn/WDBC/
5. Number of instances: 569
6. Number of attributes: 32 (ID, diagnosis, 30 real-valued input features)
7. Attribute information
1) ID number
2) Diagnosis (M = malignant, B = benign)
3-32)
Ten real-valued features are computed for each cell nucleus:
a) radius (mean of distances from center to points on the perimeter)
b) texture (standard deviation of gray-scale values)
c) perimeter
d) area
e) smoothness (local variation in radius lengths)
f) compactness (perimeter^2 / area - 1.0)
g) concavity (severity of concave portions of the contour)
h) concave points (number of concave portions of the contour)
i) symmetry
j) fractal dimension ("coastline approximation" - 1)
Several of the papers listed above contain detailed descriptions of
how these features are computed.
The mean, standard error, and "worst" or largest (mean of the three
largest values) of these features were computed for each image,
resulting in 30 features. For instance, field 3 is Mean Radius, field
13 is Radius SE, field 23 is Worst Radius.
All feature values are recorded with four significant digits.
8. Missing attribute values: none
9. Class distribution: 357 benign, 212 malignant
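The class distribution quoted above can be checked directly against wdbc.data with pandas, using positional columns since the file has no header row; a minimal sketch:

import pandas as pd

df = pd.read_csv('wdbc.data', header=None)  # 32 columns: ID, diagnosis, then the 30 features
print(len(df))                              # expected: 569 instances
print(df[1].value_counts())                 # expected: B 357, M 212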