Finished the implementation of the Python code.
This commit is contained in:
parent 5702c3c1b8
commit 455b48c89b
6 changed files with 540 additions and 159 deletions
@@ -27,8 +27,8 @@ class LinearRegression:
class LinearRegression:
    '''
    Constructor for the Linear Regression with analytical. It uses bias. It also
    initializes the weight, mean and std.
    Constructor for the linear regression with analytical solution. It uses bias. It also
    initializes the weight, mean and standard deviation.
    '''
    def __init__(self, add_bias):
        self.add_bias = add_bias # bias to prepend a column of ones (the intercept term)

@@ -60,7 +60,8 @@ class LinearRegression:
    def fit(self, x: pd.DataFrame, y: pd.Series) -> "LinearRegression":
        '''
        Fit method to fit X and Y datas through pandas and train the linear model by analytical solution.
        It uses pandas DataFrame for the X and Series for the Y.
        It uses pandas DataFrame for the X and Series for the Y. It uses the linear regression formula
        to calculate weight
        '''
        x = self.prepare(x)
        y = pd.Series(y).astype("float64")

@@ -84,7 +85,7 @@ class LinearRegression:
    def predict(self, x: pd.DataFrame) -> pd.Series:
        '''
        Predict method is used to test trained data to do X prediction by multiplying X and weight vectors.
        Predict method is used to test trained data to do Y prediction by multiplying X and weight vectors.
        '''
        if self.w is None: # if weight is empty, throw error
            raise RuntimeError("Model is not fitted yet. Call `fit` first.")

@@ -95,7 +96,7 @@ class LinearRegression:
    def score(self, x: pd.DataFrame, y: pd.Series) -> float:
        '''
        This method is used to calculate coefficient of determination to assess the goodness
        of fit from a regression model
        of fit from the linear regression model
        '''
        y_pred = self.predict(x) # predicts Y value with X predict method.
        y = pd.Series(y).astype('float64')

@@ -127,7 +128,7 @@ if __name__ == "__main__":
        df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values

    df.dropna(inplace=True) # remove null values
    print(f"Rows remaining after drop of the null values: {len(df)}")
    print(f"Rows remaining after drop of the null values: {len(df)}\n")

    # sanity checks for data validity - realistic parkinson data range estimations
    df = df[(df['age'] >= 18) & (df['age'] <= 95)]

@@ -157,12 +158,9 @@ if __name__ == "__main__":

    # evaluation of the model
    print("\nR² on training data:", model.score(x_train, y_train))
    print("\nR² on testing data:", model.score(x_test, y_test))
    print("R² on testing data:", model.score(x_test, y_test))

    # predict Y values using the trained data
    preds = model.predict(x_test)
    print("\nFirst 5 predictions:")
    print(preds.head())

    print("\nWeights:")
    print(model.w.round(4))
    print("\nFirst 10 predictions:")
    print(preds.head(10))

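Note on the analytical solution referenced in the fit docstring above: the weights come from the normal equation, w = (X^T X)^(-1) X^T y. A minimal standalone sketch of that calculation (NumPy only; the function name and the toy data are illustrative, not part of this commit):

import numpy as np

def fit_normal_equation(x: np.ndarray, y: np.ndarray, add_bias: bool = True) -> np.ndarray:
    """Least-squares weights via the normal equation w = (X^T X)^-1 X^T y."""
    if add_bias:
        x = np.hstack([np.ones((x.shape[0], 1)), x])  # prepend the intercept column of ones
    # pinv instead of inv so the sketch also copes with a singular X^T X
    return np.linalg.pinv(x.T @ x) @ (x.T @ y)

# tiny usage example with made-up numbers: y = 2x, so expect intercept ~0 and slope ~2
x = np.array([[1.0], [2.0], [3.0]])
y = np.array([2.0, 4.0, 6.0])
print(fit_normal_equation(x, y))
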
@@ -1,144 +1,126 @@
import numpy as np
import pandas as pd

class LogisticRegressionGD:
    """Binary logistic regression trained with batch gradient descent."""
    def __init__(self,
                 learning_rate: float = 0.01,
                 n_iter: int = 1000,
                 tolerance: float = 1e-5,
                 verbose: bool = False):
        """
        Parameters
        ----------
        learning_rate : float
            Step size for weight updates.
        n_iter : int
            Maximum number of iterations.
        tolerance : float
            Stopping criterion: if the change in loss is < tolerance, stop.
        verbose : bool
            If True, prints loss at every 100 iterations.
        """

class LogisticRegression:
    '''
    Constructor for the logistic regression with gradient descent. It uses learning rate, iteration number,
    tolerance and verbose. It also initializes the weight, loss, x, y, mean and std.
    '''

    def __init__(self, learning_rate: float, n_iter: int, tolerance: float, verbose: bool) -> None:
        self.lr = learning_rate
        self.n_iter = n_iter
        self.tol = tolerance
        self.verbose = verbose
        self.w: np.ndarray | None = None # weight/coefficient (bias as first element)
        self.loss: list[float] = [] # loss per iteration
        self.x: np.ndarray | None = None # matrix of inputs after standardisation
        self.y: np.ndarray | None = None # target vector
        self.mean: np.ndarray | None = None # used for standardisation
        self.std: np.ndarray | None = None # standard deviation

        # placeholders that will be filled during training
        self.w_ = None # weights (including bias as w[0])
        self.loss_history_ = [] # loss at each iteration
        self.X_ = None # feature matrix (after standardisation)
        self.y_ = None # target vector (0/1)

    # ------------------------------------------------------------------
    # 2. Sigmoid helper (vectorised)
    # ------------------------------------------------------------------
    @staticmethod
    def _sigmoid(z: np.ndarray) -> np.ndarray:
        return 1.0 / (1.0 + np.exp(-z))
    def sigmoid(z: np.ndarray) -> np.ndarray:
        """Sigmoid method for the logistic regression method."""
        return 1.0 / (1.0 + np.exp(-z)) # 1/(1+exp(-z))

    # ------------------------------------------------------------------
    # 3. Cost function (cross‑entropy)
    # ------------------------------------------------------------------
    @staticmethod
    def _cost(y: np.ndarray, p: np.ndarray) -> float:
        # avoid log(0) by clipping
    def cost(y: np.ndarray, p: np.ndarray) -> float:
        """Cross‑entropy loss is used for the cost calculation"""
        eps = 1e-15
        p = np.clip(p, eps, 1 - eps)
        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))

    # ------------------------------------------------------------------
    # 4. Data preparation – this is where we split X / y, scale, etc.
    # ------------------------------------------------------------------
    def prepare(self, df: pd.DataFrame, target_col: str = 'Diagnosis') -> None:
    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
        """
        Splits `df` into X and y, standardises X (mean=0, std=1),
        and stores the result in the class attributes.

        Parameters
        ----------
        df : pd.DataFrame
            Cleaned data – *already* contains a numeric target in `target_col`.
        target_col : str
            Name of the binary target column.
        Preparation method splits df into x and y. It does define X and Y values from the dataframe and target column.
        Then it does standardisation, adds bias and initializes the weight/coefficient.

        """
        # target must be a 0/1 array
        self.y_ = df[target_col].values.astype(np.int64)
        if target_col not in df.columns:
            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")

        # X – all columns except the target
        X_raw = df.drop(columns=[target_col]).values.astype(np.float64)
        self.y = df[target_col].values.astype(np.int64)

        # -----------------------------------------------------------------
        # 3.1 Feature scaling – we put the bias in the first column
        # -----------------------------------------------------------------
        # compute mean / std on the whole training set (no train/val split yet)
        self.mean_ = X_raw.mean(axis=0)
        self.std_ = X_raw.std(axis=0)
        # avoid division by zero
        self.std_[self.std_ == 0] = 1.0
        x_raw = df.drop(columns=[target_col]).values.astype(np.float64)

        X_scaled = (X_raw - self.mean_) / self.std_
        # add bias column (all ones)
        X_scaled = np.hstack([np.ones((X_scaled.shape[0], 1)), X_scaled])
        # standardisation
        self.mean = x_raw.mean(axis=0)
        self.std = x_raw.std(axis=0)
        self.std[self.std == 0] = 1.0

        self.X_ = X_scaled
        self.w_ = np.zeros(X_scaled.shape[1]) # initialise weights
        x_scaled = (x_raw - self.mean) / self.std # standardisation formula


        bias = np.ones((x_scaled.shape[0], 1), dtype=np.float64) # adding bias
        self.x = np.hstack((bias, x_scaled))

        self.w = np.zeros(self.x.shape[1], dtype=np.float64) # initialize weight as zero

    # ------------------------------------------------------------------
    # 4. Fit – batch gradient descent
    # ------------------------------------------------------------------
    def fit(self) -> None:
        """Runs batch gradient descent for `n_iter` epochs."""
        """

        Fit method to fit X and Y datas through pandas and train the linear model by gradient descent.
        For the n iterations, it finds probabilities through sigmoid of linear prediction and does the
        gradient to calculate the loss.

        """
        if self.x is None or self.y is None: # if x or y are empty, throw error
            raise RuntimeError("Model is not fitted yet. Call `fit` first.")

        for i in range(1, self.n_iter + 1):
            z = np.dot(self.X_, self.w_) # linear part
            p = self._sigmoid(z) # predicted probabilities
            z = self.x.dot(self.w) # linear prediction
            p = self.sigmoid(z) # probabilities of the model predictions

            # gradient of the log‑likelihood (including bias)
            gradient = np.dot(self.X_.T, (p - self.y_)) / self.y_.size
            gradient = self.x.T.dot(p - self.y) / self.y.size # gradient calculation formula

            # weight update
            self.w_ -= self.lr * gradient
            self.w -= self.lr * gradient # gradient multiplied by learning rate is removed from weight

            # record cost and check stopping criterion
            loss = self._cost(self.y_, p)
            self.loss_history_.append(loss)
            loss = self.cost(self.y, p) # cost is calculated through cross‑entropy and added for the current range
            self.loss.append(loss)

            # if verbose, it shows the loss every 100 iterations and displays it
            if self.verbose and i % 100 == 0:
                print(f"Iteration {i:4d} – loss: {loss:.6f}")
                print(f"Iter {i:4d} – loss: {loss:.6f}")

            if i > 1 and abs(self.loss_history_[-2] - loss) < self.tol:
            # tests whether the absolute change in loss is smaller than the tolerance
            if i > 1 and abs(self.loss[-2] - loss) < self.tol:
                if self.verbose:
                    print(f"Converged after {i} iterations.")
                break
                break # loss is stopped so further training would be unnecessary

    # ------------------------------------------------------------------
    # 5. Predict – binary class labels
    # ------------------------------------------------------------------
    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return 0/1 predictions for a new X matrix (already scaled)."""
        z = np.dot(X, self.w_)
        probs = self._sigmoid(z)
        return (probs >= 0.5).astype(int)

    # ------------------------------------------------------------------
    # 6. Score – accuracy on a given (X, y) pair
    # ------------------------------------------------------------------
    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Return the classification accuracy."""
        y_pred = self.predict(X)
        return np.mean(y_pred == y)
    def predict(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
        """
        Predict method is used to test trained data to do Y prediction by multiplying X and weight vectors
        and then calculates the model probability by applying sigmoid function.
        """
        if isinstance(x, pd.DataFrame): # verifies value type
            x = x.values.astype(np.float64)
        if x.ndim == 1:
            x = x.reshape(1, -1)
        z = x.dot(self.w)
        probs = self.sigmoid(z) # probability calculation through sigmoid method
        return (probs >= 0.5).astype(int) # 0.5 is commonly used to define positivity of the probability

    def score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """
        This method is used to calculate mean accuracy with the prediction of Y and actual Y values.
        """
        y_pred = self.predict(x)
        y_true = np.asarray(y).astype(int)
        return np.mean(y_pred == y_true) # mean is calculated if Y values match

if __name__ == "__main__":
    columns = [
        'ID', 'Diagnosis',
        'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
        'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
        'compactness_mean', 'concavitymean', 'concave_points_mean', 'symmetrymean', 'fractal_dimension_mean',
        'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
        'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
        'compactness_se', 'concavityse', 'concave_points_se', 'symmetryse', 'fractal_dimension_se',
        'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
        'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
        'compactness_worst', 'concavityworst', 'concave_points_worst', 'symmetryworst', 'fractal_dimension_worst'
    ]

    df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)

@@ -155,7 +137,7 @@ if __name__ == "__main__":
        df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values

    df.dropna(inplace=True) # remove null values
    print(f"Rows remaining after drop of the null values: {len(df)}")
    print(f"Rows remaining after drop of the null values: {len(df)}\n")
    for col in num_cols:
        df = df[df[col] >= 0]

@@ -172,33 +154,40 @@ if __name__ == "__main__":
    # check if there are still null values
    assert df.isna().sum().sum() == 0, "There are still some null values."

    df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0}) # making diagnosis numeric
    df['Diagnosis'] = df['Diagnosis'].astype('category')
    # making diagnosis numeric
    df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")

    # ---- 7.2 Instantiate and train ------------------------------------
    model = LogisticRegressionGD(learning_rate=0.05,
                                 n_iter=5000,
                                 tolerance=1e-6,
                                 verbose=True)
    rng = np.random.default_rng(seed=42)
    n_train = len(df)
    indices = rng.permutation(n_train)
    train_size = int(0.8 * n_train)

    # we need to split X / y here
    X = df.drop(columns=['Diagnosis'])
    y = df['Diagnosis'].cat.codes.values # 0/1 array
    train_idx = indices[:train_size]
    test_idx = indices[train_size:]

    # Standardise X inside the model for us – we’ll do it in `prepare`
    model.X_ = (X - X.mean()) / X.std() # bias‑column will be added later
    model.X_ = np.hstack([np.ones((model.X_.shape[0], 1)), model.X_]) # add bias
    model.y_ = y
    df_train = df.iloc[train_idx].reset_index(drop=True)
    df_test = df.iloc[test_idx].reset_index(drop=True)

    # Fit the model
    # training of the model
    model = LogisticRegression(learning_rate=0.00005, n_iter=5000, tolerance=1e-6, verbose=True)
    # other values could be used, for example (lr=0.01, n_iter=2000, tolerance=1e-3, verbose=False)
    model.prepare(df_train, target_col="Diagnosis")
    model.fit()

    # -------------------------------------------------
    # 8. Evaluate on the same data (you could split)
    # -------------------------------------------------
    acc = model.score(model.X_, model.y_)
    print(f"Training accuracy (on the whole cleaned set): {acc:.4f}")
    # evaluation of the model
    train_acc = model.score(model.x, model.y)
    print(f"\nMean accuracy on training data: {train_acc:.4f}")

    # Example: predict on the first 10 samples
    y_hat = model.predict(model.X_[:10])
    print("First 10 predictions:", y_hat)
    # copied prepare method for building test X data
    x_test_raw = df_test.drop(columns=['Diagnosis']).values.astype(np.float64)
    x_test_scaled = (x_test_raw - model.mean) / model.std
    bias_test = np.ones((x_test_scaled.shape[0], 1), dtype=np.float64)
    X_test = np.hstack((bias_test, x_test_scaled))
    y_test = df_test['Diagnosis'].values.astype(int)
    test_acc = model.score(X_test, y_test)
    print(f"Mean accuracy on testing data: {test_acc:.4f}")

    # predict Y values using the trained data
    first_10 = X_test[:10]
    y_hat = model.predict(first_10)
    print("\nFirst 10 predictions:", y_hat.ravel())

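For reference, the quantities computed by cost and by the gradient step in fit above are the binary cross-entropy and its gradient; written out in LaTeX (p is the vector of sigmoid outputs, m the number of samples, and eta the learning rate self.lr):

L(w) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y_i \log p_i + (1 - y_i) \log(1 - p_i) \right], \qquad p = \sigma(Xw)

\nabla_w L = \frac{1}{m} X^{\top} (p - y), \qquad w \leftarrow w - \eta \, \nabla_w L
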
@@ -3,8 +3,8 @@ import pandas as pd

class LinearRegression:
    '''
    Constructor for the Linear Regression with mini‑batch stochastic gradient descent. It uses learning rate,
    iteration number, batch size, bias and verbose. It also initializes the weight, mean and std.
    Constructor for the linear regression with mini‑batch stochastic gradient descent. It uses learning rate,
    iteration number, batch size, bias and verbose. It also initializes the weight, mean and standard deviation.
    '''
    def __init__(self, lr, n_iter, batch_size, add_bias, verbose):
        self.lr = lr # learning rate

@@ -90,7 +90,7 @@ class LinearRegression:

    def predict(self, x: pd.DataFrame) -> pd.Series:
        '''
        Predict method makes X prediction by multiplying X and weight vectors.
        Predict method is used to test trained data to do Y prediction by multiplying X and weight vectors.
        '''
        if self.w is None: # if weight is empty, throw error
            raise RuntimeError("Model is not fitted yet. Call `fit` first.")

@@ -101,7 +101,7 @@ class LinearRegression:
    def score(self, x: pd.DataFrame, y: pd.Series) -> float:
        '''
        This method is used to calculate coefficient of determination to assess the goodness
        of fit from a regression model
        of fit from the linear regression model
        '''
        y_pred = self.predict(x) # predicts Y value with X predict method.
        y = pd.Series(y).astype('float64')

@@ -133,7 +133,7 @@ if __name__ == "__main__":
        df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values

    df.dropna(inplace=True) # remove null values
    print(f"Rows remaining after drop of the null values: {len(df)}")
    print(f"Rows remaining after drop of the null values: {len(df)}\n")

    # sanity checks for data validity - realistic parkinson data range estimations
    df = df[(df['age'] >= 18) & (df['age'] <= 95)]

@@ -164,12 +164,9 @@ if __name__ == "__main__":

    # evaluation of the model
    print("\nR² on training data:", model.score(x_train, y_train))
    print("\nR² on testing data:", model.score(x_test, y_test))
    print("R² on testing data:", model.score(x_test, y_test))

    # predict Y values using the trained data
    preds = model.predict(x_test)
    print("\nFirst 5 predictions:")
    print(preds.head())

    print("\nWeights:")
    print(model.w.round(4))
    print("\nFirst 10 predictions:")
    print(preds.head(10))

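The score docstrings in both linear regression files refer to the coefficient of determination, which the main blocks print as R². A minimal self-contained sketch of the standard formula (NumPy only; the function name and toy numbers are illustrative, not the committed implementation):

import numpy as np

def r2_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """R^2 = 1 - SS_res / SS_tot."""
    ss_res = np.sum((y_true - y_pred) ** 2)           # residual sum of squares
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)  # total sum of squares
    return 1.0 - ss_res / ss_tot

# quick check with made-up numbers: a near-perfect fit should give R^2 close to 1
print(r2_score(np.array([1.0, 2.0, 3.0]), np.array([1.1, 1.9, 3.2])))
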
@@ -1,28 +1,143 @@
import numpy as np
import pandas as pd

'''

class LogisticRegression:
    def __init__(self):

    def prepare(self):

    def fit(self):

    def predict(self):

    def score(self):
    '''
    Constructor for the logistic regression with gradient descent. It uses learning rate, iteration number,
    tolerance and verbose. It also initializes the weight, loss, x, y, mean and std.
    '''

    def __init__(self, learning_rate: float, n_iter: int, batch_size: int, tolerance: float, verbose: bool) -> None:
        self.lr = learning_rate
        self.n_iter = n_iter
        self.batch_size = batch_size
        self.tol = tolerance
        self.verbose = verbose
        self.w: np.ndarray | None = None # weight/coefficient (bias as first element)
        self.loss: list[float] = [] # loss per iteration
        self.x: np.ndarray | None = None # matrix of inputs after standardisation
        self.y: np.ndarray | None = None # target vector
        self.mean: np.ndarray | None = None # used for standardisation
        self.std: np.ndarray | None = None # standard deviation

    @staticmethod
    def sigmoid(z: np.ndarray) -> np.ndarray:
        """Sigmoid method for the logistic regression method."""
        return 1.0 / (1.0 + np.exp(-z)) # 1/(1+exp(-z))

    @staticmethod
    def cost(y: np.ndarray, p: np.ndarray) -> float:
        """Cross‑entropy loss is used for the cost calculation"""
        eps = 1e-15
        p = np.clip(p, eps, 1 - eps)
        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))

    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
        """

        Preparation method splits df into x and y. It does define X and Y values from the dataframe and target column.
        Then it does standardisation, adds bias and initializes the weight/coefficient.

        """
        if target_col not in df.columns:
            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")

        self.y = df[target_col].values.astype(np.int64)

        x_raw = df.drop(columns=[target_col]).values.astype(np.float64)

        # standardisation
        self.mean = x_raw.mean(axis=0)
        self.std = x_raw.std(axis=0)
        self.std[self.std == 0] = 1.0

        x_scaled = (x_raw - self.mean) / self.std # standardisation formula


        bias = np.ones((x_scaled.shape[0], 1), dtype=np.float64) # adding bias
        self.x = np.hstack((bias, x_scaled))

        self.w = np.zeros(self.x.shape[1], dtype=np.float64) # initialize weight as zero

    def fit(self) -> None:
        """

        Fit method to fit X and Y datas through pandas and train the linear model by gradient descent.
        For the n iterations, it finds probabilities through sigmoid of linear prediction and does the
        gradient to calculate the loss.

        """
        if self.x is None or self.y is None: # if x or y are empty, throw error
            raise RuntimeError("Model is not fitted yet. Call `prepare` first.")

        n_samples = self.x.shape[0]
        batch_size = self.batch_size or n_samples

        for epoch in range(1, self.n_iter + 1):
            shuffled_idx = np.random.permutation(n_samples) # random permutation of the indices
            x_shuffled = self.x[shuffled_idx]
            y_shuffled = self.y[shuffled_idx]

            # process execution for each mini‑batch
            for b in range(0, n_samples, batch_size):
                start = b * batch_size
                end = start + batch_size
                idx = shuffled_idx[start:end]

                x_batch = x_shuffled[idx]
                y_batch = y_shuffled[idx]

                z = x_batch.dot(self.w)
                p = self.sigmoid(z)

                grad = x_batch.T.dot(p - y_batch) / y_batch.size # gradient calculation formula
                self.w -= self.lr * grad # gradient multiplied by learning rate is removed from weight

            # cost is calculated through cross‑entropy and added for the current range
            loss = self.cost(self.y, self.sigmoid(self.x.dot(self.w)))
            self.loss.append(loss)

            # if verbose, it shows the loss every 100 iterations and displays it
            if self.verbose and epoch % 100 == 0:
                print(f"Iter {epoch:4d} – loss: {loss:.6f}")

            # tests whether the absolute change in loss is smaller than the tolerance
            if epoch > 1 and abs(self.loss[-2] - loss) < self.tol:
                if self.verbose:
                    print(f"Converged after {epoch} iterations.")
                break

    def predict(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
        """
        Predict method is used to test trained data to do Y prediction by multiplying X and weight vectors
        and then calculates the model probability by applying sigmoid function.
        """
        if isinstance(x, pd.DataFrame): # verifies value type
            x = x.values.astype(np.float64)
        if x.ndim == 1:
            x = x.reshape(1, -1)
        z = x.dot(self.w)
        probs = self.sigmoid(z) # probability calculation through sigmoid method
        return (probs >= 0.5).astype(int) # 0.5 is commonly used to define positivity of the probability

    def score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
        """
        This method is used to calculate mean accuracy with the prediction of Y and actual Y values.
        """
        y_pred = self.predict(x)
        y_true = np.asarray(y).astype(int)
        return np.mean(y_pred == y_true) # mean is calculated if Y values match

if __name__ == "__main__":
    columns = [
        'ID', 'Diagnosis',
        'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
        'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
        'compactness_mean', 'concavitymean', 'concave_points_mean', 'symmetrymean', 'fractal_dimension_mean',
        'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
        'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
        'compactness_se', 'concavityse', 'concave_points_se', 'symmetryse', 'fractal_dimension_se',
        'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
        'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
        'compactness_worst', 'concavityworst', 'concave_points_worst', 'symmetryworst', 'fractal_dimension_worst'
    ]

    df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)

@@ -39,7 +154,7 @@ if __name__ == "__main__":
        df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values

    df.dropna(inplace=True) # remove null values
    print(f"Rows remaining after drop of the null values: {len(df)}")
    print(f"Rows remaining after drop of the null values: {len(df)}\n")
    for col in num_cols:
        df = df[df[col] >= 0]

@@ -56,5 +171,40 @@ if __name__ == "__main__":
    # check if there are still null values
    assert df.isna().sum().sum() == 0, "There are still some null values."

    df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0}) # making diagnosis numeric
    df['Diagnosis'] = df['Diagnosis'].astype('category')
    # making diagnosis numeric
    df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")

    rng = np.random.default_rng(seed=42)
    n_samples = len(df)
    indices = rng.permutation(n_samples)
    train_size = int(0.8 * n_samples)

    train_idx = indices[:train_size]
    test_idx = indices[train_size:]

    df_train = df.iloc[train_idx].reset_index(drop=True)
    df_test = df.iloc[test_idx].reset_index(drop=True)

    # training of the model
    model = LogisticRegression(learning_rate=0.00005, n_iter=5000, batch_size=64, tolerance=1e-6, verbose=True)
    # other values could be used, for example (lr=0.01, n_iter=2000, tolerance=1e-3, verbose=False)
    model.prepare(df_train, target_col="Diagnosis")
    model.fit()

    # evaluation of the model
    train_acc = model.score(model.x, model.y)
    print(f"\nMean accuracy on training data: {train_acc:.4f}")

    # copied prepare method for building test X data
    x_test_raw = df_test.drop(columns=['Diagnosis']).values.astype(np.float64)
    x_test_scaled = (x_test_raw - model.mean) / model.std
    bias_test = np.ones((x_test_scaled.shape[0], 1), dtype=np.float64)
    X_test = np.hstack((bias_test, x_test_scaled))
    y_test = df_test['Diagnosis'].values.astype(int)
    test_acc = model.score(X_test, y_test)
    print(f"Mean accuracy on testing data: {test_acc:.4f}")

    # predict Y values using the trained data
    first_10 = X_test[:10]
    y_hat = model.predict(first_10)
    print("\nFirst 10 predictions:", y_hat.ravel())

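One remark on the mini-batch loop in fit above: the loop variable b already advances in steps of batch_size, so start = b * batch_size appears to jump past the data after the first batch, and indexing x_shuffled with shuffled_idx[start:end] permutes rows that were already shuffled. A self-contained sketch of the more conventional contiguous slicing (illustrative names, not the committed code):

import numpy as np

def minibatch_slices(n_samples: int, batch_size: int):
    """Yield (start, end) bounds that cover a shuffled dataset in contiguous batches."""
    for start in range(0, n_samples, batch_size):
        yield start, min(start + batch_size, n_samples)

rng = np.random.default_rng(0)
x = rng.normal(size=(10, 3))
y = rng.integers(0, 2, size=10).astype(float)
perm = rng.permutation(len(x))
x_shuffled, y_shuffled = x[perm], y[perm]  # shuffle once per epoch

for start, end in minibatch_slices(len(x), batch_size=4):
    x_batch, y_batch = x_shuffled[start:end], y_shuffled[start:end]
    print(start, end, x_batch.shape)  # batches cover rows 0-4, 4-8, 8-10
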
parkinsons_updrs.names (executable file, 107 lines)

@@ -0,0 +1,107 @@
Parkinsons Telemonitoring Data Set

Abstract: Oxford Parkinson's Disease Telemonitoring Dataset

============================================================

Data Set Characteristics: Multivariate
Attribute Characteristics: Integer, Real
Associated Tasks: Regression
Number of Instances: 5875
Number of Attributes: 26
Area: Life
Date Donated: 2009-10-29

============================================================

SOURCE:

The dataset was created by Athanasios Tsanas (tsanasthanasis '@' gmail.com)
and Max Little (littlem '@' physics.ox.ac.uk) of the University of Oxford, in
collaboration with 10 medical centers in the US and Intel Corporation who
developed the telemonitoring device to record the speech signals. The
original study used a range of linear and nonlinear regression methods to
predict the clinician's Parkinson's disease symptom score on the UPDRS scale.


============================================================

DATA SET INFORMATION:

This dataset is composed of a range of biomedical voice measurements from 42
people with early-stage Parkinson's disease recruited to a six-month trial of
a telemonitoring device for remote symptom progression monitoring. The
recordings were automatically captured in the patient's homes.

Columns in the table contain subject number, subject age, subject gender,
time interval from baseline recruitment date, motor UPDRS, total UPDRS, and
16 biomedical voice measures. Each row corresponds to one of 5,875 voice
recording from these individuals. The main aim of the data is to predict the
motor and total UPDRS scores ('motor_UPDRS' and 'total_UPDRS') from the 16
voice measures.

The data is in ASCII CSV format. The rows of the CSV file contain an instance
corresponding to one voice recording. There are around 200 recordings per
patient, the subject number of the patient is identified in the first column.
For further information or to pass on comments, please contact Athanasios
Tsanas (tsanasthanasis '@' gmail.com) or Max Little (littlem '@'
physics.ox.ac.uk).

Further details are contained in the following reference -- if you use this
dataset, please cite:
Athanasios Tsanas, Max A. Little, Patrick E. McSharry, Lorraine O. Ramig (2009),
'Accurate telemonitoring of Parkinson.s disease progression by non-invasive
speech tests',
IEEE Transactions on Biomedical Engineering (to appear).

Further details about the biomedical voice measures can be found in:
Max A. Little, Patrick E. McSharry, Eric J. Hunter, Lorraine O. Ramig (2009),
'Suitability of dysphonia measurements for telemonitoring of Parkinson's
disease',
IEEE Transactions on Biomedical Engineering, 56(4):1015-1022


===========================================================

ATTRIBUTE INFORMATION:

subject# - Integer that uniquely identifies each subject
age - Subject age
sex - Subject gender '0' - male, '1' - female
test_time - Time since recruitment into the trial. The integer part is the
number of days since recruitment.
motor_UPDRS - Clinician's motor UPDRS score, linearly interpolated
total_UPDRS - Clinician's total UPDRS score, linearly interpolated
Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP - Several measures of
variation in fundamental frequency
Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA -
Several measures of variation in amplitude
NHR,HNR - Two measures of ratio of noise to tonal components in the voice
RPDE - A nonlinear dynamical complexity measure
DFA - Signal fractal scaling exponent
PPE - A nonlinear measure of fundamental frequency variation


===========================================================

RELEVANT PAPERS:

Little MA, McSharry PE, Hunter EJ, Ramig LO (2009),
'Suitability of dysphonia measurements for telemonitoring of Parkinson's
disease',
IEEE Transactions on Biomedical Engineering, 56(4):1015-1022

Little MA, McSharry PE, Roberts SJ, Costello DAE, Moroz IM.
'Exploiting Nonlinear Recurrence and Fractal Scaling Properties for Voice
Disorder Detection',
BioMedical Engineering OnLine 2007, 6:23 (26 June 2007)

===========================================================

CITATION REQUEST:

If you use this dataset, please cite the following paper:
A Tsanas, MA Little, PE McSharry, LO Ramig (2009)
'Accurate telemonitoring of Parkinson.s disease progression by non-invasive
speech tests',
IEEE Transactions on Biomedical Engineering (to appear).

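A minimal sketch of loading the CSV described above with pandas, using the attribute names listed in the file (the companion data file name parkinsons_updrs.data and the presence of a header row are assumptions, not stated in this commit):

import pandas as pd

# assumption: parkinsons_updrs.data is in the working directory and starts with a header row
df = pd.read_csv("parkinsons_updrs.data")
print(len(df))  # 5875 instances per the description above
print(df[["age", "motor_UPDRS", "total_UPDRS"]].head())  # targets are motor_UPDRS and total_UPDRS
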
wdbc.names (executable file, 140 lines)

@@ -0,0 +1,140 @@
1. Title: Wisconsin Diagnostic Breast Cancer (WDBC)

2. Source Information

a) Creators:

Dr. William H. Wolberg, General Surgery Dept., University of
Wisconsin, Clinical Sciences Center, Madison, WI 53792
wolberg@eagle.surgery.wisc.edu

W. Nick Street, Computer Sciences Dept., University of
Wisconsin, 1210 West Dayton St., Madison, WI 53706
street@cs.wisc.edu 608-262-6619

Olvi L. Mangasarian, Computer Sciences Dept., University of
Wisconsin, 1210 West Dayton St., Madison, WI 53706
olvi@cs.wisc.edu

b) Donor: Nick Street

c) Date: November 1995

3. Past Usage:

first usage:

W.N. Street, W.H. Wolberg and O.L. Mangasarian
Nuclear feature extraction for breast tumor diagnosis.
IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science
and Technology, volume 1905, pages 861-870, San Jose, CA, 1993.

OR literature:

O.L. Mangasarian, W.N. Street and W.H. Wolberg.
Breast cancer diagnosis and prognosis via linear programming.
Operations Research, 43(4), pages 570-577, July-August 1995.

Medical literature:

W.H. Wolberg, W.N. Street, and O.L. Mangasarian.
Machine learning techniques to diagnose breast cancer from
fine-needle aspirates.
Cancer Letters 77 (1994) 163-171.

W.H. Wolberg, W.N. Street, and O.L. Mangasarian.
Image analysis and machine learning applied to breast cancer
diagnosis and prognosis.
Analytical and Quantitative Cytology and Histology, Vol. 17
No. 2, pages 77-87, April 1995.

W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian.
Computerized breast cancer diagnosis and prognosis from fine
needle aspirates.
Archives of Surgery 1995;130:511-516.

W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian.
Computer-derived nuclear features distinguish malignant from
benign breast cytology.
Human Pathology, 26:792--796, 1995.

See also:
http://www.cs.wisc.edu/~olvi/uwmp/mpml.html
http://www.cs.wisc.edu/~olvi/uwmp/cancer.html

Results:

- predicting field 2, diagnosis: B = benign, M = malignant
- sets are linearly separable using all 30 input features
- best predictive accuracy obtained using one separating plane
in the 3-D space of Worst Area, Worst Smoothness and
Mean Texture. Estimated accuracy 97.5% using repeated
10-fold crossvalidations. Classifier has correctly
diagnosed 176 consecutive new patients as of November
1995.

4. Relevant information

Features are computed from a digitized image of a fine needle
aspirate (FNA) of a breast mass. They describe
characteristics of the cell nuclei present in the image.
A few of the images can be found at
http://www.cs.wisc.edu/~street/images/

Separating plane described above was obtained using
Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree
Construction Via Linear Programming." Proceedings of the 4th
Midwest Artificial Intelligence and Cognitive Science Society,
pp. 97-101, 1992], a classification method which uses linear
programming to construct a decision tree. Relevant features
were selected using an exhaustive search in the space of 1-4
features and 1-3 separating planes.

The actual linear program used to obtain the separating plane
in the 3-dimensional space is that described in:
[K. P. Bennett and O. L. Mangasarian: "Robust Linear
Programming Discrimination of Two Linearly Inseparable Sets",
Optimization Methods and Software 1, 1992, 23-34].


This database is also available through the UW CS ftp server:

ftp ftp.cs.wisc.edu
cd math-prog/cpo-dataset/machine-learn/WDBC/

5. Number of instances: 569

6. Number of attributes: 32 (ID, diagnosis, 30 real-valued input features)

7. Attribute information

1) ID number
2) Diagnosis (M = malignant, B = benign)
3-32)

Ten real-valued features are computed for each cell nucleus:

a) radius (mean of distances from center to points on the perimeter)
b) texture (standard deviation of gray-scale values)
c) perimeter
d) area
e) smoothness (local variation in radius lengths)
f) compactness (perimeter^2 / area - 1.0)
g) concavity (severity of concave portions of the contour)
h) concave points (number of concave portions of the contour)
i) symmetry
j) fractal dimension ("coastline approximation" - 1)

Several of the papers listed above contain detailed descriptions of
how these features are computed.

The mean, standard error, and "worst" or largest (mean of the three
largest values) of these features were computed for each image,
resulting in 30 features. For instance, field 3 is Mean Radius, field
13 is Radius SE, field 23 is Worst Radius.

All feature values are recoded with four significant digits.

8. Missing attribute values: none

9. Class distribution: 357 benign, 212 malignant

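As a quick illustration of field 2 described above, a hedged sketch that reads wdbc.data with pandas and checks the stated class distribution (357 benign, 212 malignant); the presence of wdbc.data in the working directory is an assumption:

import pandas as pd

# field 1 is the ID and field 2 the diagnosis, per the attribute information above
df = pd.read_csv("wdbc.data", header=None)
diagnosis = df[1].map({"M": 1, "B": 0})  # M = malignant, B = benign, as in the scripts in this commit
print(diagnosis.value_counts())          # expected: 357 zeros (benign) and 212 ones (malignant)
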