Finished the implementation of the Python code.

commit 455b48c89b (parent 5702c3c1b8)
6 changed files with 540 additions and 159 deletions
@@ -27,8 +27,8 @@ class LinearRegression:
 class LinearRegression:
     '''
-    Constructor for the Linear Regression with analytical. It uses bias. It also
-    initializes the weight, mean and std.
+    Constructor for the linear regression with the analytical solution. It uses bias. It also
+    initializes the weight, mean and standard deviation.
     '''
     def __init__(self, add_bias):
         self.add_bias = add_bias  # bias to prepend a column of ones (the intercept term)

@@ -60,7 +60,8 @@ class LinearRegression:
     def fit(self, x: pd.DataFrame, y: pd.Series) -> "LinearRegression":
         '''
         Fit method to fit the X and Y data through pandas and train the linear model by the analytical solution.
-        It uses a pandas DataFrame for the X and a Series for the Y.
+        It uses a pandas DataFrame for the X and a Series for the Y. It uses the linear regression formula
+        to calculate the weight.
         '''
         x = self.prepare(x)
         y = pd.Series(y).astype("float64")
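Note: the "analytical solution" named in the docstring above is presumably the ordinary least squares normal equation; a minimal sketch of that computation, with hypothetical names, would be:

    import numpy as np

    def fit_analytical(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        # w = (X^T X)^(-1) X^T y, computed with a linear solver rather than
        # an explicit inverse for numerical stability
        return np.linalg.solve(x.T @ x, x.T @ y)

Here x is assumed to already carry the prepended column of ones when add_bias is set.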
@@ -84,7 +85,7 @@ class LinearRegression:

     def predict(self, x: pd.DataFrame) -> pd.Series:
         '''
-        Predict method is used to test trained data to do X prediction by multiplying X and weight vectors.
+        Predict method is used on the trained model to do Y prediction by multiplying the X and weight vectors.
         '''
         if self.w is None:  # if weight is empty, throw error
             raise RuntimeError("Model is not fitted yet. Call `fit` first.")

@@ -95,7 +96,7 @@ class LinearRegression:
     def score(self, x: pd.DataFrame, y: pd.Series) -> float:
         '''
         This method is used to calculate the coefficient of determination to assess the goodness
-        of fit from a regression model
+        of fit of the linear regression model
         '''
         y_pred = self.predict(x)  # predicts the Y values with the predict method
         y = pd.Series(y).astype('float64')
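Note: the coefficient of determination computed by score is R^2 = 1 - SS_res / SS_tot; a minimal sketch, assuming plain NumPy arrays:

    import numpy as np

    def r2_score(y: np.ndarray, y_pred: np.ndarray) -> float:
        ss_res = np.sum((y - y_pred) ** 2)      # residual sum of squares
        ss_tot = np.sum((y - np.mean(y)) ** 2)  # total sum of squares
        return 1.0 - ss_res / ss_tot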
@@ -127,7 +128,7 @@ if __name__ == "__main__":
         df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values

     df.dropna(inplace=True)  # remove null values
-    print(f"Rows remaining after dropping null values: {len(df)}")
+    print(f"Rows remaining after dropping null values: {len(df)}\n")

     # sanity checks for data validity - realistic Parkinson's data range estimates
     df = df[(df['age'] >= 18) & (df['age'] <= 95)]

@@ -157,12 +158,9 @@ if __name__ == "__main__":

     # evaluation of the model
     print("\nR² on training data:", model.score(x_train, y_train))
-    print("\nR² on testing data:", model.score(x_test, y_test))
+    print("R² on testing data:", model.score(x_test, y_test))

     # predict Y values using the trained data
     preds = model.predict(x_test)
-    print("\nFirst 5 predictions:")
-    print(preds.head())
+    print("\nFirst 10 predictions:")
+    print(preds.head(10))

-    print("\nWeights:")
-    print(model.w.round(4))
@@ -1,144 +1,126 @@
 import numpy as np
 import pandas as pd

-class LogisticRegressionGD:
-    """Binary logistic regression trained with batch gradient descent."""
-    def __init__(self,
-                 learning_rate: float = 0.01,
-                 n_iter: int = 1000,
-                 tolerance: float = 1e-5,
-                 verbose: bool = False):
-        """
-        Parameters
-        ----------
-        learning_rate : float
-            Step size for weight updates.
-        n_iter : int
-            Maximum number of iterations.
-        tolerance : float
-            Stopping criterion: if the change in loss is < tolerance, stop.
-        verbose : bool
-            If True, prints loss at every 100 iterations.
-        """
+class LogisticRegression:
+    '''
+    Constructor for the logistic regression with gradient descent. It uses learning rate, iteration number,
+    tolerance and verbose. It also initializes the weight, loss, x, y, mean and standard deviation.
+    '''
+    def __init__(self, learning_rate: float, n_iter: int, tolerance: float, verbose: bool) -> None:
         self.lr = learning_rate
         self.n_iter = n_iter
         self.tol = tolerance
         self.verbose = verbose
+        self.w: np.ndarray | None = None     # weight/coefficient (bias as first element)
+        self.loss: list[float] = []          # loss per iteration
+        self.x: np.ndarray | None = None     # matrix of inputs after standardisation
+        self.y: np.ndarray | None = None     # target vector
+        self.mean: np.ndarray | None = None  # used for standardisation
+        self.std: np.ndarray | None = None   # standard deviation

-        # placeholders that will be filled during training
-        self.w_ = None           # weights (including bias as w[0])
-        self.loss_history_ = []  # loss at each iteration
-        self.X_ = None           # feature matrix (after standardisation)
-        self.y_ = None           # target vector (0/1)
-
-    # ------------------------------------------------------------------
-    # 2. Sigmoid helper (vectorised)
-    # ------------------------------------------------------------------
     @staticmethod
-    def _sigmoid(z: np.ndarray) -> np.ndarray:
-        return 1.0 / (1.0 + np.exp(-z))
+    def sigmoid(z: np.ndarray) -> np.ndarray:
+        """Sigmoid function for the logistic regression model."""
+        return 1.0 / (1.0 + np.exp(-z))  # 1/(1+exp(-z))

-    # ------------------------------------------------------------------
-    # 3. Cost function (cross-entropy)
-    # ------------------------------------------------------------------
     @staticmethod
-    def _cost(y: np.ndarray, p: np.ndarray) -> float:
-        # avoid log(0) by clipping
+    def cost(y: np.ndarray, p: np.ndarray) -> float:
+        """Cross-entropy loss is used for the cost calculation."""
         eps = 1e-15
         p = np.clip(p, eps, 1 - eps)
         return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
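Note on the mathematics: with p = sigmoid(Xw) and n samples, the cross-entropy loss L(w) = -(1/n) * sum(y * log(p) + (1 - y) * log(1 - p)) has gradient grad L(w) = X^T (p - y) / n, which is exactly the `self.x.T.dot(p - self.y) / self.y.size` expression used in the fit method below.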
-    # ------------------------------------------------------------------
-    # 4. Data preparation – this is where we split X / y, scale, etc.
-    # ------------------------------------------------------------------
-    def prepare(self, df: pd.DataFrame, target_col: str = 'Diagnosis') -> None:
+    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
         """
-        Splits `df` into X and y, standardises X (mean=0, std=1),
-        and stores the result in the class attributes.
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            Cleaned data – *already* contains a numeric target in `target_col`.
-        target_col : str
-            Name of the binary target column.
+        Preparation method splits df into x and y. It defines the X and Y values from the dataframe and target column.
+        Then it does standardisation, adds the bias and initializes the weight/coefficient.
         """
-        # target must be a 0/1 array
-        self.y_ = df[target_col].values.astype(np.int64)
+        if target_col not in df.columns:
+            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")

-        # X – all columns except the target
-        X_raw = df.drop(columns=[target_col]).values.astype(np.float64)
+        self.y = df[target_col].values.astype(np.int64)

-        # -----------------------------------------------------------------
-        # 3.1 Feature scaling – we put the bias in the first column
-        # -----------------------------------------------------------------
-        # compute mean / std on the whole training set (no train/val split yet)
-        self.mean_ = X_raw.mean(axis=0)
-        self.std_ = X_raw.std(axis=0)
-        # avoid division by zero
-        self.std_[self.std_ == 0] = 1.0
+        x_raw = df.drop(columns=[target_col]).values.astype(np.float64)

-        X_scaled = (X_raw - self.mean_) / self.std_
-        # add bias column (all ones)
-        X_scaled = np.hstack([np.ones((X_scaled.shape[0], 1)), X_scaled])
+        # standardisation
+        self.mean = x_raw.mean(axis=0)
+        self.std = x_raw.std(axis=0)
+        self.std[self.std == 0] = 1.0

-        self.X_ = X_scaled
-        self.w_ = np.zeros(X_scaled.shape[1])  # initialise weights
+        x_scaled = (x_raw - self.mean) / self.std  # standardisation formula
+
+        bias = np.ones((x_scaled.shape[0], 1), dtype=np.float64)  # adding bias
+        self.x = np.hstack((bias, x_scaled))
+
+        self.w = np.zeros(self.x.shape[1], dtype=np.float64)  # initialize weight as zero
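Note: prepare standardises each feature with the training statistics, x_scaled = (x_raw - mean) / std, and stores those statistics on the model so that later data can be scaled the same way. A one-line sketch using the attributes defined above (x_new_raw is a hypothetical new feature matrix):

    x_new_scaled = (x_new_raw - model.mean) / model.std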
-    # ------------------------------------------------------------------
-    # 4. Fit – batch gradient descent
-    # ------------------------------------------------------------------
     def fit(self) -> None:
-        """Runs batch gradient descent for `n_iter` epochs."""
+        """
+        Fit method to fit the X and Y data and train the logistic model by gradient descent.
+        For n iterations, it finds probabilities through the sigmoid of the linear prediction and uses the
+        gradient to update the weights and track the loss.
+        """
+        if self.x is None or self.y is None:  # if x or y are empty, throw error
+            raise RuntimeError("Model is not prepared yet. Call `prepare` first.")

         for i in range(1, self.n_iter + 1):
-            z = np.dot(self.X_, self.w_)  # linear part
-            p = self._sigmoid(z)          # predicted probabilities
+            z = self.x.dot(self.w)  # linear prediction
+            p = self.sigmoid(z)     # probabilities of the model predictions

-            # gradient of the log-likelihood (including bias)
-            gradient = np.dot(self.X_.T, (p - self.y_)) / self.y_.size
+            gradient = self.x.T.dot(p - self.y) / self.y.size  # gradient calculation formula

-            # weight update
-            self.w_ -= self.lr * gradient
+            self.w -= self.lr * gradient  # gradient multiplied by learning rate is subtracted from the weight

-            # record cost and check stopping criterion
-            loss = self._cost(self.y_, p)
-            self.loss_history_.append(loss)
+            loss = self.cost(self.y, p)  # cost is calculated through cross-entropy and recorded for this iteration
+            self.loss.append(loss)

+            # if verbose, it shows the loss every 100 iterations
             if self.verbose and i % 100 == 0:
-                print(f"Iteration {i:4d} – loss: {loss:.6f}")
+                print(f"Iter {i:4d} – loss: {loss:.6f}")

-            if i > 1 and abs(self.loss_history_[-2] - loss) < self.tol:
+            # tests whether the absolute change in loss is smaller than the tolerance
+            if i > 1 and abs(self.loss[-2] - loss) < self.tol:
                 if self.verbose:
                     print(f"Converged after {i} iterations.")
-                break
+                break  # the loss has stopped improving, so further training would be unnecessary
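Note: each pass of the loop above is one step of batch gradient descent, w <- w - lr * grad, and training stops early once the absolute change in loss between consecutive iterations falls below the tolerance.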
-    # ------------------------------------------------------------------
-    # 5. Predict – binary class labels
-    # ------------------------------------------------------------------
-    def predict(self, X: np.ndarray) -> np.ndarray:
-        """Return 0/1 predictions for a new X matrix (already scaled)."""
-        z = np.dot(X, self.w_)
-        probs = self._sigmoid(z)
-        return (probs >= 0.5).astype(int)
+    def predict(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
+        """
+        Predict method is used on the trained model to do Y prediction by multiplying the X and weight vectors
+        and then applying the sigmoid function to get the model probability.
+        """
+        if isinstance(x, pd.DataFrame):  # verifies the value type
+            x = x.values.astype(np.float64)
+        if x.ndim == 1:
+            x = x.reshape(1, -1)
+        z = x.dot(self.w)
+        probs = self.sigmoid(z)  # probability calculation through the sigmoid method
+        return (probs >= 0.5).astype(int)  # 0.5 is the conventional cutoff for a positive prediction
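Note: predict thresholds the sigmoid output at 0.5. If the raw probabilities are needed, for instance to use a different cutoff, a small hypothetical helper (not part of the class above) could expose them:

    def predict_proba(model, x):
        # x must already include the bias column, as model.x does
        return model.sigmoid(x.dot(model.w))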
-    # ------------------------------------------------------------------
-    # 6. Score – accuracy on a given (X, y) pair
-    # ------------------------------------------------------------------
-    def score(self, X: np.ndarray, y: np.ndarray) -> float:
-        """Return the classification accuracy."""
-        y_pred = self.predict(X)
-        return np.mean(y_pred == y)
+    def score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        This method is used to calculate the mean accuracy between the predicted and actual Y values.
+        """
+        y_pred = self.predict(x)
+        y_true = np.asarray(y).astype(int)
+        return np.mean(y_pred == y_true)  # fraction of matching Y values
 if __name__ == "__main__":
     columns = [
         'ID', 'Diagnosis',
         'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
         'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
         'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
         'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
         'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
         'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
     ]

     df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
@@ -155,7 +137,7 @@ if __name__ == "__main__":
         df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values

     df.dropna(inplace=True)  # remove null values
-    print(f"Rows remaining after dropping null values: {len(df)}")
+    print(f"Rows remaining after dropping null values: {len(df)}\n")
     for col in num_cols:
         df = df[df[col] >= 0]
@@ -172,33 +154,40 @@ if __name__ == "__main__":
     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."

-    df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0})  # making diagnosis numeric
-    df['Diagnosis'] = df['Diagnosis'].astype('category')
+    # making diagnosis numeric
+    df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")

-    # ---- 7.2 Instantiate and train ------------------------------------
-    model = LogisticRegressionGD(learning_rate=0.05,
-                                 n_iter=5000,
-                                 tolerance=1e-6,
-                                 verbose=True)
-
-    # we need to split X / y here
-    X = df.drop(columns=['Diagnosis'])
-    y = df['Diagnosis'].cat.codes.values  # 0/1 array
-
-    # Standardise X inside the model for us – we'll do it in `prepare`
-    model.X_ = (X - X.mean()) / X.std()  # bias column will be added later
-    model.X_ = np.hstack([np.ones((model.X_.shape[0], 1)), model.X_])  # add bias
-    model.y_ = y
-
-    # Fit the model
+    rng = np.random.default_rng(seed=42)
+    n_train = len(df)
+    indices = rng.permutation(n_train)
+    train_size = int(0.8 * n_train)
+
+    train_idx = indices[:train_size]
+    test_idx = indices[train_size:]
+
+    df_train = df.iloc[train_idx].reset_index(drop=True)
+    df_test = df.iloc[test_idx].reset_index(drop=True)
+
+    # training of the model
+    model = LogisticRegression(learning_rate=0.00005, n_iter=5000, tolerance=1e-6, verbose=True)
+    # other values could be used, for example (lr=0.01, n_iter=2000, tolerance=1e-3, verbose=False)
+    model.prepare(df_train, target_col="Diagnosis")
     model.fit()

-    # -------------------------------------------------
-    # 8. Evaluate on the same data (you could split)
-    # -------------------------------------------------
-    acc = model.score(model.X_, model.y_)
-    print(f"Training accuracy (on the whole cleaned set): {acc:.4f}")
-
-    # Example: predict on the first 10 samples
-    y_hat = model.predict(model.X_[:10])
-    print("First 10 predictions:", y_hat)
+    # evaluation of the model
+    train_acc = model.score(model.x, model.y)
+    print(f"\nMean accuracy on training data: {train_acc:.4f}")
+
+    # build the test X matrix the same way `prepare` builds the training one
+    x_test_raw = df_test.drop(columns=['Diagnosis']).values.astype(np.float64)
+    x_test_scaled = (x_test_raw - model.mean) / model.std
+    bias_test = np.ones((x_test_scaled.shape[0], 1), dtype=np.float64)
+    X_test = np.hstack((bias_test, x_test_scaled))
+    y_test = df_test['Diagnosis'].values.astype(int)
+    test_acc = model.score(X_test, y_test)
+    print(f"Mean accuracy on testing data: {test_acc:.4f}")
+
+    # predict Y values using the trained data
+    first_10 = X_test[:10]
+    y_hat = model.predict(first_10)
+    print("\nFirst 10 predictions:", y_hat.ravel())
@@ -3,8 +3,8 @@ import pandas as pd

 class LinearRegression:
     '''
-    Constructor for the Linear Regression with mini-batch stochastic gradient descent. It uses learning rate,
-    iteration number, batch size, bias and verbose. It also initializes the weight, mean and std.
+    Constructor for the linear regression with mini-batch stochastic gradient descent. It uses learning rate,
+    iteration number, batch size, bias and verbose. It also initializes the weight, mean and standard deviation.
     '''
     def __init__(self, lr, n_iter, batch_size, add_bias, verbose):
         self.lr = lr  # learning rate
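Note: the mini-batch stochastic gradient descent fit itself is not shown in this hunk; a minimal sketch of one epoch for linear regression with squared error, under assumed names (X already carries the bias column, w is the weight vector), would be:

    import numpy as np

    def sgd_epoch(X: np.ndarray, y: np.ndarray, w: np.ndarray,
                  lr: float = 0.01, batch_size: int = 32,
                  rng: np.random.Generator = np.random.default_rng(0)) -> np.ndarray:
        idx = rng.permutation(len(y))  # shuffle once per epoch
        for start in range(0, len(y), batch_size):
            batch = idx[start:start + batch_size]
            Xb, yb = X[batch], y[batch]
            grad = 2.0 * Xb.T @ (Xb @ w - yb) / len(yb)  # gradient of the mean squared error
            w = w - lr * grad
        return w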
@@ -90,7 +90,7 @@ class LinearRegression:

     def predict(self, x: pd.DataFrame) -> pd.Series:
         '''
-        Predict method makes X prediction by multiplying X and weight vectors.
+        Predict method is used on the trained model to do Y prediction by multiplying the X and weight vectors.
         '''
         if self.w is None:  # if weight is empty, throw error
             raise RuntimeError("Model is not fitted yet. Call `fit` first.")

@@ -101,7 +101,7 @@ class LinearRegression:
     def score(self, x: pd.DataFrame, y: pd.Series) -> float:
         '''
         This method is used to calculate the coefficient of determination to assess the goodness
-        of fit from a regression model
+        of fit of the linear regression model
         '''
         y_pred = self.predict(x)  # predicts the Y values with the predict method
         y = pd.Series(y).astype('float64')
@@ -133,7 +133,7 @@ if __name__ == "__main__":
         df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values

     df.dropna(inplace=True)  # remove null values
-    print(f"Rows remaining after dropping null values: {len(df)}")
+    print(f"Rows remaining after dropping null values: {len(df)}\n")

     # sanity checks for data validity - realistic Parkinson's data range estimates
     df = df[(df['age'] >= 18) & (df['age'] <= 95)]

@@ -164,12 +164,9 @@ if __name__ == "__main__":

     # evaluation of the model
     print("\nR² on training data:", model.score(x_train, y_train))
-    print("\nR² on testing data:", model.score(x_test, y_test))
+    print("R² on testing data:", model.score(x_test, y_test))

     # predict Y values using the trained data
     preds = model.predict(x_test)
-    print("\nFirst 5 predictions:")
-    print(preds.head())
+    print("\nFirst 10 predictions:")
+    print(preds.head(10))

-    print("\nWeights:")
-    print(model.w.round(4))
@@ -1,28 +1,143 @@
 import numpy as np
 import pandas as pd

-'''
 class LogisticRegression:
-    def __init__(self):
-
-    def prepare(self):
-
-    def fit(self):
-
-    def predict(self):
-
-    def score(self):
-'''
+    '''
+    Constructor for the logistic regression with mini-batch gradient descent. It uses learning rate,
+    iteration number, batch size, tolerance and verbose. It also initializes the weight, loss, x, y,
+    mean and standard deviation.
+    '''
+
+    def __init__(self, learning_rate: float, n_iter: int, batch_size: int, tolerance: float, verbose: bool) -> None:
+        self.lr = learning_rate
+        self.n_iter = n_iter
+        self.batch_size = batch_size
+        self.tol = tolerance
+        self.verbose = verbose
+        self.w: np.ndarray | None = None     # weight/coefficient (bias as first element)
+        self.loss: list[float] = []          # loss per iteration
+        self.x: np.ndarray | None = None     # matrix of inputs after standardisation
+        self.y: np.ndarray | None = None     # target vector
+        self.mean: np.ndarray | None = None  # used for standardisation
+        self.std: np.ndarray | None = None   # standard deviation
+
+    @staticmethod
+    def sigmoid(z: np.ndarray) -> np.ndarray:
+        """Sigmoid function for the logistic regression model."""
+        return 1.0 / (1.0 + np.exp(-z))  # 1/(1+exp(-z))
+
+    @staticmethod
+    def cost(y: np.ndarray, p: np.ndarray) -> float:
+        """Cross-entropy loss is used for the cost calculation."""
+        eps = 1e-15
+        p = np.clip(p, eps, 1 - eps)
+        return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
+    def prepare(self, df: pd.DataFrame, target_col: str) -> None:
+        """
+        Preparation method splits df into x and y. It defines the X and Y values from the dataframe and target column.
+        Then it does standardisation, adds the bias and initializes the weight/coefficient.
+        """
+        if target_col not in df.columns:
+            raise ValueError(f"Target column '{target_col}' not found in DataFrame.")
+
+        self.y = df[target_col].values.astype(np.int64)
+
+        x_raw = df.drop(columns=[target_col]).values.astype(np.float64)
+
+        # standardisation
+        self.mean = x_raw.mean(axis=0)
+        self.std = x_raw.std(axis=0)
+        self.std[self.std == 0] = 1.0
+
+        x_scaled = (x_raw - self.mean) / self.std  # standardisation formula
+
+        bias = np.ones((x_scaled.shape[0], 1), dtype=np.float64)  # adding bias
+        self.x = np.hstack((bias, x_scaled))
+
+        self.w = np.zeros(self.x.shape[1], dtype=np.float64)  # initialize weight as zero
+    def fit(self) -> None:
+        """
+        Fit method to fit the X and Y data and train the logistic model by mini-batch gradient descent.
+        For n iterations, it finds probabilities through the sigmoid of the linear prediction and uses the
+        gradient to update the weights and track the loss.
+        """
+        if self.x is None or self.y is None:  # if x or y are empty, throw error
+            raise RuntimeError("Model is not prepared yet. Call `prepare` first.")
+
+        n_samples = self.x.shape[0]
+        batch_size = self.batch_size or n_samples
+
+        for epoch in range(1, self.n_iter + 1):
+            shuffled_idx = np.random.permutation(n_samples)  # random permutation of the indices
+            x_shuffled = self.x[shuffled_idx]
+            y_shuffled = self.y[shuffled_idx]
+
+            # process execution for each mini-batch
+            for start in range(0, n_samples, batch_size):
+                end = start + batch_size
+                x_batch = x_shuffled[start:end]  # the rows are already shuffled, so plain slicing works
+                y_batch = y_shuffled[start:end]
+
+                z = x_batch.dot(self.w)
+                p = self.sigmoid(z)
+
+                grad = x_batch.T.dot(p - y_batch) / y_batch.size  # gradient calculation formula
+                self.w -= self.lr * grad  # gradient multiplied by learning rate is subtracted from the weight
+
+            # cost is calculated through cross-entropy on the full training set once per epoch
+            loss = self.cost(self.y, self.sigmoid(self.x.dot(self.w)))
+            self.loss.append(loss)
+
+            # if verbose, it shows the loss every 100 iterations
+            if self.verbose and epoch % 100 == 0:
+                print(f"Iter {epoch:4d} – loss: {loss:.6f}")
+
+            # tests whether the absolute change in loss is smaller than the tolerance
+            if epoch > 1 and abs(self.loss[-2] - loss) < self.tol:
+                if self.verbose:
+                    print(f"Converged after {epoch} iterations.")
+                break  # the loss has stopped improving, so further training would be unnecessary
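Note: with the slicing above, an epoch over, say, 455 training rows (80% of the 569 WDBC instances) and batch_size = 64 produces the batches [0:64], [64:128], ..., [448:455]; the final batch is simply shorter, and dividing by y_batch.size keeps the gradient scale correct for it. The loss is evaluated on the full training set once per epoch, so self.loss holds one entry per epoch rather than per batch.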
+    def predict(self, x: np.ndarray | pd.DataFrame) -> np.ndarray:
+        """
+        Predict method is used on the trained model to do Y prediction by multiplying the X and weight vectors
+        and then applying the sigmoid function to get the model probability.
+        """
+        if isinstance(x, pd.DataFrame):  # verifies the value type
+            x = x.values.astype(np.float64)
+        if x.ndim == 1:
+            x = x.reshape(1, -1)
+        z = x.dot(self.w)
+        probs = self.sigmoid(z)  # probability calculation through the sigmoid method
+        return (probs >= 0.5).astype(int)  # 0.5 is the conventional cutoff for a positive prediction
+
+    def score(self, x: np.ndarray | pd.DataFrame, y: np.ndarray | pd.Series) -> float:
+        """
+        This method is used to calculate the mean accuracy between the predicted and actual Y values.
+        """
+        y_pred = self.predict(x)
+        y_true = np.asarray(y).astype(int)
+        return np.mean(y_pred == y_true)  # fraction of matching Y values
 if __name__ == "__main__":
     columns = [
         'ID', 'Diagnosis',
         'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
         'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
         'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
         'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se',
         'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
         'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
     ]

     df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
@@ -39,7 +154,7 @@ if __name__ == "__main__":
         df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values

     df.dropna(inplace=True)  # remove null values
-    print(f"Rows remaining after dropping null values: {len(df)}")
+    print(f"Rows remaining after dropping null values: {len(df)}\n")
     for col in num_cols:
         df = df[df[col] >= 0]
@@ -56,5 +171,40 @@ if __name__ == "__main__":
     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."

-    df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0})  # making diagnosis numeric
-    df['Diagnosis'] = df['Diagnosis'].astype('category')
+    # making diagnosis numeric
+    df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")
+
+    rng = np.random.default_rng(seed=42)
+    n_samples = len(df)
+    indices = rng.permutation(n_samples)
+    train_size = int(0.8 * n_samples)
+
+    train_idx = indices[:train_size]
+    test_idx = indices[train_size:]
+
+    df_train = df.iloc[train_idx].reset_index(drop=True)
+    df_test = df.iloc[test_idx].reset_index(drop=True)
+
+    # training of the model
+    model = LogisticRegression(learning_rate=0.00005, n_iter=5000, batch_size=64, tolerance=1e-6, verbose=True)
+    # other values could be used, for example (lr=0.01, n_iter=2000, tolerance=1e-3, verbose=False)
+    model.prepare(df_train, target_col="Diagnosis")
+    model.fit()
+
+    # evaluation of the model
+    train_acc = model.score(model.x, model.y)
+    print(f"\nMean accuracy on training data: {train_acc:.4f}")
+
+    # build the test X matrix the same way `prepare` builds the training one
+    x_test_raw = df_test.drop(columns=['Diagnosis']).values.astype(np.float64)
+    x_test_scaled = (x_test_raw - model.mean) / model.std
+    bias_test = np.ones((x_test_scaled.shape[0], 1), dtype=np.float64)
+    X_test = np.hstack((bias_test, x_test_scaled))
+    y_test = df_test['Diagnosis'].values.astype(int)
+    test_acc = model.score(X_test, y_test)
+    print(f"Mean accuracy on testing data: {test_acc:.4f}")
+
+    # predict Y values using the trained data
+    first_10 = X_test[:10]
+    y_hat = model.predict(first_10)
+    print("\nFirst 10 predictions:", y_hat.ravel())
parkinsons_updrs.names (new executable file, 107 lines)

@@ -0,0 +1,107 @@
Parkinsons Telemonitoring Data Set

Abstract: Oxford Parkinson's Disease Telemonitoring Dataset

============================================================

Data Set Characteristics: Multivariate
Attribute Characteristics: Integer, Real
Associated Tasks: Regression
Number of Instances: 5875
Number of Attributes: 26
Area: Life
Date Donated: 2009-10-29

============================================================

SOURCE:

The dataset was created by Athanasios Tsanas (tsanasthanasis '@' gmail.com)
and Max Little (littlem '@' physics.ox.ac.uk) of the University of Oxford, in
collaboration with 10 medical centers in the US and Intel Corporation who
developed the telemonitoring device to record the speech signals. The
original study used a range of linear and nonlinear regression methods to
predict the clinician's Parkinson's disease symptom score on the UPDRS scale.

============================================================

DATA SET INFORMATION:

This dataset is composed of a range of biomedical voice measurements from 42
people with early-stage Parkinson's disease recruited to a six-month trial of
a telemonitoring device for remote symptom progression monitoring. The
recordings were automatically captured in the patients' homes.

Columns in the table contain subject number, subject age, subject gender,
time interval from baseline recruitment date, motor UPDRS, total UPDRS, and
16 biomedical voice measures. Each row corresponds to one of 5,875 voice
recordings from these individuals. The main aim of the data is to predict the
motor and total UPDRS scores ('motor_UPDRS' and 'total_UPDRS') from the 16
voice measures.

The data is in ASCII CSV format. The rows of the CSV file contain an instance
corresponding to one voice recording. There are around 200 recordings per
patient, and the subject number of the patient is identified in the first column.
For further information or to pass on comments, please contact Athanasios
Tsanas (tsanasthanasis '@' gmail.com) or Max Little (littlem '@'
physics.ox.ac.uk).

Further details are contained in the following reference -- if you use this
dataset, please cite:
Athanasios Tsanas, Max A. Little, Patrick E. McSharry, Lorraine O. Ramig (2009),
'Accurate telemonitoring of Parkinson's disease progression by non-invasive
speech tests',
IEEE Transactions on Biomedical Engineering (to appear).

Further details about the biomedical voice measures can be found in:
Max A. Little, Patrick E. McSharry, Eric J. Hunter, Lorraine O. Ramig (2009),
'Suitability of dysphonia measurements for telemonitoring of Parkinson's
disease',
IEEE Transactions on Biomedical Engineering, 56(4):1015-1022

===========================================================

ATTRIBUTE INFORMATION:

subject# - Integer that uniquely identifies each subject
age - Subject age
sex - Subject gender: '0' - male, '1' - female
test_time - Time since recruitment into the trial. The integer part is the
    number of days since recruitment.
motor_UPDRS - Clinician's motor UPDRS score, linearly interpolated
total_UPDRS - Clinician's total UPDRS score, linearly interpolated
Jitter(%), Jitter(Abs), Jitter:RAP, Jitter:PPQ5, Jitter:DDP - Several measures of
    variation in fundamental frequency
Shimmer, Shimmer(dB), Shimmer:APQ3, Shimmer:APQ5, Shimmer:APQ11, Shimmer:DDA -
    Several measures of variation in amplitude
NHR, HNR - Two measures of the ratio of noise to tonal components in the voice
RPDE - A nonlinear dynamical complexity measure
DFA - Signal fractal scaling exponent
PPE - A nonlinear measure of fundamental frequency variation

===========================================================

RELEVANT PAPERS:

Little MA, McSharry PE, Hunter EJ, Ramig LO (2009),
'Suitability of dysphonia measurements for telemonitoring of Parkinson's disease',
IEEE Transactions on Biomedical Engineering, 56(4):1015-1022

Little MA, McSharry PE, Roberts SJ, Costello DAE, Moroz IM.
'Exploiting Nonlinear Recurrence and Fractal Scaling Properties for Voice Disorder Detection',
BioMedical Engineering OnLine 2007, 6:23 (26 June 2007)

===========================================================

CITATION REQUEST:

If you use this dataset, please cite the following paper:
A Tsanas, MA Little, PE McSharry, LO Ramig (2009),
'Accurate telemonitoring of Parkinson's disease progression by non-invasive speech tests',
IEEE Transactions on Biomedical Engineering (to appear).
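For reference, a minimal sketch of loading this dataset for the regression scripts above, assuming the accompanying parkinsons_updrs.data file ships with a header row as the UCI distribution does:

    import pandas as pd

    df = pd.read_csv('parkinsons_updrs.data')
    x = df.drop(columns=['subject#', 'motor_UPDRS', 'total_UPDRS'])
    y = df['total_UPDRS']  # one of the two regression targets described above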
wdbc.names (new executable file, 140 lines)

@@ -0,0 +1,140 @@
1. Title: Wisconsin Diagnostic Breast Cancer (WDBC)

2. Source Information

a) Creators:

    Dr. William H. Wolberg, General Surgery Dept., University of
    Wisconsin, Clinical Sciences Center, Madison, WI 53792
    wolberg@eagle.surgery.wisc.edu

    W. Nick Street, Computer Sciences Dept., University of
    Wisconsin, 1210 West Dayton St., Madison, WI 53706
    street@cs.wisc.edu  608-262-6619

    Olvi L. Mangasarian, Computer Sciences Dept., University of
    Wisconsin, 1210 West Dayton St., Madison, WI 53706
    olvi@cs.wisc.edu

b) Donor: Nick Street

c) Date: November 1995

3. Past Usage:

first usage:

    W.N. Street, W.H. Wolberg and O.L. Mangasarian.
    Nuclear feature extraction for breast tumor diagnosis.
    IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science
    and Technology, volume 1905, pages 861-870, San Jose, CA, 1993.

OR literature:

    O.L. Mangasarian, W.N. Street and W.H. Wolberg.
    Breast cancer diagnosis and prognosis via linear programming.
    Operations Research, 43(4), pages 570-577, July-August 1995.

Medical literature:

    W.H. Wolberg, W.N. Street, and O.L. Mangasarian.
    Machine learning techniques to diagnose breast cancer from
    fine-needle aspirates.
    Cancer Letters 77 (1994) 163-171.

    W.H. Wolberg, W.N. Street, and O.L. Mangasarian.
    Image analysis and machine learning applied to breast cancer
    diagnosis and prognosis.
    Analytical and Quantitative Cytology and Histology, Vol. 17
    No. 2, pages 77-87, April 1995.

    W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian.
    Computerized breast cancer diagnosis and prognosis from fine
    needle aspirates.
    Archives of Surgery 1995;130:511-516.

    W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian.
    Computer-derived nuclear features distinguish malignant from
    benign breast cytology.
    Human Pathology, 26:792-796, 1995.

See also:
    http://www.cs.wisc.edu/~olvi/uwmp/mpml.html
    http://www.cs.wisc.edu/~olvi/uwmp/cancer.html

Results:

    - predicting field 2, diagnosis: B = benign, M = malignant
    - sets are linearly separable using all 30 input features
    - best predictive accuracy obtained using one separating plane
      in the 3-D space of Worst Area, Worst Smoothness and
      Mean Texture. Estimated accuracy 97.5% using repeated
      10-fold crossvalidations. Classifier has correctly
      diagnosed 176 consecutive new patients as of November
      1995.

4. Relevant information

    Features are computed from a digitized image of a fine needle
    aspirate (FNA) of a breast mass. They describe
    characteristics of the cell nuclei present in the image.
    A few of the images can be found at
    http://www.cs.wisc.edu/~street/images/

    Separating plane described above was obtained using
    Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree
    Construction Via Linear Programming." Proceedings of the 4th
    Midwest Artificial Intelligence and Cognitive Science Society,
    pp. 97-101, 1992], a classification method which uses linear
    programming to construct a decision tree. Relevant features
    were selected using an exhaustive search in the space of 1-4
    features and 1-3 separating planes.

    The actual linear program used to obtain the separating plane
    in the 3-dimensional space is that described in:
    [K. P. Bennett and O. L. Mangasarian: "Robust Linear
    Programming Discrimination of Two Linearly Inseparable Sets",
    Optimization Methods and Software 1, 1992, 23-34].

    This database is also available through the UW CS ftp server:

    ftp ftp.cs.wisc.edu
    cd math-prog/cpo-dataset/machine-learn/WDBC/

5. Number of instances: 569

6. Number of attributes: 32 (ID, diagnosis, 30 real-valued input features)

7. Attribute information

    1) ID number
    2) Diagnosis (M = malignant, B = benign)
    3-32)

    Ten real-valued features are computed for each cell nucleus:

    a) radius (mean of distances from center to points on the perimeter)
    b) texture (standard deviation of gray-scale values)
    c) perimeter
    d) area
    e) smoothness (local variation in radius lengths)
    f) compactness (perimeter^2 / area - 1.0)
    g) concavity (severity of concave portions of the contour)
    h) concave points (number of concave portions of the contour)
    i) symmetry
    j) fractal dimension ("coastline approximation" - 1)

    Several of the papers listed above contain detailed descriptions of
    how these features are computed.

    The mean, standard error, and "worst" or largest (mean of the three
    largest values) of these features were computed for each image,
    resulting in 30 features. For instance, field 3 is Mean Radius, field
    13 is Radius SE, field 23 is Worst Radius.

    All feature values are recorded with four significant digits.

8. Missing attribute values: none

9. Class distribution: 357 benign, 212 malignant
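For reference, the 30 feature names used by the training scripts above follow directly from this attribute description (ten base features, each with a mean, standard error, and "worst" value); a sketch that generates them in the same order as the scripts' columns list:

    base = ['radius', 'texture', 'perimeter', 'area', 'smoothness',
            'compactness', 'concavity', 'concave_points', 'symmetry',
            'fractal_dimension']
    columns = ['ID', 'Diagnosis'] + [f'{b}_{s}' for s in ('mean', 'se', 'worst') for b in base]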