From d516e979f9d4109b4397a6bedb3ca4bb5bebac28 Mon Sep 17 00:00:00 2001
From: ShaaniBel
Date: Tue, 23 Sep 2025 09:15:20 -0400
Subject: [PATCH] .

---
 linear-regression-parkinsons.py | 68 +++++++++++++++++++++++++++++++++
 logistic-regression-wdbc.py     | 67 ++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+)

diff --git a/linear-regression-parkinsons.py b/linear-regression-parkinsons.py
index 0104df7..fe2106b 100644
--- a/linear-regression-parkinsons.py
+++ b/linear-regression-parkinsons.py
@@ -93,6 +93,74 @@ if __name__ == "__main__":
 
     df.replace(['?','NA', 'na', ''], pd.NA, inplace=True) # replace null values with NA identifier
 
+    #___________________________
+    '''
+    # missing_parkinson = Parkinson[Parkinson.eq('?').any(axis=1)]
+    # print(len(missing_parkinson))
+    # no missing values in our dataset, but just in case:
+    Parkinson = Parkinson[~Parkinson.eq('?').any(axis=1)]
+
+    # duplicate rows?
+    # duplicates = Parkinson.duplicated().sum()
+    # print(duplicates)
+    # no duplicates, but just in case:
+    Parkinson = Parkinson.drop_duplicates()
+
+    # check data types --> no problem
+    # print(Parkinson.dtypes)
+
+    # check for highly correlated features --> ensure uniqueness of solution
+    # find them, then note for 3rd phase
+    '''
+    """
+    # https://www.projectpro.io/recipes/drop-out-highly-correlated-features-in-python
+    # 0 indicates no correlation and 1 indicates perfect correlation
+    corr_matrix = Parkinson.corr().abs()
+    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+
+    # find features with correlation greater than 0.95
+    high_corr_features = []
+    for col in upper.columns:
+        high_corr = upper[col][upper[col] > 0.95]
+        if not high_corr.empty:
+            high_corr_features.append((col, high_corr.index.tolist()))
+
+    if high_corr_features:
+        print("correlated features (>0.95):")
+        for feature, correlated_with in high_corr_features:
+            print(f" {feature} AND {correlated_with}")
+    """
+    '''
+    # repeated fields --> removed for now since they might not be relevant (needs testing to decide whether to keep them later)
+    Parkinson = Parkinson.drop(Parkinson.columns[0:3], axis=1)
+
+    # ____________________________________________________________________________________
+    # HANDLE OUTLIERS AND INCONSISTENCIES
+    # https://medium.com/@heyamit10/pandas-outlier-detection-techniques-e9afece3d9e3
+    # if z-score more than 3 --> outlier
+    # print(Parkinson.head().to_string())
+
+    # ____________________________________________________________________________________
+
+    # prepare data for regression:
+    # separate dependent VS independent variables
+    feature_columns = [col for col in Parkinson.columns if col not in ['motor_UPDRS', 'total_UPDRS', 'subject#']]
+    X = Parkinson[feature_columns]
+    y = Parkinson['motor_UPDRS']
+
+    # normalize / scale features (if not already done)
+    # !!! only for X, not y !!!
+    # normalize = Parkinson.drop(Parkinson.columns[0:6], axis=1)
+    # normalize = (normalize - normalize.mean()) / normalize.std()
+    # Parkinson[Parkinson.columns[6:]] = normalize
+
+    # turn into arrays for regression
+    X = X.to_numpy()
+    y = y.to_numpy()
+
+    # split data into train (80%) / test (20%) datasets; stratify is omitted since y is continuous
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+'''
     num_cols = [
         'age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS',
         'Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP',
diff --git a/logistic-regression-wdbc.py b/logistic-regression-wdbc.py
index 52d4664..46dd311 100644
--- a/logistic-regression-wdbc.py
+++ b/logistic-regression-wdbc.py
@@ -125,7 +125,74 @@ if __name__ == "__main__":
     df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
     df.drop(columns=['ID'], inplace=True) # drops id column
 
+    '''
+    # load data set into pandas objects --> easier to clean
+    url = 'https://raw.githubusercontent.com/ShaaniBel/datasets/refs/heads/main/wdbc.data'
+    cancer = pd.read_csv(url, header=None)
+    # ID should be dropped --> remove 1st column
+    cancer = cancer.drop(cancer.columns[0], axis=1)
+
+    # need to encode the B/M labels as 0/1
+    cancer[cancer.columns[0]] = cancer[cancer.columns[0]].map({'B': 0, 'M': 1})
+
+    # no missing values in our dataset, but just in case:
+    cancer = cancer[~cancer.eq('?').any(axis=1)]
+
+    # no duplicate rows, but just in case:
+    cancer = cancer.drop_duplicates()
+
+    # check data types --> everything is good
+    # print(cancer.dtypes)
+    '''
+    # check for highly correlated features and write them down
+    '''
+    corr_matrix = cancer.corr().abs()
+    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+
+    # find features with correlation greater than 0.95
+    high_corr_features = []
+    for col in upper.columns:
+        high_corr = upper[col][upper[col] > 0.95]
+        if not high_corr.empty:
+            high_corr_features.append((col, high_corr.index.tolist()))
+
+    if high_corr_features:
+        print("correlated features (>0.95):")
+        for feature, correlated_with in high_corr_features:
+            print(f" {feature} AND {correlated_with}")
+    '''
+    '''
+    # ____________________________________________________________________________________
+    # HANDLE OUTLIERS AND INCONSISTENCIES
+    # https://medium.com/@heyamit10/pandas-outlier-detection-techniques-e9afece3d9e3
+    # if z-score more than 3 --> outlier
+    # print(cancer.head().to_string())
+
+    # ____________________________________________________________________________________
+
+    # separate dependent VS independent variables
+    X = cancer.drop(cancer.columns[0], axis=1)
+    y = cancer[1]
+
+    # print(X.head().to_string())
+
+    # normalize data
+    # normalize = cancer.drop(cancer.columns[0], axis=1)
+    # normalize = (normalize - normalize.mean()) / normalize.std()
+    # cancer[cancer.columns[1:]] = normalize
+    # print(cancer.head().to_string())
+
+    # turn into arrays for regression
+    X = X.to_numpy()
+    y = y.to_numpy()
+
+    # cancer_y = np.asarray(cancer2[0].tolist())
+    # cancer2.drop(cancer2[0], axis=1, inplace=True)
+
+    # split data into train / test datasets (stratified on the class label)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
+'''
 
     missing_rows = df[df.isin(['?', 'NA', 'na', '']).any(axis=1)] # checks null values
     print(f"Rows with null values: {len(missing_rows)}")
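
Note: both hunks cite the z-score rule (flag a row as an outlier when |z| > 3) for the outlier-handling step but leave it unimplemented. A minimal sketch of that step, assuming a numeric pandas DataFrame such as Parkinson and the 3-sigma threshold from the linked article; the helper name drop_zscore_outliers is illustrative, not from the patch:

    import pandas as pd

    def drop_zscore_outliers(df, cols, threshold=3.0):
        """Keep only rows whose z-score stays within +/- threshold on every listed column."""
        z = (df[cols] - df[cols].mean()) / df[cols].std()  # column-wise z-scores
        keep = (z.abs() <= threshold).all(axis=1)          # True only for rows with no outlying value
        return df[keep]

    # usage, e.g.: Parkinson = drop_zscore_outliers(Parkinson, num_cols)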
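
Note: the commented-out normalization standardizes the whole dataset before splitting, which leaks test-set statistics into the training data. A sketch of the usual alternative, fitting the mean and standard deviation on the training split only (plain NumPy; X and y are the arrays built in the patch):

    import numpy as np
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    mu = X_train.mean(axis=0)       # statistics computed on the training split only
    sigma = X_train.std(axis=0)
    sigma[sigma == 0] = 1.0         # guard against constant columns

    X_train = (X_train - mu) / sigma
    X_test = (X_test - mu) / sigma  # apply the same train-fitted transform to the test set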