From 7268194beabaed3cb7cb5668aaab828e89b38307 Mon Sep 17 00:00:00 2001
From: ShaaniBel
Date: Fri, 26 Sep 2025 17:14:56 -0400
Subject: [PATCH] .

---
 linear-regression-parkinsons.py | 49 ++++++++++------------------
 logistic-regression-wdbc.py     | 57 ++++++++++++++------------------
 2 files changed, 41 insertions(+), 65 deletions(-)

diff --git a/linear-regression-parkinsons.py b/linear-regression-parkinsons.py
index e95e600..4274e17 100644
--- a/linear-regression-parkinsons.py
+++ b/linear-regression-parkinsons.py
@@ -93,29 +93,21 @@ if __name__ == "__main__":
     df.replace(['?','NA', 'na', ''], pd.NA, inplace=True) # replace null values with NA identifier
 
-    #___________________________
-    '''
-    # missing_parkinson = Parkinson[Parkinson.eq('?').any(axis=1)]
-    # print(len(missing_parkinson))
-    # no missing values in our dataset but still in case:
-    Parkinson = Parkinson[~Parkinson.eq('?').any(axis=1)]
+    # check data types --> no problem
+    # print(df.dtypes)
 
     # duplicate rows???
-    # duplicates = Parkinson.duplicated().sum()
-    # print(duplicates)
+    duplicates = df.duplicated().sum()
+    print(f"Num of duplicated rows: {duplicates}")
     # no duplicates but just in case:
-    Parkinson = Parkinson.drop_duplicates()
-
-    # check data types --> no problem
-    # print(Parkinson.dtypes)
+    df = df.drop_duplicates()
 
     # check for highly correlated features --> ensure uniqueness of solution
     # find them then note for 3rd phase
-    '''
-    """
-    #https://www.projectpro.io/recipes/drop-out-highly-correlated-features-in-python
-    #0 indicates no correlation and 1 indicates perfect correlation
-    corr_matrix = Parkinson.corr().abs()
+
+    # Further experiments
+    # 0 indicates no correlation and 1 indicates perfect correlation
+    corr_matrix = df.corr().abs()
     upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
 
     # find features with correlation greater than 0.95
@@ -129,7 +121,12 @@ if __name__ == "__main__":
         print("correlated features (>0.95):")
         for feature, correlated_with in high_corr_features:
             print(f" {feature} AND {correlated_with}")
-    """
+
+    # check for weak correlation with target
+    target_corr = df.corr()['motor_UPDRS'].abs().sort_values(ascending=False)
+    print("Correlation with target variable (descending order):")
+    print(target_corr)
+
     '''
     # repeated fields --> for now I removed them since they might not be too relevant (need testing to see if we keep them later)
     Parkinson = Parkinson.drop(Parkinson.columns[0:3], axis=1)
@@ -142,12 +139,6 @@ if __name__ == "__main__":
 
     # ____________________________________________________________________________________
 
-    # Prepare Data for regression
-    # separate dependent VS independent variables
-    feature_columns = [col for col in Parkinson.columns if col not in ['motor_UPDRS', 'total_UPDRS', 'subject#']]
-    X = Parkinson[feature_columns]
-    y = Parkinson['motor_UPDRS']
-
     # normalize / scale features? if not already done
     # !!!!!!!!!!only for X not y!!!!!!!!!!!
     # normalize = Parkinson.drop(Parkinson.columns[0:6], axis=1)
@@ -161,14 +152,7 @@ if __name__ == "__main__":
     # split data into train 80% / tests datasets 20%
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
     '''
-    num_cols = [
-        'age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS',
-        'Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP',
-        'Shimmer', 'Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
-        'Shimmer:APQ11', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE'
-    ]
-
-    for col in num_cols:
+    for col in df:
         df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values
 
     df.dropna(inplace=True) # remove null values
@@ -191,6 +175,7 @@ if __name__ == "__main__":
     x = df[feature_columns]
     y = df['motor_UPDRS']
 
+    # train / test splitting (80 / 20)
    n_train = int(0.8 * len(x))
    x_train, x_test = x.iloc[:n_train], x.iloc[n_train:]
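Both scripts now run the same upper-triangle correlation screen, so the step is worth reading once in isolation. Below is a minimal standalone sketch of that screen, assuming a numeric pandas DataFrame; the helper name find_correlated_pairs and the toy frame are illustrative, not part of the patch.

import numpy as np
import pandas as pd

def find_correlated_pairs(df, threshold=0.95):
    """Return (column, correlated-with) pairs whose |corr| exceeds threshold."""
    corr_matrix = df.corr().abs()
    # keep only the upper triangle so each pair is reported once
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    pairs = []
    for col in upper.columns:
        high = upper[col][upper[col] > threshold]
        if not high.empty:
            pairs.append((col, high.index.tolist()))
    return pairs

# toy usage: 'b' is an exact multiple of 'a', so the pair is flagged
demo = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [2.0, 4.0, 6.0], 'c': [1.0, 0.0, 5.0]})
print(find_correlated_pairs(demo))  # [('b', ['a'])]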
diff --git a/logistic-regression-wdbc.py b/logistic-regression-wdbc.py
index 46dd311..bde2cf9 100644
--- a/logistic-regression-wdbc.py
+++ b/logistic-regression-wdbc.py
@@ -124,44 +124,14 @@ if __name__ == "__main__":
 
     df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
 
-    df.drop(columns=['ID'], inplace=True) # drops id column
-    '''
-    # load data set into pandas objects --> easier to clean
-    url = 'https://raw.githubusercontent.com/ShaaniBel/datasets/refs/heads/main/wdbc.data'
-    cancer = pd.read_csv(url, header=None)
-    # ID should be dropped --> remove 1st row
-    cancer = cancer.drop(cancer.columns[0], axis=1)
-
-    # need to encode the B/M into 0/1
-    cancer[cancer.columns[0]] = cancer[cancer.columns[0]].map({'B': 0, 'M': 1})
-
-    # no missing values in our dataset but still in case:
-    cancer = cancer[~cancer.eq('?').any(axis=1)]
+    df.drop(columns=['ID'], inplace=True) # drops id column
 
     # no duplicate rows but just in case:
-    cancer = cancer.drop_duplicates()
-
+    df = df.drop_duplicates()
 
     # check data types: --> everything is good
-    # print(cancer.dtypes)
-    '''
-    # check for highly correlated features and write them down
-    '''
-    corr_matrix = cancer.corr().abs()
-    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+    # print(df.dtypes)
 
-    # find features with correlation greater than 0.95
-    high_corr_features = []
-    for col in upper.columns:
-        high_corr = upper[col][upper[col] > 0.95]
-        if not high_corr.empty:
-            high_corr_features.append((col, high_corr.index.tolist()))
-
-    if high_corr_features:
-        print("correlated features (>0.95):")
-        for feature, correlated_with in high_corr_features:
-            print(f" {feature} AND {correlated_with}")
-    '''
     '''
     # ____________________________________________________________________________________
     # HANDLE OUTLIERS AND INCONSISTENCIES
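Because wdbc.data is loaded with dtype=str, the dtype check above will keep showing object columns until the features are coerced to numeric. A short sketch of that coercion, assuming Diagnosis is the only non-numeric column; the toy column names are illustrative, not the full WDBC schema.

import pandas as pd

# toy stand-in for the string-typed frame produced by read_csv(..., dtype=str)
df = pd.DataFrame({
    'Diagnosis': ['M', 'B', 'B'],
    'radius_mean': ['17.99', '13.54', '12.45'],
    'texture_mean': ['10.38', '14.36', '15.70'],
})

# coerce every feature column to numeric; unparseable cells become NaN and are dropped
feature_cols = [c for c in df.columns if c != 'Diagnosis']
df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors='coerce')
df.dropna(inplace=True)

print(df.dtypes)  # feature columns are now float64; Diagnosis stays object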
@@ -223,6 +193,27 @@ if __name__ == "__main__":
     # making diagnosis numeric
     df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")
 
+    # check for correlation: radius, area and perimeter trivially have a high correlation
+    corr_matrix = df.corr().abs()
+    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+
+    # find features with correlation greater than 0.90
+    high_corr_features = []
+    for col in upper.columns:
+        high_corr = upper[col][upper[col] > 0.90]
+        if not high_corr.empty:
+            high_corr_features.append((col, high_corr.index.tolist()))
+
+    if high_corr_features:
+        print("correlated features (>0.90):")
+        for feature, correlated_with in high_corr_features:
+            print(f" {feature} AND {correlated_with}")
+
+    # check for weak correlation with target --> the "worst" features have the most impact
+    target_corr = df.corr()['Diagnosis'].abs().sort_values(ascending=False)
+    print("Correlation with target variable (descending order):")
+    print(target_corr)
+
     rng = np.random.default_rng(seed=42)
     n_train = len(df)
     indices = rng.permutation(n_train)
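The wdbc hunk ends just as the shuffled split begins (note the patch reuses the name n_train for the full row count before permuting). A sketch of how such permuted indices are typically consumed for an 80/20 split, assuming df is the cleaned frame; names beyond those in the patch are illustrative.

import numpy as np
import pandas as pd

# toy stand-in for the cleaned WDBC frame
df = pd.DataFrame({'Diagnosis': [1, 0, 1, 0, 1],
                   'radius_mean': [17.9, 13.5, 12.4, 11.2, 20.1]})

rng = np.random.default_rng(seed=42)
indices = rng.permutation(len(df))   # shuffled row positions

split = int(0.8 * len(df))           # 80/20 boundary
train_df = df.iloc[indices[:split]]  # first 80% of the shuffled rows
test_df = df.iloc[indices[split:]]   # remaining 20%

print(len(train_df), len(test_df))   # 4 1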