From bc377aa9fae92136d2fe4ef4f01b35ef1e83bedc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Batuhan=20Berk=20Ba=C5=9Fo=C4=9Flu?=
Date: Fri, 26 Sep 2025 21:21:07 -0400
Subject: [PATCH] Fixed the logistic regression code as well.

---
 logistic-regression-wdbc.py                |  3 +-
 mini-batch-sgd-logistic-regression-wdbc.py | 65 +++++++++++++++++++++-
 2 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/logistic-regression-wdbc.py b/logistic-regression-wdbc.py
index bde2cf9..234c849 100644
--- a/logistic-regression-wdbc.py
+++ b/logistic-regression-wdbc.py
@@ -211,8 +211,9 @@ if __name__ == "__main__":
 
     # check for weak correlation with target --> worsts have the most impact
     target_corr = df.corr()['Diagnosis'].abs().sort_values(ascending=False)
-    print("Correlation with target variable descending order:")
+    print("\nCorrelation with target variable (descending order):")
     print(target_corr)
+    print("") # blank line separator
 
     rng = np.random.default_rng(seed=42)
     n_train = len(df)
diff --git a/mini-batch-sgd-logistic-regression-wdbc.py b/mini-batch-sgd-logistic-regression-wdbc.py
index 66fb5b4..a37a511 100644
--- a/mini-batch-sgd-logistic-regression-wdbc.py
+++ b/mini-batch-sgd-logistic-regression-wdbc.py
@@ -140,8 +140,45 @@ if __name__ == "__main__":
 
     df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
 
+    # ID should be dropped --> remove 1st column
     df.drop(columns=['ID'], inplace=True) # drops id column
+    # no duplicate rows but just in case:
+    df = df.drop_duplicates()
+    # check data types: --> everything is good
+    # print(df.dtypes)
+
+    '''
+    # ____________________________________________________________________________________
+    # HANDLE OUTLIERS AND INCONSISTENCIES
+    # https://medium.com/@heyamit10/pandas-outlier-detection-techniques-e9afece3d9e3
+    # if z-score more than 3 --> outlier
+    # print(cancer.head().to_string())
+
+    # ____________________________________________________________________________________
+
+    # separate dependent VS independent variables
+    X = cancer.drop(cancer.columns[0], axis=1)
+    y = cancer[1]
+
+    # print(X.head().to_string())
+
+    # normalize data
+    # normalize = cancer.drop(cancer.columns[0], axis=1)
+    # normalize = (normalize - normalize.mean()) / normalize.std()
+    # cancer[cancer.columns[1:]] = normalize
+    # print(cancer.head().to_string())
+
+    # turn into array for regression
+    X = X.to_numpy()
+    y = y.to_numpy()
+
+    # cancer_y = np.asarray(cancer2[0].tolist())
+    # cancer2.drop(cancer2[0], axis = 1, inplace = True)
+
+    # split data into train / test datasets
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
+    '''
 
     missing_rows = df[df.isin(['?', 'NA', 'na', '']).any(axis=1)] # checks null values
     print(f"Rows with null values: {len(missing_rows)}")
 
@@ -172,10 +209,32 @@ if __name__ == "__main__":
 
     # making diagnosis numeric
     df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")
 
+    # check for correlation: radius, area and perimeter trivially have a high correlation
+    corr_matrix = df.corr().abs()
+    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+
+    # find features with correlation greater than 0.90
+    high_corr_features = []
+    for col in upper.columns:
+        high_corr = upper[col][upper[col] > 0.90]
+        if not high_corr.empty:
+            high_corr_features.append((col, high_corr.index.tolist()))
+
+    if high_corr_features:
+        print("correlated features (>0.90):")
+        for feature, correlated_with in high_corr_features:
+            print(f"  {feature} AND {correlated_with}")
+
+    # check for weak correlation with target --> worsts have the most impact
+    target_corr = df.corr()['Diagnosis'].abs().sort_values(ascending=False)
+    print("\nCorrelation with target variable (descending order):")
+    print(target_corr)
+    print("") # blank line separator
+
     rng = np.random.default_rng(seed=42)
-    n_samples = len(df)
-    indices = rng.permutation(n_samples)
-    train_size = int(0.8 * n_samples)
+    n_train = len(df)
+    indices = rng.permutation(n_train)
+    train_size = int(0.8 * n_train)
     train_idx = indices[:train_size]
     test_idx = indices[train_size:]
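
Note for reviewers: below is a minimal, self-contained sketch of the two pieces this patch adds to the mini-batch SGD script, the upper-triangle correlation screen and the permutation-based 80/20 split. It runs on a small synthetic frame rather than wdbc.data, so the column names (radius, perimeter, texture), the 200-row toy data, and the Diagnosis construction are assumptions for illustration only; the 0.90 threshold and the rng/permutation pattern follow the patch.

# Illustrative only -- synthetic stand-in for the WDBC frame used by the patch.
import numpy as np
import pandas as pd

rng = np.random.default_rng(seed=42)

# toy data: 'perimeter' is almost a linear function of 'radius', so that pair
# should be flagged by the same |corr| > 0.90 screen the patch uses
n = 200
radius = rng.normal(14.0, 3.0, n)
perimeter = 6.28 * radius + rng.normal(0.0, 0.5, n)
texture = rng.normal(19.0, 4.0, n)
diagnosis = (radius + rng.normal(0.0, 2.0, n) > 14.0).astype(int)
df = pd.DataFrame({"Diagnosis": diagnosis, "radius": radius,
                   "perimeter": perimeter, "texture": texture})

# upper triangle of the absolute correlation matrix (k=1 drops the diagonal)
corr_matrix = df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# report feature pairs whose correlation exceeds 0.90
for col in upper.columns:
    high_corr = upper[col][upper[col] > 0.90]
    if not high_corr.empty:
        print(f"  {col} AND {high_corr.index.tolist()}")

# correlation of every column with the target, strongest first
print(df.corr()["Diagnosis"].abs().sort_values(ascending=False))

# permutation-based 80/20 split, same rng/indices pattern as the patch
indices = rng.permutation(len(df))
train_size = int(0.8 * len(df))
train_idx, test_idx = indices[:train_size], indices[train_size:]
print(f"train rows: {len(train_idx)}, test rows: {len(test_idx)}")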