From 7268194beabaed3cb7cb5668aaab828e89b38307 Mon Sep 17 00:00:00 2001
From: ShaaniBel
Date: Fri, 26 Sep 2025 17:14:56 -0400
Subject: [PATCH] .

---
 linear-regression-parkinsons.py | 49 ++++++++++------------------
 logistic-regression-wdbc.py     | 57 ++++++++++++++------------------
 2 files changed, 41 insertions(+), 65 deletions(-)

diff --git a/linear-regression-parkinsons.py b/linear-regression-parkinsons.py
index e95e600..4274e17 100644
--- a/linear-regression-parkinsons.py
+++ b/linear-regression-parkinsons.py
@@ -93,29 +93,21 @@ if __name__ == "__main__":
     df.replace(['?','NA', 'na', ''], pd.NA, inplace=True) # replace null values with NA identifier
 
-    #___________________________
-    '''
-    # missing_parkinson = Parkinson[Parkinson.eq('?').any(axis=1)]
-    # print(len(missing_parkinson))
-    # no missing values in our dataset but still in case:
-    Parkinson = Parkinson[~Parkinson.eq('?').any(axis=1)]
+    # check data types --> no problem
+    # print(df.dtypes)
 
     # duplicate rows???
-    # duplicates = Parkinson.duplicated().sum()
-    # print(duplicates)
+    duplicates = df.duplicated().sum()
+    print(f"Num of duplicated rows: {duplicates}")
     # no duplicates but just in case:
-    Parkinson = Parkinson.drop_duplicates()
-
-    # check data types --> no problem
-    # print(Parkinson.dtypes)
+    df = df.drop_duplicates()
 
     # check for highly correlated features --> ensure uniqueness of solution
     # find them then note for 3rd phase
-    '''
-    """
-    #https://www.projectpro.io/recipes/drop-out-highly-correlated-features-in-python
-    #0 indicates no correlation and 1 indicates perfect correlation
-    corr_matrix = Parkinson.corr().abs()
+
+    # Further experiments
+    # 0 indicates no correlation and 1 indicates perfect correlation
+    corr_matrix = df.corr().abs()
     upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
 
     # find features with correlation greater than 0.95
@@ -129,7 +121,12 @@ if __name__ == "__main__":
         print("correlated features (>0.95):")
         for feature, correlated_with in high_corr_features:
             print(f" {feature} AND {correlated_with}")
-    """
+
+    # check for weak correlation with target
+    target_corr = df.corr()['motor_UPDRS'].abs().sort_values(ascending=False)
+    print("Correlation with target variable (descending order):")
+    print(target_corr)
+
     '''
     # repeated fields --> for now I removed them since they might not be too relevant (need testing to see if we keep them later)
     Parkinson = Parkinson.drop(Parkinson.columns[0:3], axis=1)
@@ -142,12 +139,6 @@ if __name__ == "__main__":
 
     # ____________________________________________________________________________________
 
-    # Prepare Data for regression
-    # separate dependent VS independent variables
-    feature_columns = [col for col in Parkinson.columns if col not in ['motor_UPDRS', 'total_UPDRS', 'subject#']]
-    X = Parkinson[feature_columns]
-    y = Parkinson['motor_UPDRS']
-
     # normalize / scale features? if not already done
     # !!!!!!!!!!only for X not y!!!!!!!!!!!
     # normalize = Parkinson.drop(Parkinson.columns[0:6], axis=1)
@@ -161,14 +152,7 @@ if __name__ == "__main__":
     # split data into train 80% / tests datasets 20%
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
     '''
-    num_cols = [
-        'age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS',
-        'Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP',
-        'Shimmer', 'Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
-        'Shimmer:APQ11', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE'
-    ]
-
-    for col in num_cols:
+    for col in df:
         df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values
 
     df.dropna(inplace=True) # remove null values
@@ -191,6 +175,7 @@ if __name__ == "__main__":
     x = df[feature_columns]
     y = df['motor_UPDRS']
 
+    # train / test splitting (80 / 20)
    n_train = int(0.8 * len(x))
    x_train, x_test = x.iloc[:n_train], x.iloc[n_train:]
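Both scripts now run the same upper-triangle correlation screen, so the step is worth reading once in isolation. Below is a minimal standalone sketch of that screen, assuming a numeric pandas DataFrame; the helper name find_correlated_pairs and the toy frame are illustrative, not part of the patch.

import numpy as np
import pandas as pd

def find_correlated_pairs(df, threshold=0.95):
    """Return (column, correlated-with) pairs whose |corr| exceeds threshold."""
    corr_matrix = df.corr().abs()
    # keep only the upper triangle so each pair is reported once
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    pairs = []
    for col in upper.columns:
        high = upper[col][upper[col] > threshold]
        if not high.empty:
            pairs.append((col, high.index.tolist()))
    return pairs

# toy usage: 'b' is an exact multiple of 'a', so the pair is flagged
demo = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [2.0, 4.0, 6.0], 'c': [1.0, 0.0, 5.0]})
print(find_correlated_pairs(demo))  # [('b', ['a'])]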
diff --git a/logistic-regression-wdbc.py b/logistic-regression-wdbc.py
index 46dd311..bde2cf9 100644
--- a/logistic-regression-wdbc.py
+++ b/logistic-regression-wdbc.py
@@ -124,44 +124,14 @@ if __name__ == "__main__":
 
     df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
 
-    df.drop(columns=['ID'], inplace=True) # drops id column
-    '''
-    # load data set into pandas objects --> easier to clean
-    url = 'https://raw.githubusercontent.com/ShaaniBel/datasets/refs/heads/main/wdbc.data'
-    cancer = pd.read_csv(url, header=None)
-    # ID should be dropped --> remove 1st row
-    cancer = cancer.drop(cancer.columns[0], axis=1)
-
-    # need to encode the B/M into 0/1
-    cancer[cancer.columns[0]] = cancer[cancer.columns[0]].map({'B': 0, 'M': 1})
-
-    # no missing values in our dataset but still in case:
-    cancer = cancer[~cancer.eq('?').any(axis=1)]
+    df.drop(columns=['ID'], inplace=True) # drops id column
 
     # no duplicate rows but just in case:
-    cancer = cancer.drop_duplicates()
-
+    df = df.drop_duplicates()
 
     # check data types: --> everything is good
-    # print(cancer.dtypes)
-    '''
-    # check for highly correlated features and write them down
-    '''
-    corr_matrix = cancer.corr().abs()
-    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+    # print(df.dtypes)
 
-    # find features with correlation greater than 0.95
-    high_corr_features = []
-    for col in upper.columns:
-        high_corr = upper[col][upper[col] > 0.95]
-        if not high_corr.empty:
-            high_corr_features.append((col, high_corr.index.tolist()))
-
-    if high_corr_features:
-        print("correlated features (>0.95):")
-        for feature, correlated_with in high_corr_features:
-            print(f" {feature} AND {correlated_with}")
-    '''
     '''
     # ____________________________________________________________________________________
     # HANDLE OUTLIERS AND INCONSISTENCIES
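Because wdbc.data is loaded with dtype=str, the dtype check above will keep showing object columns until the features are coerced to numeric. A short sketch of that coercion, assuming Diagnosis is the only non-numeric column; the toy column names are illustrative, not the full WDBC schema.

import pandas as pd

# toy stand-in for the string-typed frame produced by read_csv(..., dtype=str)
df = pd.DataFrame({
    'Diagnosis': ['M', 'B', 'B'],
    'radius_mean': ['17.99', '13.54', '12.45'],
    'texture_mean': ['10.38', '14.36', '15.70'],
})

# coerce every feature column to numeric; unparseable cells become NaN and are dropped
feature_cols = [c for c in df.columns if c != 'Diagnosis']
df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors='coerce')
df.dropna(inplace=True)

print(df.dtypes)  # feature columns are now float64; Diagnosis stays object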
@@ -223,6 +193,27 @@ if __name__ == "__main__":
     # making diagnosis numeric
     df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")
 
+    # check for correlation: radius, area and perimeter trivially have a high correlation
+    corr_matrix = df.corr().abs()
+    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+
+    # find features with correlation greater than 0.90
+    high_corr_features = []
+    for col in upper.columns:
+        high_corr = upper[col][upper[col] > 0.90]
+        if not high_corr.empty:
+            high_corr_features.append((col, high_corr.index.tolist()))
+
+    if high_corr_features:
+        print("correlated features (>0.90):")
+        for feature, correlated_with in high_corr_features:
+            print(f" {feature} AND {correlated_with}")
+
+    # check for weak correlation with target --> the "worst" features have the most impact
+    target_corr = df.corr()['Diagnosis'].abs().sort_values(ascending=False)
+    print("Correlation with target variable (descending order):")
+    print(target_corr)
+
     rng = np.random.default_rng(seed=42)
     n_train = len(df)
     indices = rng.permutation(n_train)
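The wdbc hunk ends just as the shuffled split begins (note the patch reuses the name n_train for the full row count before permuting). A sketch of how such permuted indices are typically consumed for an 80/20 split, assuming df is the cleaned frame; names beyond those in the patch are illustrative.

import numpy as np
import pandas as pd

# toy stand-in for the cleaned WDBC frame
df = pd.DataFrame({'Diagnosis': [1, 0, 1, 0, 1],
                   'radius_mean': [17.9, 13.5, 12.4, 11.2, 20.1]})

rng = np.random.default_rng(seed=42)
indices = rng.permutation(len(df))   # shuffled row positions

split = int(0.8 * len(df))           # 80/20 boundary
train_df = df.iloc[indices[:split]]  # first 80% of the shuffled rows
test_df = df.iloc[indices[split:]]   # remaining 20%

print(len(train_df), len(test_df))   # 4 1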