This commit is contained in:
parent f426cd914c
commit 7268194bea

2 changed files with 41 additions and 65 deletions
@@ -93,29 +93,21 @@ if __name__ == "__main__":
 
     df.replace(['?', 'NA', 'na', ''], pd.NA, inplace=True)  # replace null values with the NA identifier
 
-    # ___________________________
-    '''
-    # missing_parkinson = Parkinson[Parkinson.eq('?').any(axis=1)]
-    # print(len(missing_parkinson))
-    # no missing values in our dataset but just in case:
-    Parkinson = Parkinson[~Parkinson.eq('?').any(axis=1)]
+    # check data types --> no problem
+    # print(df.dtypes)
 
     # duplicate rows???
-    # duplicates = Parkinson.duplicated().sum()
-    # print(duplicates)
+    duplicates = df.duplicated().sum()
+    print("Num of duplicated rows:", duplicates)
     # no duplicates but just in case:
-    Parkinson = Parkinson.drop_duplicates()
+    df = df.drop_duplicates()
 
-    # check data types --> no problem
-    # print(Parkinson.dtypes)
-
     # check for highly correlated features --> ensure uniqueness of solution
     # find them, then note them for the 3rd phase
-    '''
-    """
-    # https://www.projectpro.io/recipes/drop-out-highly-correlated-features-in-python
-    # 0 indicates no correlation and 1 indicates perfect correlation
-    corr_matrix = Parkinson.corr().abs()
+    # Further experiments
+    # 0 indicates no correlation and 1 indicates perfect correlation
+    corr_matrix = df.corr().abs()
     upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
 
     # find features with correlation greater than 0.95
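The upper-triangle mask above is the standard trick for inspecting each feature pair exactly once. A minimal, self-contained sketch of how it behaves (toy column names, not from the commit):

    import numpy as np
    import pandas as pd

    toy = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 4, 6, 8], 'c': [4, 1, 3, 2]})
    corr_matrix = toy.corr().abs()
    # k=1 blanks the diagonal and lower triangle, so each pair appears once
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if (upper[col] > 0.95).any()]
    print(to_drop)  # ['b'] -- b is an exact multiple of a, so |corr| == 1.0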
@@ -129,7 +121,12 @@ if __name__ == "__main__":
     print("correlated features (>0.95):")
     for feature, correlated_with in high_corr_features:
         print(f"  {feature} AND {correlated_with}")
-    """
+
+    # check for weak correlation with target
+    target_corr = df.corr()['motor_UPDRS'].abs().sort_values(ascending=False)
+    print("Correlation with target variable in descending order:")
+    print(target_corr)
 
     '''
     # repeated fields --> for now I removed them since they might not be too relevant (need testing to decide whether to keep them later)
     Parkinson = Parkinson.drop(Parkinson.columns[0:3], axis=1)
 
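If the weak-correlation check is meant to feed the phase-3 feature pruning, a hedged follow-up could drop everything below a floor. The 0.10 cutoff here is an illustrative assumption, not part of the commit:

    weak = target_corr[target_corr < 0.10].index.tolist()
    print("Candidates to drop before phase 3:", weak)
    df = df.drop(columns=weak)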
@@ -142,12 +139,6 @@ if __name__ == "__main__":
 
     # ____________________________________________________________________________________
 
-    # Prepare data for regression
-    # separate dependent VS independent variables
-    feature_columns = [col for col in Parkinson.columns if col not in ['motor_UPDRS', 'total_UPDRS', 'subject#']]
-    X = Parkinson[feature_columns]
-    y = Parkinson['motor_UPDRS']
-
     # normalize / scale features? (if not already done)
     # !!!!!!!!!! only for X, not y !!!!!!!!!!
     # normalize = Parkinson.drop(Parkinson.columns[0:6], axis=1)
 
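A sketch of the "only for X, not y" note above, using scikit-learn's StandardScaler (an assumption; the commit itself does not import scikit-learn). The scaler is fit on the training features only, then the same statistics are reused on the test features (x_train / x_test as produced by the split further down this file):

    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)  # learn mean/std from train only
    x_test_scaled = scaler.transform(x_test)        # apply the same statistics to test
    # y (motor_UPDRS) keeps its original units so predictions stay interpretable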
@@ -161,14 +152,7 @@ if __name__ == "__main__":
     # split data into train (80%) / test (20%) datasets
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
     '''
-    num_cols = [
-        'age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS',
-        'Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP',
-        'Shimmer', 'Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
-        'Shimmer:APQ11', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE'
-    ]
-
-    for col in num_cols:
+    for col in df:
         df[col] = pd.to_numeric(df[col], errors='coerce')  # convert columns to numeric values
 
     df.dropna(inplace=True)  # remove null values
 
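Why the new loop works: iterating over a DataFrame yields its column names, so the hard-coded num_cols list becomes redundant, and errors='coerce' turns any non-numeric entry into NaN so the following dropna() can remove it. A toy illustration (values invented):

    import pandas as pd

    toy = pd.DataFrame({'age': ['62', '?', '71'], 'HNR': ['21.6', '19.0', 'na']})
    for col in toy:
        toy[col] = pd.to_numeric(toy[col], errors='coerce')  # '?' and 'na' become NaN
    toy.dropna(inplace=True)
    print(toy)  # only the first row survives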
@@ -191,6 +175,7 @@ if __name__ == "__main__":
     x = df[feature_columns]
     y = df['motor_UPDRS']
 
+
     # train / test splitting (80 / 20)
     n_train = int(0.8 * len(x))
     x_train, x_test = x.iloc[:n_train], x.iloc[n_train:]
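Note that the iloc split keeps the file's row order, so if the rows are grouped by subject the test set contains recordings from subjects the model never saw in training, which may or may not be intended. If a shuffled split is wanted instead, a sketch in the same idiom as the second file's rng.permutation (an illustration, not the commit's code):

    rng = np.random.default_rng(seed=42)
    order = rng.permutation(len(x))       # shuffle row positions reproducibly
    cut = int(0.8 * len(x))
    x_train, x_test = x.iloc[order[:cut]], x.iloc[order[cut:]]
    y_train, y_test = y.iloc[order[:cut]], y.iloc[order[cut:]]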
@@ -124,44 +124,14 @@ if __name__ == "__main__":
 
     df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
 
-    df.drop(columns=['ID'], inplace=True)  # drops the ID column
-    '''
-    # load data set into pandas objects --> easier to clean
-    url = 'https://raw.githubusercontent.com/ShaaniBel/datasets/refs/heads/main/wdbc.data'
-    cancer = pd.read_csv(url, header=None)
 
     # ID should be dropped --> remove the 1st column
-    cancer = cancer.drop(cancer.columns[0], axis=1)
+    df.drop(columns=['ID'], inplace=True)  # drops the ID column
 
-    # need to encode the B/M into 0/1
-    cancer[cancer.columns[0]] = cancer[cancer.columns[0]].map({'B': 0, 'M': 1})
-
-    # no missing values in our dataset but just in case:
-    cancer = cancer[~cancer.eq('?').any(axis=1)]
-
     # no duplicate rows but just in case:
-    cancer = cancer.drop_duplicates()
+    df = df.drop_duplicates()
 
     # check data types: --> everything is good
-    # print(cancer.dtypes)
-    '''
+    # print(df.dtypes)
-    # check for highly correlated features and write them down
-    '''
-    corr_matrix = cancer.corr().abs()
-    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
-
-    # find features with correlation greater than 0.95
-    high_corr_features = []
-    for col in upper.columns:
-        high_corr = upper[col][upper[col] > 0.95]
-        if not high_corr.empty:
-            high_corr_features.append((col, high_corr.index.tolist()))
-
-    if high_corr_features:
-        print("correlated features (>0.95):")
-        for feature, correlated_with in high_corr_features:
-            print(f"  {feature} AND {correlated_with}")
-    '''
 
     '''
     # ____________________________________________________________________________________
     # HANDLE OUTLIERS AND INCONSISTENCIES
 
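The columns list passed to read_csv is defined outside this hunk. For WDBC it plausibly follows the standard 32-name layout (ID, Diagnosis, then ten measurements each as mean / SE / worst); the names below are an assumption for illustration, not the commit's code:

    # hypothetical reconstruction of the WDBC header, not from the commit
    base = ['radius', 'texture', 'perimeter', 'area', 'smoothness', 'compactness',
            'concavity', 'concave_points', 'symmetry', 'fractal_dimension']
    columns = ['ID', 'Diagnosis'] + [f'{b}_{s}' for s in ('mean', 'se', 'worst') for b in base]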
@@ -223,6 +193,27 @@ if __name__ == "__main__":
     # making diagnosis numeric
     df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("category")
 
+    # check for correlation: radius, area and perimeter trivially have a high correlation
+    corr_matrix = df.corr().abs()
+    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+
+    # find features with correlation greater than 0.90
+    high_corr_features = []
+    for col in upper.columns:
+        high_corr = upper[col][upper[col] > 0.90]
+        if not high_corr.empty:
+            high_corr_features.append((col, high_corr.index.tolist()))
+
+    if high_corr_features:
+        print("correlated features (>0.90):")
+        for feature, correlated_with in high_corr_features:
+            print(f"  {feature} AND {correlated_with}")
+
+    # check for weak correlation with target --> the "worst" features have the most impact
+    target_corr = df.corr()['Diagnosis'].abs().sort_values(ascending=False)
+    print("Correlation with target variable in descending order:")
+    print(target_corr)
+
     rng = np.random.default_rng(seed=42)
     n_train = len(df)
     indices = rng.permutation(n_train)
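One caveat with the new correlation block: after astype("category"), recent pandas versions exclude Diagnosis from df.corr(), so df.corr()['Diagnosis'] can raise a KeyError. Keeping the label as a small integer (a suggested tweak, not part of the commit) sidesteps this while staying 0/1-encoded:

    # suggested alternative, not the commit's code
    df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0}).astype("int8")
    target_corr = df.corr()['Diagnosis'].abs().sort_values(ascending=False)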