.
This commit is contained in:
parent
bac9763b0a
commit
d516e979f9
2 changed files with 135 additions and 0 deletions
|
|
@ -93,6 +93,74 @@ if __name__ == "__main__":
|
|||
|
||||
df.replace(['?','NA', 'na', ''], pd.NA, inplace=True) # replace null values with NA identifier
|
||||
|
||||
#___________________________
|
||||
'''
|
||||
# missing_parkinson = Parkinson[Parkinson.eq('?').any(axis=1)]
|
||||
# print(len(missing_parkinson))
|
||||
# no missing values in our dataset but still in case:
|
||||
Parkinson = Parkinson[~Parkinson.eq('?').any(axis=1)]
|
||||
|
||||
# duplicate rows?
|
||||
# duplicates = Parkinson.duplicated().sum()
|
||||
# print(duplicates)
|
||||
# no duplicates but just in case:
|
||||
Parkinson = Parkinson.drop_duplicates()
|
||||
|
||||
# check data types --> no problem
|
||||
# print(Parkinson.dtypes)
|
||||
|
||||
# check for highly correlated features --> ensure uniqueness of solution
|
||||
# find them then note for 3rd phase
|
||||
'''
|
||||
"""
|
||||
#https://www.projectpro.io/recipes/drop-out-highly-correlated-features-in-python
|
||||
#0 indicates no correlation and 1 indicates perfect correlation
|
||||
corr_matrix = Parkinson.corr().abs()
|
||||
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
|
||||
|
||||
# find features with correlation greater than 0.95
|
||||
high_corr_features = []
|
||||
for col in upper.columns:
|
||||
high_corr = upper[col][upper[col] > 0.95]
|
||||
if not high_corr.empty:
|
||||
high_corr_features.append((col, high_corr.index.tolist()))
|
||||
|
||||
if high_corr_features:
|
||||
print("correlated features (>0.95):")
|
||||
for feature, correlated_with in high_corr_features:
|
||||
print(f" {feature} AND {correlated_with}")
|
||||
"""
|
||||
'''
|
||||
# repeated fields --> removed for now since they might not be too relevant (needs testing to decide whether we keep them later)
|
||||
Parkinson = Parkinson.drop(Parkinson.columns[0:3], axis=1)
|
||||
|
||||
# ____________________________________________________________________________________
|
||||
# HANDLE OUTLIERS AND INCONSISTENCIES
|
||||
# https://medium.com/@heyamit10/pandas-outlier-detection-techniques-e9afece3d9e3
|
||||
# if z-score more than 3 --> outlier
|
||||
# print(Parkinson.head().to_string())
|
||||
|
||||
# ____________________________________________________________________________________
|
||||
|
||||
# Prepare Data for regression
|
||||
# separate dependent VS independent variables
|
||||
feature_columns = [col for col in Parkinson.columns if col not in ['motor_UPDRS', 'total_UPDRS', 'subject#']]
|
||||
X = Parkinson[feature_columns]
|
||||
y = Parkinson['motor_UPDRS']
|
||||
|
||||
# normalize / scale features? if not already done
|
||||
# !!!!!!!!!!only for X not y!!!!!!!!!!!
|
||||
# normalize = Parkinson.drop(Parkinson.columns[0:6], axis=1)
|
||||
# normalize = (normalize - normalize.mean()) / normalize.std()
|
||||
# Parkinson[Parkinson.columns[6:]] = normalize
|
||||
|
||||
# turn into array for regression
|
||||
X = X.to_numpy()
|
||||
y = y.to_numpy()
|
||||
|
||||
# split data into train (80%) / test (20%) datasets
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
|
||||
'''
|
||||
num_cols = [
|
||||
'age', 'sex', 'test_time', 'motor_UPDRS', 'total_UPDRS',
|
||||
'Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP',
|
||||
|
|
|
|||
|
|
@ -125,7 +125,74 @@ if __name__ == "__main__":
|
|||
df = pd.read_csv('wdbc.data', header=None, names=columns, dtype=str)
|
||||
|
||||
df.drop(columns=['ID'], inplace=True) # drops id column
|
||||
'''
|
||||
# load data set into pandas objects --> easier to clean
|
||||
url = 'https://raw.githubusercontent.com/ShaaniBel/datasets/refs/heads/main/wdbc.data'
|
||||
cancer = pd.read_csv(url, header=None)
|
||||
|
||||
# ID should be dropped --> remove 1st column
|
||||
cancer = cancer.drop(cancer.columns[0], axis=1)
|
||||
|
||||
# need to encode the B/M into 0/1
|
||||
cancer[cancer.columns[0]] = cancer[cancer.columns[0]].map({'B': 0, 'M': 1})
|
||||
|
||||
# no missing values in our dataset but still in case:
|
||||
cancer = cancer[~cancer.eq('?').any(axis=1)]
|
||||
|
||||
# no duplicate rows but just in case:
|
||||
cancer = cancer.drop_duplicates()
|
||||
|
||||
# check data types: --> everything is good
|
||||
# print(cancer.dtypes)
|
||||
'''
|
||||
# check for highly correlated features and write them down
|
||||
'''
|
||||
corr_matrix = cancer.corr().abs()
|
||||
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
|
||||
|
||||
# find features with correlation greater than 0.95
|
||||
high_corr_features = []
|
||||
for col in upper.columns:
|
||||
high_corr = upper[col][upper[col] > 0.95]
|
||||
if not high_corr.empty:
|
||||
high_corr_features.append((col, high_corr.index.tolist()))
|
||||
|
||||
if high_corr_features:
|
||||
print("correlated features (>0.95):")
|
||||
for feature, correlated_with in high_corr_features:
|
||||
print(f" {feature} AND {correlated_with}")
|
||||
'''
|
||||
'''
|
||||
# ____________________________________________________________________________________
|
||||
# HANDLE OUTLIERS AND INCONSISTENCIES
|
||||
# https://medium.com/@heyamit10/pandas-outlier-detection-techniques-e9afece3d9e3
|
||||
# if z-score more than 3 --> outlier
|
||||
# print(cancer.head().to_string())
|
||||
|
||||
# ____________________________________________________________________________________
|
||||
|
||||
# separate dependent VS independent variables
|
||||
X = cancer.drop(cancer.columns[0], axis=1)
|
||||
y = cancer[1]
|
||||
|
||||
# print(X.head().to_string())
|
||||
|
||||
# normalize data
|
||||
# normalize = cancer.drop(cancer.columns[0], axis=1)
|
||||
# normalize = (normalize - normalize.mean()) / normalize.std()
|
||||
# cancer[cancer.columns[1:]] = normalize
|
||||
# print(cancer.head().to_string())
|
||||
|
||||
# turn into array for regression
|
||||
X = X.to_numpy()
|
||||
y = y.to_numpy()
|
||||
|
||||
# cancer_y = np.asarray(cancer2[0].tolist())
|
||||
# cancer2.drop(cancer2[0], axis = 1, inplace = True)
|
||||
|
||||
# split data into train / test datasets
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
|
||||
'''
|
||||
missing_rows = df[df.isin(['?', 'NA', 'na', '']).any(axis=1)] # checks null values
|
||||
print(f"Rows with null values: {len(missing_rows)}")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue