removed unnecessary comments
This commit is contained in:
parent
193dcabbff
commit
91e98ba8bc
4 changed files with 0 additions and 117 deletions
|
|
@ -169,31 +169,6 @@ if __name__ == "__main__":
|
||||||
print("\nCorrelation with target variable descending order:")
|
print("\nCorrelation with target variable descending order:")
|
||||||
print(target_corr)
|
print(target_corr)
|
||||||
|
|
||||||
'''
|
|
||||||
# repeated fields --> removed for now since they might not be too relevant (needs testing to decide whether we keep them later)
|
|
||||||
Parkinson = Parkinson.drop(Parkinson.columns[0:3], axis=1)
|
|
||||||
|
|
||||||
# ____________________________________________________________________________________
|
|
||||||
# HANDLE OUTLIERS AND INCONSISTENCIES
|
|
||||||
# https://medium.com/@heyamit10/pandas-outlier-detection-techniques-e9afece3d9e3
|
|
||||||
# if z-score is more than 3 --> outlier
|
|
||||||
# print(Parkinson.head().to_string())
|
|
||||||
|
|
||||||
# ____________________________________________________________________________________
|
|
||||||
|
|
||||||
# normalize / scale features? if not already done
|
|
||||||
# !!!!!!!!!!only for X not y!!!!!!!!!!!
|
|
||||||
# normalize = Parkinson.drop(Parkinson.columns[0:6], axis=1)
|
|
||||||
# normalize = (normalize - normalize.mean()) / normalize.std()
|
|
||||||
# Parkinson[Parkinson.columns[6:]] = normalize
|
|
||||||
|
|
||||||
# turn into array for regression
|
|
||||||
x = x.to_numpy()
|
|
||||||
y = y.to_numpy()
|
|
||||||
|
|
||||||
# split data into train (80%) / test (20%) datasets
|
|
||||||
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
|
|
||||||
'''
|
|
||||||
for col in df:
|
for col in df:
|
||||||
df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values
|
df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -200,10 +200,6 @@ class LogisticRegression:
|
||||||
"""
|
"""
|
||||||
if isinstance(x, pd.DataFrame):
|
if isinstance(x, pd.DataFrame):
|
||||||
x = x.values
|
x = x.values
|
||||||
|
|
||||||
if self.w is None:
|
|
||||||
raise ValueError("Model not fitted yet")
|
|
||||||
|
|
||||||
# Add bias term if needed
|
# Add bias term if needed
|
||||||
if x.shape[1] == len(self.w) - 1:
|
if x.shape[1] == len(self.w) - 1:
|
||||||
x = np.column_stack([np.ones(x.shape[0]), x])
|
x = np.column_stack([np.ones(x.shape[0]), x])
|
||||||
|
|
@ -250,37 +246,7 @@ if __name__ == "__main__":
|
||||||
df = df.drop_duplicates()
|
df = df.drop_duplicates()
|
||||||
# check data types: --> everything is good
|
# check data types: --> everything is good
|
||||||
# print(df.dtypes)
|
# print(df.dtypes)
|
||||||
'''
|
|
||||||
# ____________________________________________________________________________________
|
|
||||||
# HANDLE OUTLIERS AND INCONSISTENCIES
|
|
||||||
# https://medium.com/@heyamit10/pandas-outlier-detection-techniques-e9afece3d9e3
|
|
||||||
# if z-score is more than 3 --> outlier
|
|
||||||
# print(cancer.head().to_string())
|
|
||||||
|
|
||||||
# ____________________________________________________________________________________
|
|
||||||
|
|
||||||
# separate dependent VS independent variables
|
|
||||||
x = cancer.drop(cancer.columns[0], axis=1)
|
|
||||||
y = cancer[1]
|
|
||||||
|
|
||||||
# print(x.head().to_string())
|
|
||||||
|
|
||||||
# normalize data
|
|
||||||
# normalize = cancer.drop(cancer.columns[0], axis=1)
|
|
||||||
# normalize = (normalize - normalize.mean()) / normalize.std()
|
|
||||||
# cancer[cancer.columns[1:]] = normalize
|
|
||||||
# print(cancer.head().to_string())
|
|
||||||
|
|
||||||
# turn into array for regression
|
|
||||||
x = x.to_numpy()
|
|
||||||
y = y.to_numpy()
|
|
||||||
|
|
||||||
# cancer_y = np.asarray(cancer2[0].tolist())
|
|
||||||
# cancer2.drop(cancer2[0], axis = 1, inplace = True)
|
|
||||||
|
|
||||||
# split data into train / test datasets
|
|
||||||
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
|
|
||||||
'''
|
|
||||||
missing_rows = df[df.isin(['?', 'NA', 'na', '']).any(axis=1)] # checks null values
|
missing_rows = df[df.isin(['?', 'NA', 'na', '']).any(axis=1)] # checks null values
|
||||||
print(f"Rows with null values: {len(missing_rows)}")
|
print(f"Rows with null values: {len(missing_rows)}")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -171,31 +171,7 @@ if __name__ == "__main__":
|
||||||
print("\nCorrelation with target variable descending order:")
|
print("\nCorrelation with target variable descending order:")
|
||||||
print(target_corr)
|
print(target_corr)
|
||||||
|
|
||||||
'''
|
|
||||||
# repeated fields --> removed for now since they might not be too relevant (needs testing to decide whether we keep them later)
|
|
||||||
Parkinson = Parkinson.drop(Parkinson.columns[0:3], axis=1)
|
|
||||||
|
|
||||||
# ____________________________________________________________________________________
|
|
||||||
# HANDLE OUTLIERS AND INCONSISTENCIES
|
|
||||||
# https://medium.com/@heyamit10/pandas-outlier-detection-techniques-e9afece3d9e3
|
|
||||||
# if z-score is more than 3 --> outlier
|
|
||||||
# print(Parkinson.head().to_string())
|
|
||||||
|
|
||||||
# ____________________________________________________________________________________
|
|
||||||
|
|
||||||
# normalize / scale features? if not already done
|
|
||||||
# !!!!!!!!!!only for X not y!!!!!!!!!!!
|
|
||||||
# normalize = Parkinson.drop(Parkinson.columns[0:6], axis=1)
|
|
||||||
# normalize = (normalize - normalize.mean()) / normalize.std()
|
|
||||||
# Parkinson[Parkinson.columns[6:]] = normalize
|
|
||||||
|
|
||||||
# turn into array for regression
|
|
||||||
x = x.to_numpy()
|
|
||||||
y = y.to_numpy()
|
|
||||||
|
|
||||||
# split data into train (80%) / test (20%) datasets
|
|
||||||
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
|
|
||||||
'''
|
|
||||||
for col in df:
|
for col in df:
|
||||||
df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values
|
df[col] = pd.to_numeric(df[col], errors='coerce') # convert columns to numeric values
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -219,9 +219,6 @@ class LogisticRegression:
|
||||||
if isinstance(x, pd.DataFrame):
|
if isinstance(x, pd.DataFrame):
|
||||||
x = x.values
|
x = x.values
|
||||||
|
|
||||||
if self.w is None:
|
|
||||||
raise ValueError("Model not fitted yet")
|
|
||||||
|
|
||||||
# Add bias term if needed
|
# Add bias term if needed
|
||||||
if x.shape[1] == len(self.w) - 1:
|
if x.shape[1] == len(self.w) - 1:
|
||||||
x = np.column_stack([np.ones(x.shape[0]), x])
|
x = np.column_stack([np.ones(x.shape[0]), x])
|
||||||
|
|
@ -259,37 +256,6 @@ if __name__ == "__main__":
|
||||||
# check data types: --> everything is good
|
# check data types: --> everything is good
|
||||||
# print(df.dtypes)
|
# print(df.dtypes)
|
||||||
|
|
||||||
'''
|
|
||||||
# ____________________________________________________________________________________
|
|
||||||
# HANDLE OUTLIERS AND INCONSISTENCIES
|
|
||||||
# https://medium.com/@heyamit10/pandas-outlier-detection-techniques-e9afece3d9e3
|
|
||||||
# if z-score is more than 3 --> outlier
|
|
||||||
# print(cancer.head().to_string())
|
|
||||||
|
|
||||||
# ____________________________________________________________________________________
|
|
||||||
|
|
||||||
# separate dependent VS independent variables
|
|
||||||
x = cancer.drop(cancer.columns[0], axis=1)
|
|
||||||
y = cancer[1]
|
|
||||||
|
|
||||||
# print(X.head().to_string())
|
|
||||||
|
|
||||||
# normalize data
|
|
||||||
# normalize = cancer.drop(cancer.columns[0], axis=1)
|
|
||||||
# normalize = (normalize - normalize.mean()) / normalize.std()
|
|
||||||
# cancer[cancer.columns[1:]] = normalize
|
|
||||||
# print(cancer.head().to_string())
|
|
||||||
|
|
||||||
# turn into array for regression
|
|
||||||
x = x.to_numpy()
|
|
||||||
y = y.to_numpy()
|
|
||||||
|
|
||||||
# cancer_y = np.asarray(cancer2[0].tolist())
|
|
||||||
# cancer2.drop(cancer2[0], axis = 1, inplace = True)
|
|
||||||
|
|
||||||
# split data into train / test datasets
|
|
||||||
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
|
|
||||||
'''
|
|
||||||
missing_rows = df[df.isin(['?', 'NA', 'na', '']).any(axis=1)] # checks null values
|
missing_rows = df[df.isin(['?', 'NA', 'na', '']).any(axis=1)] # checks null values
|
||||||
print(f"Rows with null values: {len(missing_rows)}")
|
print(f"Rows with null values: {len(missing_rows)}")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue