Updated the data cleaning.
This commit is contained in:
parent
7734802cd1
commit
8a01e1bef6
4 changed files with 44 additions and 0 deletions
|
|
@ -129,6 +129,15 @@ if __name__ == "__main__":
|
||||||
df.dropna(inplace=True) # remove null values
|
df.dropna(inplace=True) # remove null values
|
||||||
print(f"Rows remaining after drop of the null values: {len(df)}")
|
print(f"Rows remaining after drop of the null values: {len(df)}")
|
||||||
|
|
||||||
|
# sanity checks for data validity
|
||||||
|
df = df[(df['age'] >= 18) & (df['age'] <= 95)]
|
||||||
|
df = df[(df['motor_UPDRS'] >= 0) & (df['motor_UPDRS'] <= 100)]
|
||||||
|
df = df[(df['total_UPDRS'] >= 0) & (df['total_UPDRS'] <= 100)]
|
||||||
|
df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)]
|
||||||
|
df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)]
|
||||||
|
|
||||||
|
print(f"Rows after sanity checks: {len(df)}")
|
||||||
|
|
||||||
# check if there are still null values
|
# check if there are still null values
|
||||||
assert df.isna().sum().sum() == 0, "There are still some null values."
|
assert df.isna().sum().sum() == 0, "There are still some null values."
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,19 @@ if __name__ == "__main__":
|
||||||
for col in num_cols:
|
for col in num_cols:
|
||||||
df = df[df[col] >= 0]
|
df = df[df[col] >= 0]
|
||||||
|
|
||||||
|
df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0}) # making diagnosis numeric
|
||||||
|
df['Diagnosis'] = df['Diagnosis'].astype('category')
|
||||||
|
|
||||||
|
# sanity checks for data validity
|
||||||
|
df = df[(df['radius_mean'] > 0) & (df['radius_mean'] <= 30)]
|
||||||
|
df = df[(df['radius_worst'] > 0) & (df['radius_worst'] <= 30)]
|
||||||
|
df = df[(df['texture_mean'] >= 0) & (df['texture_mean'] <= 100)]
|
||||||
|
df = df[(df['texture_worst'] >= 0) & (df['texture_worst'] <= 100)]
|
||||||
|
df = df[(df['perimeter_mean'] > 0) & (df['perimeter_mean'] <= 200)]
|
||||||
|
df = df[(df['perimeter_worst'] > 0) & (df['perimeter_worst'] <= 200)]
|
||||||
|
df = df[(df['area_mean'] > 0) & (df['area_mean'] <= 600)]
|
||||||
|
df = df[(df['area_worst'] > 0) & (df['area_worst'] <= 600)]
|
||||||
|
|
||||||
# check if there are still null values
|
# check if there are still null values
|
||||||
assert df.isna().sum().sum() == 0, "There are still some null values."
|
assert df.isna().sum().sum() == 0, "There are still some null values."
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -135,6 +135,15 @@ if __name__ == "__main__":
|
||||||
df.dropna(inplace=True) # remove null values
|
df.dropna(inplace=True) # remove null values
|
||||||
print(f"Rows remaining after drop of the null values: {len(df)}")
|
print(f"Rows remaining after drop of the null values: {len(df)}")
|
||||||
|
|
||||||
|
# sanity checks for data validity
|
||||||
|
df = df[(df['age'] >= 18) & (df['age'] <= 95)]
|
||||||
|
df = df[(df['motor_UPDRS'] >= 0) & (df['motor_UPDRS'] <= 100)]
|
||||||
|
df = df[(df['total_UPDRS'] >= 0) & (df['total_UPDRS'] <= 100)]
|
||||||
|
df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)]
|
||||||
|
df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)]
|
||||||
|
|
||||||
|
print(f"Rows after sanity checks: {len(df)}")
|
||||||
|
|
||||||
# check if there are still null values
|
# check if there are still null values
|
||||||
assert df.isna().sum().sum() == 0, "There are still some null values."
|
assert df.isna().sum().sum() == 0, "There are still some null values."
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,19 @@ if __name__ == "__main__":
|
||||||
for col in num_cols:
|
for col in num_cols:
|
||||||
df = df[df[col] >= 0]
|
df = df[df[col] >= 0]
|
||||||
|
|
||||||
|
df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0}) # making diagnosis numeric
|
||||||
|
df['Diagnosis'] = df['Diagnosis'].astype('category')
|
||||||
|
|
||||||
|
# sanity checks for data validity
|
||||||
|
df = df[(df['radius_mean'] > 0) & (df['radius_mean'] <= 30)]
|
||||||
|
df = df[(df['radius_worst'] > 0) & (df['radius_worst'] <= 30)]
|
||||||
|
df = df[(df['texture_mean'] >= 0) & (df['texture_mean'] <= 100)]
|
||||||
|
df = df[(df['texture_worst'] >= 0) & (df['texture_worst'] <= 100)]
|
||||||
|
df = df[(df['perimeter_mean'] > 0) & (df['perimeter_mean'] <= 200)]
|
||||||
|
df = df[(df['perimeter_worst'] > 0) & (df['perimeter_worst'] <= 200)]
|
||||||
|
df = df[(df['area_mean'] > 0) & (df['area_mean'] <= 600)]
|
||||||
|
df = df[(df['area_worst'] > 0) & (df['area_worst'] <= 600)]
|
||||||
|
|
||||||
# check if there are still null values
|
# check if there are still null values
|
||||||
assert df.isna().sum().sum() == 0, "There are still some null values."
|
assert df.isna().sum().sum() == 0, "There are still some null values."
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue