diff --git a/linear-regression-parkinsons.py b/linear-regression-parkinsons.py
index 536eb1c..bc9a196 100644
--- a/linear-regression-parkinsons.py
+++ b/linear-regression-parkinsons.py
@@ -129,6 +129,17 @@ if __name__ == "__main__":
     df.dropna(inplace=True) # remove null values
     print(f"Rows remaining after drop of the null values: {len(df)}")
 
+    # sanity checks for data validity: drop rows outside plausible ranges.
+    # UPDRS upper bounds follow the scale definition (motor 0-108, total 0-176)
+    # so that valid high-severity scores are not discarded.
+    df = df[(df['age'] >= 18) & (df['age'] <= 95)]
+    df = df[(df['motor_UPDRS'] >= 0) & (df['motor_UPDRS'] <= 108)]
+    df = df[(df['total_UPDRS'] >= 0) & (df['total_UPDRS'] <= 176)]
+    df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)]
+    df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)]
+
+    print(f"Rows after sanity checks: {len(df)}")
+
     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."
 
diff --git a/logistic-regression-wdbc.py b/logistic-regression-wdbc.py
index d6ad1e6..b0ec4e7 100644
--- a/logistic-regression-wdbc.py
+++ b/logistic-regression-wdbc.py
@@ -43,6 +43,24 @@ if __name__ == "__main__":
     for col in num_cols:
         df = df[df[col] >= 0]
 
+    df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0}) # making diagnosis numeric
+    df['Diagnosis'] = df['Diagnosis'].astype('category')
+
+    # sanity checks for data validity: drop rows outside plausible ranges.
+    # Upper bounds sit above the maxima documented for WDBC (radius_worst 36.04,
+    # perimeter_worst 251.2, area_mean 2501, area_worst 4254); tighter limits
+    # such as area <= 600 would silently discard valid rows.
+    df = df[(df['radius_mean'] > 0) & (df['radius_mean'] <= 50)]
+    df = df[(df['radius_worst'] > 0) & (df['radius_worst'] <= 50)]
+    df = df[(df['texture_mean'] >= 0) & (df['texture_mean'] <= 100)]
+    df = df[(df['texture_worst'] >= 0) & (df['texture_worst'] <= 100)]
+    df = df[(df['perimeter_mean'] > 0) & (df['perimeter_mean'] <= 300)]
+    df = df[(df['perimeter_worst'] > 0) & (df['perimeter_worst'] <= 300)]
+    df = df[(df['area_mean'] > 0) & (df['area_mean'] <= 5000)]
+    df = df[(df['area_worst'] > 0) & (df['area_worst'] <= 5000)]
+
+    print(f"Rows after sanity checks: {len(df)}")
+
     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."
 
diff --git a/mini-batch-sgd-linear-regression-parkinsons.py b/mini-batch-sgd-linear-regression-parkinsons.py
index e1f3adf..f46f8df 100644
--- a/mini-batch-sgd-linear-regression-parkinsons.py
+++ b/mini-batch-sgd-linear-regression-parkinsons.py
@@ -135,6 +135,17 @@ if __name__ == "__main__":
     df.dropna(inplace=True) # remove null values
     print(f"Rows remaining after drop of the null values: {len(df)}")
 
+    # sanity checks for data validity: drop rows outside plausible ranges.
+    # UPDRS upper bounds follow the scale definition (motor 0-108, total 0-176)
+    # so that valid high-severity scores are not discarded.
+    df = df[(df['age'] >= 18) & (df['age'] <= 95)]
+    df = df[(df['motor_UPDRS'] >= 0) & (df['motor_UPDRS'] <= 108)]
+    df = df[(df['total_UPDRS'] >= 0) & (df['total_UPDRS'] <= 176)]
+    df = df[(df['Jitter(%)'] >= 0) & (df['Jitter(%)'] <= 10)]
+    df = df[(df['Shimmer(dB)'] >= 0) & (df['Shimmer(dB)'] <= 10)]
+
+    print(f"Rows after sanity checks: {len(df)}")
+
     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."
 
diff --git a/mini-batch-sgd-logistic-regression-wdbc.py b/mini-batch-sgd-logistic-regression-wdbc.py
index d6ad1e6..b0ec4e7 100644
--- a/mini-batch-sgd-logistic-regression-wdbc.py
+++ b/mini-batch-sgd-logistic-regression-wdbc.py
@@ -43,6 +43,24 @@ if __name__ == "__main__":
     for col in num_cols:
         df = df[df[col] >= 0]
 
+    df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0}) # making diagnosis numeric
+    df['Diagnosis'] = df['Diagnosis'].astype('category')
+
+    # sanity checks for data validity: drop rows outside plausible ranges.
+    # Upper bounds sit above the maxima documented for WDBC (radius_worst 36.04,
+    # perimeter_worst 251.2, area_mean 2501, area_worst 4254); tighter limits
+    # such as area <= 600 would silently discard valid rows.
+    df = df[(df['radius_mean'] > 0) & (df['radius_mean'] <= 50)]
+    df = df[(df['radius_worst'] > 0) & (df['radius_worst'] <= 50)]
+    df = df[(df['texture_mean'] >= 0) & (df['texture_mean'] <= 100)]
+    df = df[(df['texture_worst'] >= 0) & (df['texture_worst'] <= 100)]
+    df = df[(df['perimeter_mean'] > 0) & (df['perimeter_mean'] <= 300)]
+    df = df[(df['perimeter_worst'] > 0) & (df['perimeter_worst'] <= 300)]
+    df = df[(df['area_mean'] > 0) & (df['area_mean'] <= 5000)]
+    df = df[(df['area_worst'] > 0) & (df['area_worst'] <= 5000)]
+
+    print(f"Rows after sanity checks: {len(df)}")
+
     # check if there are still null values
     assert df.isna().sum().sum() == 0, "There are still some null values."
 