diff --git a/experiment-3-l2.py b/experiment-3-l2.py
deleted file mode 100644
index 7805470..0000000
--- a/experiment-3-l2.py
+++ /dev/null
@@ -1,187 +0,0 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from torchvision import datasets
-import os
-
-
-class MLP:
-    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l2):
-        self.l2 = l2
-
-        # initializes weights and biases for each layer
-        self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
-        self.b1 = np.zeros((1, hidden_size1))
-        self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale
-        self.b2 = np.zeros((1, hidden_size2))
-        self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale
-        self.b3 = np.zeros((1, output_size))
-
-    def forward(self, x):
-        # forwards pass through the network
-        self.x = x  # input for backpropagation
-        self.z1 = x @ self.W1 + self.b1  # linear transformation for layer 1
-        self.a1 = self.relu(self.z1)  # ReLU activation
-        self.z2 = self.a1 @ self.W2 + self.b2  # linear transformation for layer 2
-        self.a2 = self.relu(self.z2)  # ReLU activation
-        self.z3 = self.a2 @ self.W3 + self.b3  # linear transformation for layer 3
-        self.a3 = self.softmax(self.z3)  # applies softmax to get class probabilities
-        return self.a3  # output of the network
-
-    def backward(self, y, lr):
-        # backwards pass for weight updates using gradient descent
-        m = y.shape[0]
-        y_one_hot = self.one_hot_encode(y, self.W3.shape[1])  # converts labels to one-hot encoding
-
-        # computes gradients for each layer
-        dz3 = self.a3 - y_one_hot  # gradient for output layer
-        dw3 = (self.a2.T @ dz3) / m
-        db3 = np.sum(dz3, axis=0, keepdims=True) / m
-
-        dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2)  # gradient for layer 2
-        dw2 = (self.a1.T @ dz2) / m
-        db2 = np.sum(dz2, axis=0, keepdims=True) / m
-
-        dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1)  # gradient for layer 1
-        dw1 = (self.x.T @ dz1) / m
-        db1 = np.sum(dz1, axis=0, keepdims=True) / m
-
-        dw3 += self.l2 * self.W3
-        dw2 += self.l2 * self.W2
-        dw1 += self.l2 * self.W1
-
-        # updates weights and biases using gradient descent
-        self.W3 -= lr * dw3
-        self.b3 -= lr * db3
-        self.W2 -= lr * dw2
-        self.b2 -= lr * db2
-        self.W1 -= lr * dw1
-        self.b1 -= lr * db1
-
-    @staticmethod
-    def relu(x):
-        # ReLU activation
-        return np.maximum(0, x)
-
-    @staticmethod
-    def relu_deriv(x):
-        # derivation of ReLU activation for backpropagation
-        return (x > 0).astype(float)
-
-    @staticmethod
-    def softmax(x):
-        # softmax function normalizes outputs to probabilities
-        e_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # exponentiates inputs
-        return e_x / np.sum(e_x, axis=1, keepdims=True)  # normalizes to get probabilities
-
-    @staticmethod
-    def one_hot_encode(y, num_classes):
-        # converts labels to one-hot encoded format
-        return np.eye(num_classes)[y]
-
-    @staticmethod
-    def cross_entropy_loss(y, y_hat):
-        # computes cross-entropy loss between true labels and predicted probabilities
-        m = y.shape[0]
-        m = y.shape[0]
-        eps = 1e-12
-        y_hat_clipped = np.clip(y_hat, eps, 1. - eps)
-        log_probs = -np.log(y_hat_clipped[np.arange(m), y])
-        return np.mean(log_probs)
-
-    def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size):
-        train_losses = []
-        val_accuracies = []
-
-        for epoch in range(1, epochs + 1):
-            perm = np.random.permutation(x_train.shape[0])  # Shuffle the training data
-            x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm]
-
-            epoch_loss = 0.0
-            num_batches = int(np.ceil(x_train.shape[0] / batch_size))
-
-            for i in range(num_batches):
-                start = i * batch_size
-                end = start + batch_size
-                x_batch = x_train_shuffled[start:end]  # batch of inputs
-                y_batch = y_train_shuffled[start:end]  # batch of labels
-
-                # Forward pass, backward pass, and weight update
-                self.forward(x_batch)
-                self.backward(y_batch, lr)
-
-                epoch_loss += self.cross_entropy_loss(y_batch, self.a3)  # updating the epoch loss
-
-            epoch_loss /= num_batches  # average loss is defined
-            train_losses.append(epoch_loss)
-
-            val_pred = self.predict(x_val)
-            val_acc = np.mean(val_pred == y_val)
-            val_accuracies.append(val_acc) \
-
-            print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Value Accuracy: {val_acc:.4f}")
-
-        self.plot_graph(train_losses, val_accuracies)
-        return val_accuracies[-1]
-
-    def plot_graph(self, train_losses, val_accuracies):
-        if not os.path.exists('results'):
-            os.makedirs('results')  # creates results director
-
-        fig, ax1 = plt.subplots()  # initializes the plot
-
-        ax1.set_xlabel('Epochs')
-        ax1.set_ylabel('Training Loss', color='tab:blue')
-        ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss')
-        ax1.tick_params(axis='y', labelcolor='tab:blue')  # defines loss subplot
-
-        ax2 = ax1.twinx()
-        ax2.set_ylabel('Validation Accuracy', color='tab:orange')
-        ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy')
-        ax2.tick_params(axis='y', labelcolor='tab:orange')  # defines accuracy subplot
-
-        plt.title('Training Loss and Validation Accuracy over Epochs')
-
-        result_path = 'results/experiment-3-l2.png'  # defines the file name
-        fig.savefig(result_path)
-        print(f"Graph saved to: {result_path}")
-
-    def predict(self, x):  # predicts class labels for the input data
-        probs = self.forward(x)  # forwards pass to get probabilities
-        return np.argmax(probs, axis=1)  # returns the class with highest probability
-
-
-# acquiring the FashionMNIST dataset
-train_set = datasets.FashionMNIST(root='.', train=True, download=True)
-test_set = datasets.FashionMNIST(root='.', train=False, download=True)
-
-# preprocessing the data by flattening images and normalizing them.
-x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0
-y_train = train_set.targets.numpy()
-
-x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0
-y_test = test_set.targets.numpy()
-
-# MLP Initialization
-mlp = MLP(
-    input_size=28 * 28,
-    hidden_size1=256,
-    hidden_size2=256,
-    output_size=10,
-    weight_scale=1e-2,
-    l2 = 1e-4
-)
-
-# trains the model
-mlp.fit(
-    x_train=x_train,
-    y_train=y_train,
-    x_val=x_test,
-    y_val=y_test,
-    lr=1e-2,
-    epochs=10,
-    batch_size=256
-)
-
-# tests the model
-test_pred = mlp.predict(x_test)
-test_acc = np.mean(test_pred == y_test)
-print(f"\nFinal test accuracy: {test_acc:.4f}")
\ No newline at end of file
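Note: both the deleted script and the surviving ones guard cross_entropy_loss against log(0) by clipping the softmax output to [eps, 1 - eps]. A minimal alternative sketch, not part of this patch, that computes the loss straight from the logits and needs no clipping constant (cross_entropy_from_logits is a hypothetical helper):

# Hypothetical helper, not in the patch: cross-entropy via a stabilized
# log-softmax, so no probability ever passes through np.log unclipped.
def cross_entropy_from_logits(y, z):
    z_shift = z - np.max(z, axis=1, keepdims=True)  # guards against overflow in exp
    log_probs = z_shift - np.log(np.sum(np.exp(z_shift), axis=1, keepdims=True))
    return -np.mean(log_probs[np.arange(y.shape[0]), y])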
diff --git a/experiment-3-l1.py b/experiment-3.py
similarity index 77%
rename from experiment-3-l1.py
rename to experiment-3.py
index 7b535b2..0256ae4 100644
--- a/experiment-3-l1.py
+++ b/experiment-3.py
@@ -4,8 +4,9 @@ from torchvision import datasets
 import os
 
 class MLP:
-    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1):
+    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1=0, l2=0):
         self.l1 = l1
+        self.l2 = l2
 
         # initializes weights and biases for each layer
         self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
@@ -44,9 +45,16 @@ class MLP:
         dw1 = (self.x.T @ dz1) / m
         db1 = np.sum(dz1, axis=0, keepdims=True) / m
 
-        dw3 += self.l1 * np.sign(self.W3)
-        dw2 += self.l1 * np.sign(self.W2)
-        dw1 += self.l1 * np.sign(self.W1)
+        # applying L1 and L2 regularization
+        if self.l1 > 0:
+            dw3 += self.l1 * np.sign(self.W3)
+            dw2 += self.l1 * np.sign(self.W2)
+            dw1 += self.l1 * np.sign(self.W1)
+
+        if self.l2 > 0:
+            dw3 += self.l2 * self.W3
+            dw2 += self.l2 * self.W2
+            dw1 += self.l2 * self.W1
 
         # updates weights and biases using gradient descent
         self.W3 -= lr * dw3
@@ -140,7 +148,14 @@ class MLP:
 
         plt.title('Training Loss and Validation Accuracy over Epochs')
 
-        result_path = 'results/experiment-3-l1.png'  # defines the file name
+        # dynamically sets the filename
+        if self.l1 > 0 and self.l2 > 0:
+            result_path = 'results/experiment-3-l1-and-l2.png'
+        elif self.l1 > 0:
+            result_path = 'results/experiment-3-l1.png'
+        elif self.l2 > 0:
+            result_path = 'results/experiment-3-l2.png'
+
         fig.savefig(result_path)
         print(f"Graph saved to: {result_path}")
 
@@ -159,18 +174,18 @@ y_train = train_set.targets.numpy()
 x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0
 y_test = test_set.targets.numpy()
 
-# MLP Initialization
-mlp = MLP(
+# MLP initialization (L1 regularization)
+mlp_l1 = MLP(
     input_size=28 * 28,
     hidden_size1=256,
     hidden_size2=256,
     output_size=10,
     weight_scale=1e-2,
-    l1 = 1e-6,
+    l1 = 1e-6
 )
 
 # trains the model
-mlp.fit(
+mlp_l1.fit(
     x_train=x_train,
     y_train=y_train,
     x_val=x_test,
@@ -181,6 +196,59 @@ mlp.fit(
 )
 
 # tests the model
-test_pred = mlp.predict(x_test)
-test_acc = np.mean(test_pred == y_test)
-print(f"\nFinal test accuracy: {test_acc:.4f}")
\ No newline at end of file
+test_pred_l1 = mlp_l1.predict(x_test)
+test_acc_l1 = np.mean(test_pred_l1 == y_test)
+print(f"\nFinal test accuracy: {test_acc_l1:.4f}")
+
+# MLP initialization (L2 regularization)
+mlp_l2 = MLP(
+    input_size=28 * 28,
+    hidden_size1=256,
+    hidden_size2=256,
+    output_size=10,
+    weight_scale=1e-2,
+    l2 = 1e-4
+)
+
+# trains the model
+mlp_l2.fit(
+    x_train=x_train,
+    y_train=y_train,
+    x_val=x_test,
+    y_val=y_test,
+    lr=1e-2,
+    epochs=10,
+    batch_size=256
+)
+
+# tests the model
+test_pred_l2 = mlp_l2.predict(x_test)
+test_acc_l2 = np.mean(test_pred_l2 == y_test)
+print(f"\nFinal test accuracy: {test_acc_l2:.4f}")
+
+# MLP initialization (L1 and L2 regularization)
+mlp_l1_l2 = MLP(
+    input_size=28 * 28,
+    hidden_size1=256,
+    hidden_size2=256,
+    output_size=10,
+    weight_scale=1e-2,
+    l1 = 1e-6,
+    l2 = 1e-4
+)
+
+# trains the model
+mlp_l1_l2.fit(
+    x_train=x_train,
+    y_train=y_train,
+    x_val=x_test,
+    y_val=y_test,
+    lr=1e-2,
+    epochs=10,
+    batch_size=256
+)
+
+# tests the model
+test_pred_l1_l2 = mlp_l1_l2.predict(x_test)
+test_acc_l1_l2 = np.mean(test_pred_l1_l2 == y_test)
+print(f"\nFinal test accuracy: {test_acc_l1_l2:.4f}")
\ No newline at end of file
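Note: one gap in the renamed experiment-3.py. The constructor now defaults to l1=0, l2=0, but plot_graph assigns result_path only inside the three regularization branches, so a run with both penalties off reaches fig.savefig(result_path) with the variable unassigned and raises UnboundLocalError. A sketch of a closing else branch (the baseline filename is hypothetical); the same pattern appears in experiment-4.py and experiment-5.py:

# dynamically sets the filename
if self.l1 > 0 and self.l2 > 0:
    result_path = 'results/experiment-3-l1-and-l2.png'
elif self.l1 > 0:
    result_path = 'results/experiment-3-l1.png'
elif self.l2 > 0:
    result_path = 'results/experiment-3-l2.png'
else:
    result_path = 'results/experiment-3-baseline.png'  # hypothetical fallback for unregularized runs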
diff --git a/experiment-4.py b/experiment-4.py
index 01852fd..058502a 100644
--- a/experiment-4.py
+++ b/experiment-4.py
@@ -3,9 +3,8 @@ import matplotlib.pyplot as plt
 from torchvision import datasets
 import os
 
-
 class MLP:
-    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1, l2):
+    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1=0, l2=0):
         self.l1 = l1
         self.l2 = l2
 
@@ -46,18 +45,16 @@ class MLP:
         dw1 = (self.x.T @ dz1) / m
         db1 = np.sum(dz1, axis=0, keepdims=True) / m
 
+        # applying L1 and L2 regularization
+        if self.l1 > 0:
+            dw3 += self.l1 * np.sign(self.W3)
+            dw2 += self.l1 * np.sign(self.W2)
+            dw1 += self.l1 * np.sign(self.W1)
-
-        dw3 += self.l2 * self.W3
-        dw2 += self.l2 * self.W2
-        dw1 += self.l2 * self.W1
-
-
-        dw3 += self.l1 * np.sign(self.W3)
-
-        dw2 += self.l1 * np.sign(self.W2)
-
-        dw1 += self.l1 * np.sign(self.W1)
+        if self.l2 > 0:
+            dw3 += self.l2 * self.W3
+            dw2 += self.l2 * self.W2
+            dw1 += self.l2 * self.W1
 
         # updates weights and biases using gradient descent
         self.W3 -= lr * dw3
@@ -151,7 +148,14 @@ class MLP:
 
         plt.title('Training Loss and Validation Accuracy over Epochs')
 
-        result_path = 'results/experiment-4.png'  # defines the file name
+        # dynamically sets the filename
+        if self.l1 > 0 and self.l2 > 0:
+            result_path = 'results/experiment-4-l1-and-l2.png'
+        elif self.l1 > 0:
+            result_path = 'results/experiment-4-l1.png'
+        elif self.l2 > 0:
+            result_path = 'results/experiment-4-l2.png'
+
         fig.savefig(result_path)
         print(f"Graph saved to: {result_path}")
 
@@ -170,19 +174,18 @@ y_train = train_set.targets.numpy()
 
 x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32)
 y_test = test_set.targets.numpy()
 
-# MLP Initialization
-mlp = MLP(
+# MLP initialization (L1 regularization)
+mlp_l1 = MLP(
     input_size=28 * 28,
     hidden_size1=256,
     hidden_size2=256,
     output_size=10,
     weight_scale=1e-2,
-    l1 = 1e-6,
-    l2 = 1e-4
+    l1 = 1e-6
 )
 
 # trains the model
-mlp.fit(
+mlp_l1.fit(
     x_train=x_train,
     y_train=y_train,
     x_val=x_test,
@@ -193,6 +196,59 @@ mlp.fit(
 )
 
 # tests the model
-test_pred = mlp.predict(x_test)
-test_acc = np.mean(test_pred == y_test)
-print(f"\nFinal test accuracy: {test_acc:.4f}")
\ No newline at end of file
+test_pred_l1 = mlp_l1.predict(x_test)
+test_acc_l1 = np.mean(test_pred_l1 == y_test)
+print(f"\nFinal test accuracy: {test_acc_l1:.4f}")
+
+# MLP initialization (L2 regularization)
+mlp_l2 = MLP(
+    input_size=28 * 28,
+    hidden_size1=256,
+    hidden_size2=256,
+    output_size=10,
+    weight_scale=1e-2,
+    l2 = 1e-4
+)
+
+# trains the model
+mlp_l2.fit(
+    x_train=x_train,
+    y_train=y_train,
+    x_val=x_test,
+    y_val=y_test,
+    lr=1e-2,
+    epochs=10,
+    batch_size=256
+)
+
+# tests the model
+test_pred_l2 = mlp_l2.predict(x_test)
+test_acc_l2 = np.mean(test_pred_l2 == y_test)
+print(f"\nFinal test accuracy: {test_acc_l2:.4f}")
+
+# MLP initialization (L1 and L2 regularization)
+mlp_l1_l2 = MLP(
+    input_size=28 * 28,
+    hidden_size1=256,
+    hidden_size2=256,
+    output_size=10,
+    weight_scale=1e-2,
+    l1 = 1e-6,
+    l2 = 1e-4
+)
+
+# trains the model
+mlp_l1_l2.fit(
+    x_train=x_train,
+    y_train=y_train,
+    x_val=x_test,
+    y_val=y_test,
+    lr=1e-2,
+    epochs=10,
+    batch_size=256
+)
+
+# tests the model
+test_pred_l1_l2 = mlp_l1_l2.predict(x_test)
+test_acc_l1_l2 = np.mean(test_pred_l1_l2 == y_test)
+print(f"\nFinal test accuracy: {test_acc_l1_l2:.4f}")
\ No newline at end of file
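Two observations on experiment-4.py. First, its context lines load pixels as raw 0-255 floats (no / 255.0), unlike experiment-3.py; if that difference is not the point of the experiment, applying the same scaling would make the two comparable. Second, when both penalties are active, the gradient terms l1 * np.sign(W) and l2 * W correspond to an elastic-net-style objective, while the training loss the scripts print is the bare cross-entropy. A sketch of the implied penalty, assuming the usual convention that l2 * W is the gradient of (l2 / 2) * sum(W ** 2):

# Hypothetical helper, not in the patch: the penalty implied by backward()'s terms.
def penalty(model):
    weights = (model.W1, model.W2, model.W3)
    l1_term = model.l1 * sum(np.sum(np.abs(W)) for W in weights)     # gradient: l1 * sign(W)
    l2_term = (model.l2 / 2) * sum(np.sum(W ** 2) for W in weights)  # gradient: l2 * W
    return l1_term + l2_term

# full objective = model.cross_entropy_loss(y, y_hat) + penalty(model)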
diff --git a/experiment-5.py b/experiment-5.py
index 8d1bf9b..36b426a 100644
--- a/experiment-5.py
+++ b/experiment-5.py
@@ -1,18 +1,13 @@
 import numpy as np
 import matplotlib.pyplot as plt
-from torchvision import datasets
-from torchvision import transforms
+from torchvision import datasets, transforms
 import os
 
-
-
 class MLP:
-    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1, l2):
-
+    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1=0, l2=0):
         self.l1 = l1
         self.l2 = l2
-
         # initializes weights and biases for each layer
         self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
         self.b1 = np.zeros((1, hidden_size1))
@@ -50,18 +45,16 @@ class MLP:
         dw1 = (self.x.T @ dz1) / m
         db1 = np.sum(dz1, axis=0, keepdims=True) / m
 
+        # applying L1 and L2 regularization
+        if self.l1 > 0:
+            dw3 += self.l1 * np.sign(self.W3)
+            dw2 += self.l1 * np.sign(self.W2)
+            dw1 += self.l1 * np.sign(self.W1)
-
-        dw3 += self.l2 * self.W3
-        dw2 += self.l2 * self.W2
-        dw1 += self.l2 * self.W1
-
-
-        dw3 += self.l1 * np.sign(self.W3)
-
-        dw2 += self.l1 * np.sign(self.W2)
-
-        dw1 += self.l1 * np.sign(self.W1)
+        if self.l2 > 0:
+            dw3 += self.l2 * self.W3
+            dw2 += self.l2 * self.W2
+            dw1 += self.l2 * self.W1
 
         # updates weights and biases using gradient descent
         self.W3 -= lr * dw3
@@ -155,7 +148,14 @@ class MLP:
 
         plt.title('Training Loss and Validation Accuracy over Epochs')
 
-        result_path = 'results/experiment-5.png'  # defines the file name
+        # dynamically sets the filename
+        if self.l1 > 0 and self.l2 > 0:
+            result_path = 'results/experiment-5-l1-and-l2.png'
+        elif self.l1 > 0:
+            result_path = 'results/experiment-5-l1.png'
+        elif self.l2 > 0:
+            result_path = 'results/experiment-5-l2.png'
+
         fig.savefig(result_path)
         print(f"Graph saved to: {result_path}")
 
@@ -163,31 +163,41 @@ class MLP:
         probs = self.forward(x)  # forwards pass to get probabilities
         return np.argmax(probs, axis=1)  # returns the class with highest probability
 
+# defining the data augmentation transformations for the training set
+transform_train = transforms.Compose([
+    transforms.RandomRotation(20),  # random rotations between -20 and 20 degrees
+    transforms.RandomHorizontalFlip(),  # random horizontal flip
+    transforms.ToTensor(),  # converting images to tensor and normalizing to [0, 1]
+])
+
+# no augmentation for the test set, just converting to tensor
+transform_test = transforms.Compose([
+    transforms.ToTensor(),
+])
+
 # acquiring the FashionMNIST dataset
-transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
-train_set = datasets.FashionMNIST(root='.', train=True, download=True, transform = transform)
-test_set = datasets.FashionMNIST(root='.', train=False, download=True, transform = transform)
+train_set = datasets.FashionMNIST(root='.', train=True, download=True, transform=transform_train)
+test_set = datasets.FashionMNIST(root='.', train=False, download=True, transform=transform_test)
 
 # preprocessing the data by flattening images and normalizing them.
-x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32)
+x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0
 y_train = train_set.targets.numpy()
 
-x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32)
+x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0
 y_test = test_set.targets.numpy()
 
-# MLP Initialization
-mlp = MLP(
+# MLP initialization (L1 regularization)
+mlp_l1 = MLP(
     input_size=28 * 28,
     hidden_size1=256,
     hidden_size2=256,
     output_size=10,
     weight_scale=1e-2,
-    l1 = 1e-6,
-    l2 = 1e-4
+    l1 = 1e-6
 )
 
 # trains the model
-mlp.fit(
+mlp_l1.fit(
    x_train=x_train,
    y_train=y_train,
    x_val=x_test,
@@ -198,6 +208,59 @@ mlp.fit(
 )
 
 # tests the model
-test_pred = mlp.predict(x_test)
-test_acc = np.mean(test_pred == y_test)
-print(f"\nFinal test accuracy: {test_acc:.4f}")
\ No newline at end of file
+test_pred_l1 = mlp_l1.predict(x_test)
+test_acc_l1 = np.mean(test_pred_l1 == y_test)
+print(f"\nFinal test accuracy: {test_acc_l1:.4f}")
+
+# MLP initialization (L2 regularization)
+mlp_l2 = MLP(
+    input_size=28 * 28,
+    hidden_size1=256,
+    hidden_size2=256,
+    output_size=10,
+    weight_scale=1e-2,
+    l2 = 1e-4
+)
+
+# trains the model
+mlp_l2.fit(
+    x_train=x_train,
+    y_train=y_train,
+    x_val=x_test,
+    y_val=y_test,
+    lr=1e-2,
+    epochs=10,
+    batch_size=256
+)
+
+# tests the model
+test_pred_l2 = mlp_l2.predict(x_test)
+test_acc_l2 = np.mean(test_pred_l2 == y_test)
+print(f"\nFinal test accuracy: {test_acc_l2:.4f}")
+
+# MLP initialization (L1 and L2 regularization)
+mlp_l1_l2 = MLP(
+    input_size=28 * 28,
+    hidden_size1=256,
+    hidden_size2=256,
+    output_size=10,
+    weight_scale=1e-2,
+    l1 = 1e-6,
+    l2 = 1e-4
+)
+
+# trains the model
+mlp_l1_l2.fit(
+    x_train=x_train,
+    y_train=y_train,
+    x_val=x_test,
+    y_val=y_test,
+    lr=1e-2,
+    epochs=10,
+    batch_size=256
+)
+
+# tests the model
+test_pred_l1_l2 = mlp_l1_l2.predict(x_test)
+test_acc_l1_l2 = np.mean(test_pred_l1_l2 == y_test)
+print(f"\nFinal test accuracy: {test_acc_l1_l2:.4f}")
\ No newline at end of file
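A caveat on experiment-5.py: torchvision transforms run inside a dataset's __getitem__, and this script never goes through it. Reading train_set.data directly bypasses transform_train, so RandomRotation and RandomHorizontalFlip never touch the training images; the / 255.0 added above is what actually does the scaling. A sketch, not part of the patch, of one way to materialize augmented tensors for the NumPy MLP (this takes a single random draw per image; fresh draws each epoch would require re-fetching):

from torch.utils.data import DataLoader

# fetching through a DataLoader routes every image through transform_train
loader = DataLoader(train_set, batch_size=len(train_set))
images, labels = next(iter(loader))         # (N, 1, 28, 28) float tensor in [0, 1]
x_train = images.view(-1, 28 * 28).numpy()  # ToTensor() already handles the scaling
y_train = labels.numpy()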
diff --git a/results/experiment-3-l1-and-l2.png b/results/experiment-3-l1-and-l2.png
new file mode 100644
index 0000000..4b5f27b
Binary files /dev/null and b/results/experiment-3-l1-and-l2.png differ
diff --git a/results/experiment-3-l1.png b/results/experiment-3-l1.png
index 056b09c..ad3df77 100644
Binary files a/results/experiment-3-l1.png and b/results/experiment-3-l1.png differ
diff --git a/results/experiment-3-l2.png b/results/experiment-3-l2.png
index eb1931f..4335d47 100644
Binary files a/results/experiment-3-l2.png and b/results/experiment-3-l2.png differ
diff --git a/results/experiment-4-l1-and-l2.png b/results/experiment-4-l1-and-l2.png
new file mode 100644
index 0000000..e01e258
Binary files /dev/null and b/results/experiment-4-l1-and-l2.png differ
diff --git a/results/experiment-4-l1.png b/results/experiment-4-l1.png
new file mode 100644
index 0000000..6e174b1
Binary files /dev/null and b/results/experiment-4-l1.png differ
diff --git a/results/experiment-4-l2.png b/results/experiment-4-l2.png
new file mode 100644
index 0000000..b3b912b
Binary files /dev/null and b/results/experiment-4-l2.png differ
diff --git a/results/experiment-5-l1-and-l2.png b/results/experiment-5-l1-and-l2.png
new file mode 100644
index 0000000..77a12ec
Binary files /dev/null and b/results/experiment-5-l1-and-l2.png differ
diff --git a/results/experiment-5-l1.png b/results/experiment-5-l1.png
new file mode 100644
index 0000000..8f173a1
Binary files /dev/null and b/results/experiment-5-l1.png differ
diff --git a/results/experiment-5-l2.png b/results/experiment-5-l2.png
new file mode 100644
index 0000000..f435858
Binary files /dev/null and b/results/experiment-5-l2.png differ
diff --git a/results/experiment-5.png b/results/experiment-5.png
deleted file mode 100644
index 6fdf148..0000000
Binary files a/results/experiment-5.png and /dev/null differ
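A final note on comparability: each script now trains three MLPs whose weights come from independent np.random.randn draws (and whose batches come from independent shuffles), so small accuracy gaps between the L1, L2, and combined runs partly reflect that noise. Seeding NumPy before each constructor call, sketched below with a hypothetical seed value, makes the runs start from identical weights:

np.random.seed(0)  # hypothetical seed; any fixed value makes the runs repeatable
mlp_l1 = MLP(input_size=28 * 28, hidden_size1=256, hidden_size2=256,
             output_size=10, weight_scale=1e-2, l1=1e-6)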