diff --git a/experiment-1.py b/experiment-1.py
index 359efdd..5c6d74f 100644
--- a/experiment-1.py
+++ b/experiment-1.py
@@ -22,43 +22,43 @@ class MLP:
         self.b3 = np.zeros((1, output_size))
 
     def forward(self, x):
-        # Forward pass through the network
+        # forwards pass through the network
         self.x = x  # input for backpropagation
-        self.z1 = x @ self.W1 + self.b1  # Linear transformation for first layer
+        self.z1 = x @ self.W1 + self.b1  # linear transformation for layer 1
         self.a1 = self.relu(self.z1)  # ReLU activation
 
         if self.has_hidden_layer2:
-            self.z2 = self.a1 @ self.W2 + self.b2  # Linear transformation for second layer
+            self.z2 = self.a1 @ self.W2 + self.b2  # linear transformation for layer 2
             self.a2 = self.relu(self.z2)  # ReLU activation
             self.z3 = self.a2 @ self.W3 + self.b3  # Linear transformation for output layer
         else:
             self.z3 = self.a1 @ self.W3 + self.b3  # No second layer, directly to output
 
-        self.a3 = self.softmax(self.z3)  # Softmax to get class probabilities
+        self.a3 = self.softmax(self.z3)  # applies softmax to get class probabilities
         return self.a3
 
     def backward(self, y, lr):
-        # Backward pass for weight updates using gradient descent
+        # backwards pass for weight updates using gradient descent
         m = y.shape[0]
-        y_one_hot = self.one_hot_encode(y, self.W3.shape[1])  # Converts labels to one-hot encoding
+        y_one_hot = self.one_hot_encode(y, self.W3.shape[1])  # converts labels to one-hot encoding
 
-        # Gradient for output layer
-        dz3 = self.a3 - y_one_hot
+        # computes gradients for each layer
+        dz3 = self.a3 - y_one_hot  # gradient for output layer
         dw3 = (self.a2.T if self.has_hidden_layer2 else self.a1.T) @ dz3 / m
         db3 = np.sum(dz3, axis=0, keepdims=True) / m
 
         if self.has_hidden_layer2:
-            dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2)  # Gradient for second hidden layer
+            dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2)  # gradient for second hidden layer
             dw2 = (self.a1.T @ dz2) / m
             db2 = np.sum(dz2, axis=0, keepdims=True) / m
-            dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1)  # Gradient for first hidden layer
+            dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1)  # gradient for the first hidden layer
         else:
-            dz1 = (dz3 @ self.W3.T) * self.relu_deriv(self.z1)  # No second hidden layer
+            dz1 = (dz3 @ self.W3.T) * self.relu_deriv(self.z1)  # no second hidden layer
 
         dw1 = (self.x.T @ dz1) / m
         db1 = np.sum(dz1, axis=0, keepdims=True) / m
 
-        # Update weights and biases using gradient descent
+        # updates weights and biases using gradient descent
         self.W3 -= lr * dw3
         self.b3 -= lr * db3
         if self.has_hidden_layer2:
diff --git a/experiment-2-tanh.py b/experiment-2-tanh.py
deleted file mode 100644
index e69de29..0000000
diff --git a/experiment-2.py b/experiment-2.py
new file mode 100644
index 0000000..d71c469
--- /dev/null
+++ b/experiment-2.py
@@ -0,0 +1,240 @@
+import numpy as np
+import matplotlib.pyplot as plt
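+# note: torchvision is used below only to download and load FashionMNIST; the network itself is plain NumPy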
+from torchvision import datasets +import os + +class MLP_leaky_tanh: + def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, activation_type): + self.activation_type = activation_type + + # initializes weights and biases for each layer + self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale + self.b1 = np.zeros((1, hidden_size1)) + self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale + self.b2 = np.zeros((1, hidden_size2)) + self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale + self.b3 = np.zeros((1, output_size)) + + def forward(self, x, alpha=0): + # forwards pass through the network + self.x = x # input for backpropagation + self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1 + self.a1 = self.activation(self.z1, alpha) # ReLU activation + self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2 + self.a2 = self.activation(self.z2, alpha) # ReLU activation + self.z3 = self.a2 @ self.W3 + self.b3 # linear transformation for layer 3 + self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities + return self.a3 # output of the network + + def backward(self, y, lr, alpha=0): + # backwards pass for weight updates using gradient descent + m = y.shape[0] + y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding + + # computes gradients for each layer + dz3 = self.a3 - y_one_hot # gradient for output layer + dw3 = (self.a2.T @ dz3) / m + db3 = np.sum(dz3, axis=0, keepdims=True) / m + + dz2 = (dz3 @ self.W3.T) * self.activation_deriv(self.z2, alpha) # gradient for layer 2 + dw2 = (self.a1.T @ dz2) / m + db2 = np.sum(dz2, axis=0, keepdims=True) / m + + dz1 = (dz2 @ self.W2.T) * self.activation_deriv(self.z1, alpha) # gradient for layer 1 + dw1 = (self.x.T @ dz1) / m + db1 = np.sum(dz1, axis=0, keepdims=True) / m + + # updates weights and biases using gradient descent + self.W3 -= lr * dw3 + self.b3 -= lr * db3 + self.W2 -= lr * dw2 + self.b2 -= lr * db2 + self.W1 -= lr * dw1 + self.b1 -= lr * db1 + + def activation(self, x, alpha=0): + # chooses activation function based on `activation_type` + if self.activation_type == 'leaky-relu': + return self.Lrelu(x, alpha) + elif self.activation_type == 'tanh': + return self.tanh(x) + else: + raise ValueError("Invalid activation type") + + def activation_deriv(self, x, alpha): + # derivatives for the chosen activation function + if self.activation_type == 'leaky-relu': + return self.Lrelu_deriv(x, alpha) + elif self.activation_type == 'tanh': + return self.tanh_deriv(x) + else: + raise ValueError("Invalid activation type") + + @staticmethod + def Lrelu(x, alpha=0): + # leaky ReLU activation + return np.where(x > 0, x, alpha * x) + + @staticmethod + def Lrelu_deriv(x, alpha=0): + # derivation of leaky ReLU activation for backpropagation + return np.where(x > 0, 1, alpha) + + @staticmethod + def tanh(x): + # tanh formula + return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x)) + + @staticmethod + def tanh_deriv(x): + # derivation of tanh for backpropagation + return 1 - ((np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))) ** 2 + + @staticmethod + def softmax(x): + # softmax function normalizes outputs to probabilities + e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # exponentiates inputs + return e_x / np.sum(e_x, axis=1, keepdims=True) # normalizes to get probabilities + + @staticmethod + def one_hot_encode(y, num_classes): + # converts labels to one-hot encoded format + return 
np.eye(num_classes)[y] + + @staticmethod + def cross_entropy_loss(y, y_hat): + # computes cross-entropy loss between true labels and predicted probabilities + m = y.shape[0] + m = y.shape[0] + eps = 1e-12 + y_hat_clipped = np.clip(y_hat, eps, 1. - eps) + log_probs = -np.log(y_hat_clipped[np.arange(m), y]) + return np.mean(log_probs) + + def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size, activation_type, alpha=0): + train_losses = [] + val_accuracies = [] + + for epoch in range(1, epochs + 1): + perm = np.random.permutation(x_train.shape[0]) # Shuffle the training data + x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm] + + epoch_loss = 0.0 + num_batches = int(np.ceil(x_train.shape[0] / batch_size)) + + for i in range(num_batches): + start = i * batch_size + end = start + batch_size + x_batch = x_train_shuffled[start:end] # batch of inputs + y_batch = y_train_shuffled[start:end] # batch of labels + + # Forward pass, backward pass, and weight update + self.forward(x_batch, alpha) + self.backward(y_batch, lr, alpha) + + epoch_loss += self.cross_entropy_loss(y_batch, self.a3) # updating the epoch loss + + epoch_loss /= num_batches # average loss is defined + train_losses.append(epoch_loss) + + val_pred = self.predict(x_val, alpha) + val_acc = np.mean(val_pred == y_val) + val_accuracies.append(val_acc) \ + + print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Value Accuracy: {val_acc:.4f}") + + self.plot_graph(train_losses, val_accuracies, activation_type) + return val_accuracies[-1] + + def plot_graph(self, train_losses, val_accuracies, activation_type): + if not os.path.exists('results'): + os.makedirs('results') # creates results director + + fig, ax1 = plt.subplots() # initializes the plot + + ax1.set_xlabel('Epochs') + ax1.set_ylabel('Training Loss', color='tab:blue') + ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss') + ax1.tick_params(axis='y', labelcolor='tab:blue') # defines loss subplot + + ax2 = ax1.twinx() + ax2.set_ylabel('Validation Accuracy', color='tab:orange') + ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy') + ax2.tick_params(axis='y', labelcolor='tab:orange') # defines accuracy subplot + + plt.title('Training Loss and Validation Accuracy over Epochs') + + result_path = 'results/experiment-2-' + activation_type + '.png' # defines the file name + fig.savefig(result_path) + print(f"Graph saved to: {result_path}") + + def predict(self, x, alpha=0): # predicts class labels for the input data + probs = self.forward(x, alpha) # forwards pass to get probabilities + return np.argmax(probs, axis=1) # returns the class with highest probability + +# acquiring the FashionMNIST dataset +train_set = datasets.FashionMNIST(root='.', train=True, download=True) +test_set = datasets.FashionMNIST(root='.', train=False, download=True) + +# preprocessing the data by flattening images and normalizing them. 
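+# note: `train_set.data` is the raw uint8 image tensor (60000 x 28 x 28) and `train_set.targets` holds integer
+# class labels 0-9; dividing by 255.0 below rescales pixel values to [0, 1] before they are fed to the MLP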
+x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0 +y_train = train_set.targets.numpy() + +x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0 +y_test = test_set.targets.numpy() + +# MLP initialization (tanh instead of ReLu) +mlp_tanh = MLP_leaky_tanh( + input_size=28 * 28, + hidden_size1=256, + hidden_size2=256, + output_size=10, + weight_scale=1e-2, + activation_type='tanh' +) + +# trains the model +mlp_tanh.fit( + x_train=x_train, + y_train=y_train, + x_val=x_test, + y_val=y_test, + lr=1e-2, + epochs=10, + batch_size=256, + activation_type='tanh' +) + +# tests the model +test_pred_tanh = mlp_tanh.predict(x_test) +test_acc_tanh = np.mean(test_pred_tanh == y_test) +print(f"\nFinal test accuracy: {test_acc_tanh:.4f}") + +# MLP initialization (leaky ReLu instead of ReLu) +mlp_Lrelu = MLP_leaky_tanh( + input_size=28 * 28, + hidden_size1=256, + hidden_size2=256, + output_size=10, + weight_scale=1e-2, + activation_type='leaky-relu' +) +alpha = 0.01 + +# trains the model +mlp_Lrelu.fit( + x_train=x_train, + y_train=y_train, + x_val=x_test, + y_val=y_test, + lr=1e-2, + epochs=10, + batch_size=256, + activation_type='leaky-relu', + alpha=alpha +) + +# tests the model +test_pred_Lrelu = mlp_Lrelu.predict(x_test, alpha) +test_acc_Lrelu = np.mean(test_pred_Lrelu == y_test) +print(f"\nFinal test accuracy: {test_acc_Lrelu:.4f}") \ No newline at end of file diff --git a/experiment-3-l1.py b/experiment-3-l1.py index e69de29..7b535b2 100644 --- a/experiment-3-l1.py +++ b/experiment-3-l1.py @@ -0,0 +1,186 @@ +import numpy as np +import matplotlib.pyplot as plt +from torchvision import datasets +import os + +class MLP: + def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1): + self.l1 = l1 + + # initializes weights and biases for each layer + self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale + self.b1 = np.zeros((1, hidden_size1)) + self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale + self.b2 = np.zeros((1, hidden_size2)) + self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale + self.b3 = np.zeros((1, output_size)) + + def forward(self, x): + # forwards pass through the network + self.x = x # input for backpropagation + self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1 + self.a1 = self.relu(self.z1) # ReLU activation + self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2 + self.a2 = self.relu(self.z2) # ReLU activation + self.z3 = self.a2 @ self.W3 + self.b3 # linear transformation for layer 3 + self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities + return self.a3 # output of the network + + def backward(self, y, lr): + # backwards pass for weight updates using gradient descent + m = y.shape[0] + y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding + + # computes gradients for each layer + dz3 = self.a3 - y_one_hot # gradient for output layer + dw3 = (self.a2.T @ dz3) / m + db3 = np.sum(dz3, axis=0, keepdims=True) / m + + dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # gradient for layer 2 + dw2 = (self.a1.T @ dz2) / m + db2 = np.sum(dz2, axis=0, keepdims=True) / m + + dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # gradient for layer 1 + dw1 = (self.x.T @ dz1) / m + db1 = np.sum(dz1, axis=0, keepdims=True) / m + + dw3 += self.l1 * np.sign(self.W3) + dw2 += self.l1 * np.sign(self.W2) + dw1 += self.l1 * np.sign(self.W1) + + # updates weights and biases using 
gradient descent + self.W3 -= lr * dw3 + self.b3 -= lr * db3 + self.W2 -= lr * dw2 + self.b2 -= lr * db2 + self.W1 -= lr * dw1 + self.b1 -= lr * db1 + + @staticmethod + def relu(x): + # ReLU activation + return np.maximum(0, x) + + @staticmethod + def relu_deriv(x): + # derivation of ReLU activation for backpropagation + return (x > 0).astype(float) + + @staticmethod + def softmax(x): + # softmax function normalizes outputs to probabilities + e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # exponentiates inputs + return e_x / np.sum(e_x, axis=1, keepdims=True) # normalizes to get probabilities + + @staticmethod + def one_hot_encode(y, num_classes): + # converts labels to one-hot encoded format + return np.eye(num_classes)[y] + + @staticmethod + def cross_entropy_loss(y, y_hat): + # computes cross-entropy loss between true labels and predicted probabilities + m = y.shape[0] + m = y.shape[0] + eps = 1e-12 + y_hat_clipped = np.clip(y_hat, eps, 1. - eps) + log_probs = -np.log(y_hat_clipped[np.arange(m), y]) + return np.mean(log_probs) + + def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size): + train_losses = [] + val_accuracies = [] + + for epoch in range(1, epochs + 1): + perm = np.random.permutation(x_train.shape[0]) # Shuffle the training data + x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm] + + epoch_loss = 0.0 + num_batches = int(np.ceil(x_train.shape[0] / batch_size)) + + for i in range(num_batches): + start = i * batch_size + end = start + batch_size + x_batch = x_train_shuffled[start:end] # batch of inputs + y_batch = y_train_shuffled[start:end] # batch of labels + + # Forward pass, backward pass, and weight update + self.forward(x_batch) + self.backward(y_batch, lr) + + epoch_loss += self.cross_entropy_loss(y_batch, self.a3) # updating the epoch loss + + epoch_loss /= num_batches # average loss is defined + train_losses.append(epoch_loss) + + val_pred = self.predict(x_val) + val_acc = np.mean(val_pred == y_val) + val_accuracies.append(val_acc) \ + + print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Value Accuracy: {val_acc:.4f}") + + self.plot_graph(train_losses, val_accuracies) + return val_accuracies[-1] + + def plot_graph(self, train_losses, val_accuracies): + if not os.path.exists('results'): + os.makedirs('results') # creates results director + + fig, ax1 = plt.subplots() # initializes the plot + + ax1.set_xlabel('Epochs') + ax1.set_ylabel('Training Loss', color='tab:blue') + ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss') + ax1.tick_params(axis='y', labelcolor='tab:blue') # defines loss subplot + + ax2 = ax1.twinx() + ax2.set_ylabel('Validation Accuracy', color='tab:orange') + ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy') + ax2.tick_params(axis='y', labelcolor='tab:orange') # defines accuracy subplot + + plt.title('Training Loss and Validation Accuracy over Epochs') + + result_path = 'results/experiment-3-l1.png' # defines the file name + fig.savefig(result_path) + print(f"Graph saved to: {result_path}") + + def predict(self, x): # predicts class labels for the input data + probs = self.forward(x) # forwards pass to get probabilities + return np.argmax(probs, axis=1) # returns the class with highest probability + +# acquiring the FashionMNIST dataset +train_set = datasets.FashionMNIST(root='.', train=True, download=True) +test_set = datasets.FashionMNIST(root='.', train=False, download=True) + +# preprocessing the 
data by flattening images and normalizing them. +x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0 +y_train = train_set.targets.numpy() + +x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0 +y_test = test_set.targets.numpy() + +# MLP Initialization +mlp = MLP( + input_size=28 * 28, + hidden_size1=256, + hidden_size2=256, + output_size=10, + weight_scale=1e-2, + l1 = 1e-6, +) + +# trains the model +mlp.fit( + x_train=x_train, + y_train=y_train, + x_val=x_test, + y_val=y_test, + lr=1e-2, + epochs=10, + batch_size=256 +) + +# tests the model +test_pred = mlp.predict(x_test) +test_acc = np.mean(test_pred == y_test) +print(f"\nFinal test accuracy: {test_acc:.4f}") \ No newline at end of file diff --git a/experiment-3-l2.py b/experiment-3-l2.py index e69de29..7805470 100644 --- a/experiment-3-l2.py +++ b/experiment-3-l2.py @@ -0,0 +1,187 @@ +import numpy as np +import matplotlib.pyplot as plt +from torchvision import datasets +import os + + +class MLP: + def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l2): + self.l2 = l2 + + # initializes weights and biases for each layer + self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale + self.b1 = np.zeros((1, hidden_size1)) + self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale + self.b2 = np.zeros((1, hidden_size2)) + self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale + self.b3 = np.zeros((1, output_size)) + + def forward(self, x): + # forwards pass through the network + self.x = x # input for backpropagation + self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1 + self.a1 = self.relu(self.z1) # ReLU activation + self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2 + self.a2 = self.relu(self.z2) # ReLU activation + self.z3 = self.a2 @ self.W3 + self.b3 # linear transformation for layer 3 + self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities + return self.a3 # output of the network + + def backward(self, y, lr): + # backwards pass for weight updates using gradient descent + m = y.shape[0] + y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding + + # computes gradients for each layer + dz3 = self.a3 - y_one_hot # gradient for output layer + dw3 = (self.a2.T @ dz3) / m + db3 = np.sum(dz3, axis=0, keepdims=True) / m + + dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # gradient for layer 2 + dw2 = (self.a1.T @ dz2) / m + db2 = np.sum(dz2, axis=0, keepdims=True) / m + + dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # gradient for layer 1 + dw1 = (self.x.T @ dz1) / m + db1 = np.sum(dz1, axis=0, keepdims=True) / m + + dw3 += self.l2 * self.W3 + dw2 += self.l2 * self.W2 + dw1 += self.l2 * self.W1 + + # updates weights and biases using gradient descent + self.W3 -= lr * dw3 + self.b3 -= lr * db3 + self.W2 -= lr * dw2 + self.b2 -= lr * db2 + self.W1 -= lr * dw1 + self.b1 -= lr * db1 + + @staticmethod + def relu(x): + # ReLU activation + return np.maximum(0, x) + + @staticmethod + def relu_deriv(x): + # derivation of ReLU activation for backpropagation + return (x > 0).astype(float) + + @staticmethod + def softmax(x): + # softmax function normalizes outputs to probabilities + e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # exponentiates inputs + return e_x / np.sum(e_x, axis=1, keepdims=True) # normalizes to get probabilities + + @staticmethod + def one_hot_encode(y, num_classes): + # converts labels to one-hot 
encoded format + return np.eye(num_classes)[y] + + @staticmethod + def cross_entropy_loss(y, y_hat): + # computes cross-entropy loss between true labels and predicted probabilities + m = y.shape[0] + m = y.shape[0] + eps = 1e-12 + y_hat_clipped = np.clip(y_hat, eps, 1. - eps) + log_probs = -np.log(y_hat_clipped[np.arange(m), y]) + return np.mean(log_probs) + + def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size): + train_losses = [] + val_accuracies = [] + + for epoch in range(1, epochs + 1): + perm = np.random.permutation(x_train.shape[0]) # Shuffle the training data + x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm] + + epoch_loss = 0.0 + num_batches = int(np.ceil(x_train.shape[0] / batch_size)) + + for i in range(num_batches): + start = i * batch_size + end = start + batch_size + x_batch = x_train_shuffled[start:end] # batch of inputs + y_batch = y_train_shuffled[start:end] # batch of labels + + # Forward pass, backward pass, and weight update + self.forward(x_batch) + self.backward(y_batch, lr) + + epoch_loss += self.cross_entropy_loss(y_batch, self.a3) # updating the epoch loss + + epoch_loss /= num_batches # average loss is defined + train_losses.append(epoch_loss) + + val_pred = self.predict(x_val) + val_acc = np.mean(val_pred == y_val) + val_accuracies.append(val_acc) \ + + print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Value Accuracy: {val_acc:.4f}") + + self.plot_graph(train_losses, val_accuracies) + return val_accuracies[-1] + + def plot_graph(self, train_losses, val_accuracies): + if not os.path.exists('results'): + os.makedirs('results') # creates results director + + fig, ax1 = plt.subplots() # initializes the plot + + ax1.set_xlabel('Epochs') + ax1.set_ylabel('Training Loss', color='tab:blue') + ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss') + ax1.tick_params(axis='y', labelcolor='tab:blue') # defines loss subplot + + ax2 = ax1.twinx() + ax2.set_ylabel('Validation Accuracy', color='tab:orange') + ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy') + ax2.tick_params(axis='y', labelcolor='tab:orange') # defines accuracy subplot + + plt.title('Training Loss and Validation Accuracy over Epochs') + + result_path = 'results/experiment-3-l2.png' # defines the file name + fig.savefig(result_path) + print(f"Graph saved to: {result_path}") + + def predict(self, x): # predicts class labels for the input data + probs = self.forward(x) # forwards pass to get probabilities + return np.argmax(probs, axis=1) # returns the class with highest probability + +# acquiring the FashionMNIST dataset +train_set = datasets.FashionMNIST(root='.', train=True, download=True) +test_set = datasets.FashionMNIST(root='.', train=False, download=True) + +# preprocessing the data by flattening images and normalizing them. 
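+# note: as in the other experiments, the FashionMNIST test split below also serves as the validation set passed to fit()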
+x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0 +y_train = train_set.targets.numpy() + +x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0 +y_test = test_set.targets.numpy() + +# MLP Initialization +mlp = MLP( + input_size=28 * 28, + hidden_size1=256, + hidden_size2=256, + output_size=10, + weight_scale=1e-2, + l2 = 1e-4 +) + +# trains the model +mlp.fit( + x_train=x_train, + y_train=y_train, + x_val=x_test, + y_val=y_test, + lr=1e-2, + epochs=10, + batch_size=256 +) + +# tests the model +test_pred = mlp.predict(x_test) +test_acc = np.mean(test_pred == y_test) +print(f"\nFinal test accuracy: {test_acc:.4f}") \ No newline at end of file diff --git a/experiment-4.py b/experiment-4.py index e69de29..01852fd 100644 --- a/experiment-4.py +++ b/experiment-4.py @@ -0,0 +1,198 @@ +import numpy as np +import matplotlib.pyplot as plt +from torchvision import datasets +import os + + +class MLP: + def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1, l2): + self.l1 = l1 + self.l2 = l2 + + # initializes weights and biases for each layer + self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale + self.b1 = np.zeros((1, hidden_size1)) + self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale + self.b2 = np.zeros((1, hidden_size2)) + self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale + self.b3 = np.zeros((1, output_size)) + + def forward(self, x): + # forwards pass through the network + self.x = x # input for backpropagation + self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1 + self.a1 = self.relu(self.z1) # ReLU activation + self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2 + self.a2 = self.relu(self.z2) # ReLU activation + self.z3 = self.a2 @ self.W3 + self.b3 # linear transformation for layer 3 + self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities + return self.a3 # output of the network + + def backward(self, y, lr): + # backwards pass for weight updates using gradient descent + m = y.shape[0] + y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding + + # computes gradients for each layer + dz3 = self.a3 - y_one_hot # gradient for output layer + dw3 = (self.a2.T @ dz3) / m + db3 = np.sum(dz3, axis=0, keepdims=True) / m + + dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # gradient for layer 2 + dw2 = (self.a1.T @ dz2) / m + db2 = np.sum(dz2, axis=0, keepdims=True) / m + + dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # gradient for layer 1 + dw1 = (self.x.T @ dz1) / m + db1 = np.sum(dz1, axis=0, keepdims=True) / m + + + + dw3 += self.l2 * self.W3 + dw2 += self.l2 * self.W2 + dw1 += self.l2 * self.W1 + + + dw3 += self.l1 * np.sign(self.W3) + + dw2 += self.l1 * np.sign(self.W2) + + dw1 += self.l1 * np.sign(self.W1) + + # updates weights and biases using gradient descent + self.W3 -= lr * dw3 + self.b3 -= lr * db3 + self.W2 -= lr * dw2 + self.b2 -= lr * db2 + self.W1 -= lr * dw1 + self.b1 -= lr * db1 + + @staticmethod + def relu(x): + # ReLU activation + return np.maximum(0, x) + + @staticmethod + def relu_deriv(x): + # derivation of ReLU activation for backpropagation + return (x > 0).astype(float) + + @staticmethod + def softmax(x): + # softmax function normalizes outputs to probabilities + e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # exponentiates inputs + return e_x / np.sum(e_x, axis=1, keepdims=True) # normalizes to get probabilities + + 
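+    # (subtracting the per-row max before exponentiating is the usual stabilization trick:
+    # it prevents overflow in np.exp without changing the resulting probabilities)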
@staticmethod + def one_hot_encode(y, num_classes): + # converts labels to one-hot encoded format + return np.eye(num_classes)[y] + + @staticmethod + def cross_entropy_loss(y, y_hat): + # computes cross-entropy loss between true labels and predicted probabilities + m = y.shape[0] + m = y.shape[0] + eps = 1e-12 + y_hat_clipped = np.clip(y_hat, eps, 1. - eps) + log_probs = -np.log(y_hat_clipped[np.arange(m), y]) + return np.mean(log_probs) + + def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size): + train_losses = [] + val_accuracies = [] + + for epoch in range(1, epochs + 1): + perm = np.random.permutation(x_train.shape[0]) # Shuffle the training data + x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm] + + epoch_loss = 0.0 + num_batches = int(np.ceil(x_train.shape[0] / batch_size)) + + for i in range(num_batches): + start = i * batch_size + end = start + batch_size + x_batch = x_train_shuffled[start:end] # batch of inputs + y_batch = y_train_shuffled[start:end] # batch of labels + + # Forward pass, backward pass, and weight update + self.forward(x_batch) + self.backward(y_batch, lr) + + epoch_loss += self.cross_entropy_loss(y_batch, self.a3) # updating the epoch loss + + epoch_loss /= num_batches # average loss is defined + train_losses.append(epoch_loss) + + val_pred = self.predict(x_val) + val_acc = np.mean(val_pred == y_val) + val_accuracies.append(val_acc) \ + + print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Value Accuracy: {val_acc:.4f}") + + self.plot_graph(train_losses, val_accuracies) + return val_accuracies[-1] + + def plot_graph(self, train_losses, val_accuracies): + if not os.path.exists('results'): + os.makedirs('results') # creates results director + + fig, ax1 = plt.subplots() # initializes the plot + + ax1.set_xlabel('Epochs') + ax1.set_ylabel('Training Loss', color='tab:blue') + ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss') + ax1.tick_params(axis='y', labelcolor='tab:blue') # defines loss subplot + + ax2 = ax1.twinx() + ax2.set_ylabel('Validation Accuracy', color='tab:orange') + ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy') + ax2.tick_params(axis='y', labelcolor='tab:orange') # defines accuracy subplot + + plt.title('Training Loss and Validation Accuracy over Epochs') + + result_path = 'results/experiment-4.png' # defines the file name + fig.savefig(result_path) + print(f"Graph saved to: {result_path}") + + def predict(self, x): # predicts class labels for the input data + probs = self.forward(x) # forwards pass to get probabilities + return np.argmax(probs, axis=1) # returns the class with highest probability + +# acquiring the FashionMNIST dataset +train_set = datasets.FashionMNIST(root='.', train=True, download=True) +test_set = datasets.FashionMNIST(root='.', train=False, download=True) + +# preprocessing the data by flattening images without normalizing them. 
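+# note: unlike the earlier experiments there is no division by 255 here, so inputs stay in the raw [0, 255] range;
+# presumably this run is meant to show how skipping normalization affects training under otherwise identical settings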
+x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) +y_train = train_set.targets.numpy() + +x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) +y_test = test_set.targets.numpy() + +# MLP Initialization +mlp = MLP( + input_size=28 * 28, + hidden_size1=256, + hidden_size2=256, + output_size=10, + weight_scale=1e-2, + l1 = 1e-6, + l2 = 1e-4 +) + +# trains the model +mlp.fit( + x_train=x_train, + y_train=y_train, + x_val=x_test, + y_val=y_test, + lr=1e-2, + epochs=10, + batch_size=256 +) + +# tests the model +test_pred = mlp.predict(x_test) +test_acc = np.mean(test_pred == y_test) +print(f"\nFinal test accuracy: {test_acc:.4f}") \ No newline at end of file diff --git a/experiment-5.py b/experiment-5.py index e69de29..8d1bf9b 100644 --- a/experiment-5.py +++ b/experiment-5.py @@ -0,0 +1,203 @@ +import numpy as np +import matplotlib.pyplot as plt +from torchvision import datasets +from torchvision import transforms +import os + + + +class MLP: + def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1, l2): + + self.l1 = l1 + self.l2 = l2 + + + # initializes weights and biases for each layer + self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale + self.b1 = np.zeros((1, hidden_size1)) + self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale + self.b2 = np.zeros((1, hidden_size2)) + self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale + self.b3 = np.zeros((1, output_size)) + + def forward(self, x): + # forwards pass through the network + self.x = x # input for backpropagation + self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1 + self.a1 = self.relu(self.z1) # ReLU activation + self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2 + self.a2 = self.relu(self.z2) # ReLU activation + self.z3 = self.a2 @ self.W3 + self.b3 # linear transformation for layer 3 + self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities + return self.a3 # output of the network + + def backward(self, y, lr): + # backwards pass for weight updates using gradient descent + m = y.shape[0] + y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding + + # computes gradients for each layer + dz3 = self.a3 - y_one_hot # gradient for output layer + dw3 = (self.a2.T @ dz3) / m + db3 = np.sum(dz3, axis=0, keepdims=True) / m + + dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # gradient for layer 2 + dw2 = (self.a1.T @ dz2) / m + db2 = np.sum(dz2, axis=0, keepdims=True) / m + + dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # gradient for layer 1 + dw1 = (self.x.T @ dz1) / m + db1 = np.sum(dz1, axis=0, keepdims=True) / m + + + + dw3 += self.l2 * self.W3 + dw2 += self.l2 * self.W2 + dw1 += self.l2 * self.W1 + + + dw3 += self.l1 * np.sign(self.W3) + + dw2 += self.l1 * np.sign(self.W2) + + dw1 += self.l1 * np.sign(self.W1) + + # updates weights and biases using gradient descent + self.W3 -= lr * dw3 + self.b3 -= lr * db3 + self.W2 -= lr * dw2 + self.b2 -= lr * db2 + self.W1 -= lr * dw1 + self.b1 -= lr * db1 + + @staticmethod + def relu(x): + # ReLU activation + return np.maximum(0, x) + + @staticmethod + def relu_deriv(x): + # derivation of ReLU activation for backpropagation + return (x > 0).astype(float) + + @staticmethod + def softmax(x): + # softmax function normalizes outputs to probabilities + e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # exponentiates inputs + return e_x / np.sum(e_x, axis=1, keepdims=True) # 
normalizes to get probabilities + + @staticmethod + def one_hot_encode(y, num_classes): + # converts labels to one-hot encoded format + return np.eye(num_classes)[y] + + @staticmethod + def cross_entropy_loss(y, y_hat): + # computes cross-entropy loss between true labels and predicted probabilities + m = y.shape[0] + m = y.shape[0] + eps = 1e-12 + y_hat_clipped = np.clip(y_hat, eps, 1. - eps) + log_probs = -np.log(y_hat_clipped[np.arange(m), y]) + return np.mean(log_probs) + + def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size): + train_losses = [] + val_accuracies = [] + + for epoch in range(1, epochs + 1): + perm = np.random.permutation(x_train.shape[0]) # Shuffle the training data + x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm] + + epoch_loss = 0.0 + num_batches = int(np.ceil(x_train.shape[0] / batch_size)) + + for i in range(num_batches): + start = i * batch_size + end = start + batch_size + x_batch = x_train_shuffled[start:end] # batch of inputs + y_batch = y_train_shuffled[start:end] # batch of labels + + # Forward pass, backward pass, and weight update + self.forward(x_batch) + self.backward(y_batch, lr) + + epoch_loss += self.cross_entropy_loss(y_batch, self.a3) # updating the epoch loss + + epoch_loss /= num_batches # average loss is defined + train_losses.append(epoch_loss) + + val_pred = self.predict(x_val) + val_acc = np.mean(val_pred == y_val) + val_accuracies.append(val_acc) \ + + print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Value Accuracy: {val_acc:.4f}") + + self.plot_graph(train_losses, val_accuracies) + return val_accuracies[-1] + + def plot_graph(self, train_losses, val_accuracies): + if not os.path.exists('results'): + os.makedirs('results') # creates results director + + fig, ax1 = plt.subplots() # initializes the plot + + ax1.set_xlabel('Epochs') + ax1.set_ylabel('Training Loss', color='tab:blue') + ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss') + ax1.tick_params(axis='y', labelcolor='tab:blue') # defines loss subplot + + ax2 = ax1.twinx() + ax2.set_ylabel('Validation Accuracy', color='tab:orange') + ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy') + ax2.tick_params(axis='y', labelcolor='tab:orange') # defines accuracy subplot + + plt.title('Training Loss and Validation Accuracy over Epochs') + + result_path = 'results/experiment-5.png' # defines the file name + fig.savefig(result_path) + print(f"Graph saved to: {result_path}") + + def predict(self, x): # predicts class labels for the input data + probs = self.forward(x) # forwards pass to get probabilities + return np.argmax(probs, axis=1) # returns the class with highest probability + +# acquiring the FashionMNIST dataset +transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]) +train_set = datasets.FashionMNIST(root='.', train=True, download=True, transform = transform) +test_set = datasets.FashionMNIST(root='.', train=False, download=True, transform = transform) + +# preprocessing the data by flattening images and normalizing them. 
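+# note: torchvision applies `transform` lazily in __getitem__, not to the raw `.data` tensor, so the
+# Normalize((0.5,), (0.5,)) defined above does not affect the arrays built below; if that normalization is
+# intended here, it would need to be applied manually, e.g. x = (x / 255.0 - 0.5) / 0.5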
+x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) +y_train = train_set.targets.numpy() + +x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) +y_test = test_set.targets.numpy() + +# MLP Initialization +mlp = MLP( + input_size=28 * 28, + hidden_size1=256, + hidden_size2=256, + output_size=10, + weight_scale=1e-2, + l1 = 1e-6, + l2 = 1e-4 +) + +# trains the model +mlp.fit( + x_train=x_train, + y_train=y_train, + x_val=x_test, + y_val=y_test, + lr=1e-2, + epochs=10, + batch_size=256 +) + +# tests the model +test_pred = mlp.predict(x_test) +test_acc = np.mean(test_pred == y_test) +print(f"\nFinal test accuracy: {test_acc:.4f}") \ No newline at end of file diff --git a/experiment-2-leaky-relu.py b/experiment-6-convolutional-neural-network.py similarity index 100% rename from experiment-2-leaky-relu.py rename to experiment-6-convolutional-neural-network.py diff --git a/experiment-6.py b/experiment-6.py deleted file mode 100644 index e69de29..0000000 diff --git a/results/MLP-output.png b/results/MLP-output.png index 6536afd..6eb97dd 100644 Binary files a/results/MLP-output.png and b/results/MLP-output.png differ diff --git a/results/experiment-1-1.png b/results/experiment-1-1.png index 4e6c4c4..35e26dc 100644 Binary files a/results/experiment-1-1.png and b/results/experiment-1-1.png differ diff --git a/results/experiment-1-2.png b/results/experiment-1-2.png index e4e8597..486de1e 100644 Binary files a/results/experiment-1-2.png and b/results/experiment-1-2.png differ diff --git a/results/experiment-1-3.png b/results/experiment-1-3.png index d47730b..4558c73 100644 Binary files a/results/experiment-1-3.png and b/results/experiment-1-3.png differ diff --git a/results/experiment-2-leaky-relu.png b/results/experiment-2-leaky-relu.png new file mode 100644 index 0000000..2eac036 Binary files /dev/null and b/results/experiment-2-leaky-relu.png differ diff --git a/results/experiment-2-tanh.png b/results/experiment-2-tanh.png new file mode 100644 index 0000000..9899abc Binary files /dev/null and b/results/experiment-2-tanh.png differ diff --git a/results/experiment-3-l1.png b/results/experiment-3-l1.png new file mode 100644 index 0000000..056b09c Binary files /dev/null and b/results/experiment-3-l1.png differ diff --git a/results/experiment-3-l2.png b/results/experiment-3-l2.png new file mode 100644 index 0000000..eb1931f Binary files /dev/null and b/results/experiment-3-l2.png differ diff --git a/results/experiment-4.png b/results/experiment-4.png new file mode 100644 index 0000000..b30c60d Binary files /dev/null and b/results/experiment-4.png differ diff --git a/results/experiment-5.png b/results/experiment-5.png new file mode 100644 index 0000000..6fdf148 Binary files /dev/null and b/results/experiment-5.png differ