diff --git a/experiment-1.py b/experiment-1.py
index 359efdd..5c6d74f 100644
--- a/experiment-1.py
+++ b/experiment-1.py
@@ -22,43 +22,43 @@ class MLP:
self.b3 = np.zeros((1, output_size))
def forward(self, x):
- # Forward pass through the network
+        # forward pass through the network
self.x = x # input for backpropagation
- self.z1 = x @ self.W1 + self.b1 # Linear transformation for first layer
+ self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1
self.a1 = self.relu(self.z1) # ReLU activation
if self.has_hidden_layer2:
- self.z2 = self.a1 @ self.W2 + self.b2 # Linear transformation for second layer
+ self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2
self.a2 = self.relu(self.z2) # ReLU activation
self.z3 = self.a2 @ self.W3 + self.b3 # Linear transformation for output layer
else:
self.z3 = self.a1 @ self.W3 + self.b3 # No second layer, directly to output
- self.a3 = self.softmax(self.z3) # Softmax to get class probabilities
+ self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities
return self.a3
def backward(self, y, lr):
- # Backward pass for weight updates using gradient descent
+        # backward pass for weight updates using gradient descent
m = y.shape[0]
- y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # Converts labels to one-hot encoding
+ y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding
- # Gradient for output layer
- dz3 = self.a3 - y_one_hot
+ # computes gradients for each layer
+ dz3 = self.a3 - y_one_hot # gradient for output layer
dw3 = (self.a2.T if self.has_hidden_layer2 else self.a1.T) @ dz3 / m
db3 = np.sum(dz3, axis=0, keepdims=True) / m
if self.has_hidden_layer2:
- dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # Gradient for second hidden layer
+ dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # gradient for second hidden layer
dw2 = (self.a1.T @ dz2) / m
db2 = np.sum(dz2, axis=0, keepdims=True) / m
- dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # Gradient for first hidden layer
+            dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # gradient for first hidden layer
else:
- dz1 = (dz3 @ self.W3.T) * self.relu_deriv(self.z1) # No second hidden layer
+ dz1 = (dz3 @ self.W3.T) * self.relu_deriv(self.z1) # no second hidden layer
dw1 = (self.x.T @ dz1) / m
db1 = np.sum(dz1, axis=0, keepdims=True) / m
- # Update weights and biases using gradient descent
+ # updates weights and biases using gradient descent
self.W3 -= lr * dw3
self.b3 -= lr * db3
if self.has_hidden_layer2:
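Review note (experiment-1.py): the backward pass relies on the identity that, for softmax outputs trained with cross-entropy, the gradient of the mean loss with respect to the logits is (a3 - y_one_hot) / m, which is exactly dz3 / m as used above. A minimal finite-difference check of that identity, assuming only NumPy (the softmax and ce_loss helpers below are illustrative and not part of the patch):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def ce_loss(z, y):
    # mean cross-entropy of softmax(z) against integer labels y
    p = softmax(z)
    return -np.log(p[np.arange(len(y)), y]).mean()

rng = np.random.default_rng(0)
z = rng.normal(size=(4, 10))        # logits for 4 samples, 10 classes
y = rng.integers(0, 10, size=4)     # integer labels

analytic = (softmax(z) - np.eye(10)[y]) / len(y)   # (a3 - y_one_hot) / m

numeric = np.zeros_like(z)
eps = 1e-6
for i in range(z.shape[0]):
    for j in range(z.shape[1]):
        zp, zm = z.copy(), z.copy()
        zp[i, j] += eps
        zm[i, j] -= eps
        numeric[i, j] = (ce_loss(zp, y) - ce_loss(zm, y)) / (2 * eps)

print(np.max(np.abs(analytic - numeric)))   # expected to be on the order of 1e-9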
diff --git a/experiment-2-tanh.py b/experiment-2-tanh.py
deleted file mode 100644
index e69de29..0000000
diff --git a/experiment-2.py b/experiment-2.py
new file mode 100644
index 0000000..d71c469
--- /dev/null
+++ b/experiment-2.py
@@ -0,0 +1,239 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from torchvision import datasets
+import os
+
+class MLP_leaky_tanh:
+ def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, activation_type):
+ self.activation_type = activation_type
+
+ # initializes weights and biases for each layer
+ self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
+ self.b1 = np.zeros((1, hidden_size1))
+ self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale
+ self.b2 = np.zeros((1, hidden_size2))
+ self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale
+ self.b3 = np.zeros((1, output_size))
+
+ def forward(self, x, alpha=0):
+        # forward pass through the network
+ self.x = x # input for backpropagation
+ self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1
+        self.a1 = self.activation(self.z1, alpha) # activation (leaky ReLU or tanh)
+ self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2
+        self.a2 = self.activation(self.z2, alpha) # activation (leaky ReLU or tanh)
+ self.z3 = self.a2 @ self.W3 + self.b3 # linear transformation for layer 3
+ self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities
+ return self.a3 # output of the network
+
+ def backward(self, y, lr, alpha=0):
+        # backward pass for weight updates using gradient descent
+ m = y.shape[0]
+ y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding
+
+ # computes gradients for each layer
+ dz3 = self.a3 - y_one_hot # gradient for output layer
+ dw3 = (self.a2.T @ dz3) / m
+ db3 = np.sum(dz3, axis=0, keepdims=True) / m
+
+ dz2 = (dz3 @ self.W3.T) * self.activation_deriv(self.z2, alpha) # gradient for layer 2
+ dw2 = (self.a1.T @ dz2) / m
+ db2 = np.sum(dz2, axis=0, keepdims=True) / m
+
+ dz1 = (dz2 @ self.W2.T) * self.activation_deriv(self.z1, alpha) # gradient for layer 1
+ dw1 = (self.x.T @ dz1) / m
+ db1 = np.sum(dz1, axis=0, keepdims=True) / m
+
+ # updates weights and biases using gradient descent
+ self.W3 -= lr * dw3
+ self.b3 -= lr * db3
+ self.W2 -= lr * dw2
+ self.b2 -= lr * db2
+ self.W1 -= lr * dw1
+ self.b1 -= lr * db1
+
+ def activation(self, x, alpha=0):
+ # chooses activation function based on `activation_type`
+ if self.activation_type == 'leaky-relu':
+ return self.Lrelu(x, alpha)
+ elif self.activation_type == 'tanh':
+ return self.tanh(x)
+ else:
+ raise ValueError("Invalid activation type")
+
+ def activation_deriv(self, x, alpha):
+ # derivatives for the chosen activation function
+ if self.activation_type == 'leaky-relu':
+ return self.Lrelu_deriv(x, alpha)
+ elif self.activation_type == 'tanh':
+ return self.tanh_deriv(x)
+ else:
+ raise ValueError("Invalid activation type")
+
+ @staticmethod
+ def Lrelu(x, alpha=0):
+ # leaky ReLU activation
+ return np.where(x > 0, x, alpha * x)
+
+ @staticmethod
+ def Lrelu_deriv(x, alpha=0):
+        # derivative of leaky ReLU activation for backpropagation
+ return np.where(x > 0, 1, alpha)
+
+ @staticmethod
+ def tanh(x):
+        # tanh activation (np.tanh avoids overflow from the explicit exponential form)
+        return np.tanh(x)
+
+ @staticmethod
+ def tanh_deriv(x):
+        # derivative of tanh for backpropagation: 1 - tanh(x)^2
+        return 1 - np.tanh(x) ** 2
+
+ @staticmethod
+ def softmax(x):
+ # softmax function normalizes outputs to probabilities
+ e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # exponentiates inputs
+ return e_x / np.sum(e_x, axis=1, keepdims=True) # normalizes to get probabilities
+
+ @staticmethod
+ def one_hot_encode(y, num_classes):
+ # converts labels to one-hot encoded format
+ return np.eye(num_classes)[y]
+
+ @staticmethod
+ def cross_entropy_loss(y, y_hat):
+ # computes cross-entropy loss between true labels and predicted probabilities
+        m = y.shape[0]
+ eps = 1e-12
+ y_hat_clipped = np.clip(y_hat, eps, 1. - eps)
+ log_probs = -np.log(y_hat_clipped[np.arange(m), y])
+ return np.mean(log_probs)
+
+ def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size, activation_type, alpha=0):
+ train_losses = []
+ val_accuracies = []
+
+ for epoch in range(1, epochs + 1):
+ perm = np.random.permutation(x_train.shape[0]) # Shuffle the training data
+ x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm]
+
+ epoch_loss = 0.0
+ num_batches = int(np.ceil(x_train.shape[0] / batch_size))
+
+ for i in range(num_batches):
+ start = i * batch_size
+ end = start + batch_size
+ x_batch = x_train_shuffled[start:end] # batch of inputs
+ y_batch = y_train_shuffled[start:end] # batch of labels
+
+ # Forward pass, backward pass, and weight update
+ self.forward(x_batch, alpha)
+ self.backward(y_batch, lr, alpha)
+
+ epoch_loss += self.cross_entropy_loss(y_batch, self.a3) # updating the epoch loss
+
+            epoch_loss /= num_batches # average loss over the epoch
+            train_losses.append(epoch_loss)
+
+            val_pred = self.predict(x_val, alpha)
+            val_acc = np.mean(val_pred == y_val)
+            val_accuracies.append(val_acc)
+
+            print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Validation Accuracy: {val_acc:.4f}")
+
+ self.plot_graph(train_losses, val_accuracies, activation_type)
+ return val_accuracies[-1]
+
+ def plot_graph(self, train_losses, val_accuracies, activation_type):
+ if not os.path.exists('results'):
+            os.makedirs('results') # creates results directory
+
+ fig, ax1 = plt.subplots() # initializes the plot
+
+ ax1.set_xlabel('Epochs')
+ ax1.set_ylabel('Training Loss', color='tab:blue')
+ ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss')
+ ax1.tick_params(axis='y', labelcolor='tab:blue') # defines loss subplot
+
+ ax2 = ax1.twinx()
+ ax2.set_ylabel('Validation Accuracy', color='tab:orange')
+ ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy')
+ ax2.tick_params(axis='y', labelcolor='tab:orange') # defines accuracy subplot
+
+ plt.title('Training Loss and Validation Accuracy over Epochs')
+
+ result_path = 'results/experiment-2-' + activation_type + '.png' # defines the file name
+ fig.savefig(result_path)
+ print(f"Graph saved to: {result_path}")
+
+ def predict(self, x, alpha=0): # predicts class labels for the input data
+ probs = self.forward(x, alpha) # forwards pass to get probabilities
+ return np.argmax(probs, axis=1) # returns the class with highest probability
+
+# acquiring the FashionMNIST dataset
+train_set = datasets.FashionMNIST(root='.', train=True, download=True)
+test_set = datasets.FashionMNIST(root='.', train=False, download=True)
+
+# preprocessing the data by flattening images and normalizing them.
+x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0
+y_train = train_set.targets.numpy()
+
+x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0
+y_test = test_set.targets.numpy()
+
+# MLP initialization (tanh instead of ReLU)
+mlp_tanh = MLP_leaky_tanh(
+ input_size=28 * 28,
+ hidden_size1=256,
+ hidden_size2=256,
+ output_size=10,
+ weight_scale=1e-2,
+ activation_type='tanh'
+)
+
+# trains the model
+mlp_tanh.fit(
+ x_train=x_train,
+ y_train=y_train,
+ x_val=x_test,
+ y_val=y_test,
+ lr=1e-2,
+ epochs=10,
+ batch_size=256,
+ activation_type='tanh'
+)
+
+# tests the model
+test_pred_tanh = mlp_tanh.predict(x_test)
+test_acc_tanh = np.mean(test_pred_tanh == y_test)
+print(f"\nFinal test accuracy: {test_acc_tanh:.4f}")
+
+# MLP initialization (leaky ReLU instead of ReLU)
+mlp_Lrelu = MLP_leaky_tanh(
+ input_size=28 * 28,
+ hidden_size1=256,
+ hidden_size2=256,
+ output_size=10,
+ weight_scale=1e-2,
+ activation_type='leaky-relu'
+)
+alpha = 0.01
+
+# trains the model
+mlp_Lrelu.fit(
+ x_train=x_train,
+ y_train=y_train,
+ x_val=x_test,
+ y_val=y_test,
+ lr=1e-2,
+ epochs=10,
+ batch_size=256,
+ activation_type='leaky-relu',
+ alpha=alpha
+)
+
+# tests the model
+test_pred_Lrelu = mlp_Lrelu.predict(x_test, alpha)
+test_acc_Lrelu = np.mean(test_pred_Lrelu == y_test)
+print(f"\nFinal test accuracy: {test_acc_Lrelu:.4f}")
\ No newline at end of file
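Review note (experiment-2.py): the backward pass multiplies the upstream gradient by activation_deriv, so training correctness hinges on Lrelu_deriv and tanh_deriv matching the true derivatives. A small finite-difference check, assuming only NumPy; alpha mirrors the 0.01 leaky-ReLU slope used above, and x = 0 is avoided since leaky ReLU is not differentiable there:

import numpy as np

alpha = 0.01

def lrelu(x):
    return np.where(x > 0, x, alpha * x)

def lrelu_deriv(x):
    return np.where(x > 0, 1.0, alpha)

def tanh_deriv(x):
    return 1.0 - np.tanh(x) ** 2

x = np.array([-2.5, -1.0, -0.1, 0.1, 1.0, 2.5])
eps = 1e-6

num_lrelu = (lrelu(x + eps) - lrelu(x - eps)) / (2 * eps)
num_tanh = (np.tanh(x + eps) - np.tanh(x - eps)) / (2 * eps)

print(np.max(np.abs(num_lrelu - lrelu_deriv(x))))   # ~0
print(np.max(np.abs(num_tanh - tanh_deriv(x))))     # ~1e-10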
diff --git a/experiment-3-l1.py b/experiment-3-l1.py
index e69de29..7b535b2 100644
--- a/experiment-3-l1.py
+++ b/experiment-3-l1.py
@@ -0,0 +1,186 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from torchvision import datasets
+import os
+
+class MLP:
+ def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1):
+ self.l1 = l1
+
+ # initializes weights and biases for each layer
+ self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
+ self.b1 = np.zeros((1, hidden_size1))
+ self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale
+ self.b2 = np.zeros((1, hidden_size2))
+ self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale
+ self.b3 = np.zeros((1, output_size))
+
+ def forward(self, x):
+        # forward pass through the network
+ self.x = x # input for backpropagation
+ self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1
+ self.a1 = self.relu(self.z1) # ReLU activation
+ self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2
+ self.a2 = self.relu(self.z2) # ReLU activation
+ self.z3 = self.a2 @ self.W3 + self.b3 # linear transformation for layer 3
+ self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities
+ return self.a3 # output of the network
+
+ def backward(self, y, lr):
+        # backward pass for weight updates using gradient descent
+ m = y.shape[0]
+ y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding
+
+ # computes gradients for each layer
+ dz3 = self.a3 - y_one_hot # gradient for output layer
+ dw3 = (self.a2.T @ dz3) / m
+ db3 = np.sum(dz3, axis=0, keepdims=True) / m
+
+ dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # gradient for layer 2
+ dw2 = (self.a1.T @ dz2) / m
+ db2 = np.sum(dz2, axis=0, keepdims=True) / m
+
+ dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # gradient for layer 1
+ dw1 = (self.x.T @ dz1) / m
+ db1 = np.sum(dz1, axis=0, keepdims=True) / m
+
+        # adds the L1 regularization subgradient to each weight gradient
+        dw3 += self.l1 * np.sign(self.W3)
+        dw2 += self.l1 * np.sign(self.W2)
+        dw1 += self.l1 * np.sign(self.W1)
+
+ # updates weights and biases using gradient descent
+ self.W3 -= lr * dw3
+ self.b3 -= lr * db3
+ self.W2 -= lr * dw2
+ self.b2 -= lr * db2
+ self.W1 -= lr * dw1
+ self.b1 -= lr * db1
+
+ @staticmethod
+ def relu(x):
+ # ReLU activation
+ return np.maximum(0, x)
+
+ @staticmethod
+ def relu_deriv(x):
+        # derivative of ReLU activation for backpropagation
+ return (x > 0).astype(float)
+
+ @staticmethod
+ def softmax(x):
+ # softmax function normalizes outputs to probabilities
+ e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # exponentiates inputs
+ return e_x / np.sum(e_x, axis=1, keepdims=True) # normalizes to get probabilities
+
+ @staticmethod
+ def one_hot_encode(y, num_classes):
+ # converts labels to one-hot encoded format
+ return np.eye(num_classes)[y]
+
+ @staticmethod
+ def cross_entropy_loss(y, y_hat):
+ # computes cross-entropy loss between true labels and predicted probabilities
+        m = y.shape[0]
+ eps = 1e-12
+ y_hat_clipped = np.clip(y_hat, eps, 1. - eps)
+ log_probs = -np.log(y_hat_clipped[np.arange(m), y])
+ return np.mean(log_probs)
+
+ def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size):
+ train_losses = []
+ val_accuracies = []
+
+ for epoch in range(1, epochs + 1):
+ perm = np.random.permutation(x_train.shape[0]) # Shuffle the training data
+ x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm]
+
+ epoch_loss = 0.0
+ num_batches = int(np.ceil(x_train.shape[0] / batch_size))
+
+ for i in range(num_batches):
+ start = i * batch_size
+ end = start + batch_size
+ x_batch = x_train_shuffled[start:end] # batch of inputs
+ y_batch = y_train_shuffled[start:end] # batch of labels
+
+ # Forward pass, backward pass, and weight update
+ self.forward(x_batch)
+ self.backward(y_batch, lr)
+
+ epoch_loss += self.cross_entropy_loss(y_batch, self.a3) # updating the epoch loss
+
+            epoch_loss /= num_batches # average loss over the epoch
+            train_losses.append(epoch_loss)
+
+            val_pred = self.predict(x_val)
+            val_acc = np.mean(val_pred == y_val)
+            val_accuracies.append(val_acc)
+
+            print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Validation Accuracy: {val_acc:.4f}")
+
+ self.plot_graph(train_losses, val_accuracies)
+ return val_accuracies[-1]
+
+ def plot_graph(self, train_losses, val_accuracies):
+ if not os.path.exists('results'):
+            os.makedirs('results') # creates results directory
+
+ fig, ax1 = plt.subplots() # initializes the plot
+
+ ax1.set_xlabel('Epochs')
+ ax1.set_ylabel('Training Loss', color='tab:blue')
+ ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss')
+ ax1.tick_params(axis='y', labelcolor='tab:blue') # defines loss subplot
+
+ ax2 = ax1.twinx()
+ ax2.set_ylabel('Validation Accuracy', color='tab:orange')
+ ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy')
+ ax2.tick_params(axis='y', labelcolor='tab:orange') # defines accuracy subplot
+
+ plt.title('Training Loss and Validation Accuracy over Epochs')
+
+ result_path = 'results/experiment-3-l1.png' # defines the file name
+ fig.savefig(result_path)
+ print(f"Graph saved to: {result_path}")
+
+ def predict(self, x): # predicts class labels for the input data
+ probs = self.forward(x) # forwards pass to get probabilities
+ return np.argmax(probs, axis=1) # returns the class with highest probability
+
+# acquiring the FashionMNIST dataset
+train_set = datasets.FashionMNIST(root='.', train=True, download=True)
+test_set = datasets.FashionMNIST(root='.', train=False, download=True)
+
+# preprocessing the data by flattening images and normalizing them.
+x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0
+y_train = train_set.targets.numpy()
+
+x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0
+y_test = test_set.targets.numpy()
+
+# MLP Initialization
+mlp = MLP(
+ input_size=28 * 28,
+ hidden_size1=256,
+ hidden_size2=256,
+ output_size=10,
+ weight_scale=1e-2,
+    l1=1e-6,
+)
+
+# trains the model
+mlp.fit(
+ x_train=x_train,
+ y_train=y_train,
+ x_val=x_test,
+ y_val=y_test,
+ lr=1e-2,
+ epochs=10,
+ batch_size=256
+)
+
+# tests the model
+test_pred = mlp.predict(x_test)
+test_acc = np.mean(test_pred == y_test)
+print(f"\nFinal test accuracy: {test_acc:.4f}")
\ No newline at end of file
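Review note (experiment-3-l1.py): backward() adds the L1 subgradient l1 * np.sign(W) to each weight gradient, which corresponds to penalizing l1 * sum(|W|), but the printed training loss is plain cross-entropy. If the monitored loss should reflect the full penalized objective, a minimal sketch using the same attribute names (W1, W2, W3, l1); the l1_penalty helper is a suggestion, not part of the patch:

import numpy as np

def l1_penalty(model):
    # l1 * sum of absolute weights, matching the l1 * sign(W) gradient term above
    return model.l1 * (np.abs(model.W1).sum()
                       + np.abs(model.W2).sum()
                       + np.abs(model.W3).sum())

# inside fit(), the batch loss could then be accumulated as:
# epoch_loss += self.cross_entropy_loss(y_batch, self.a3) + l1_penalty(self)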
diff --git a/experiment-3-l2.py b/experiment-3-l2.py
index e69de29..7805470 100644
--- a/experiment-3-l2.py
+++ b/experiment-3-l2.py
@@ -0,0 +1,187 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from torchvision import datasets
+import os
+
+
+class MLP:
+ def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l2):
+ self.l2 = l2
+
+ # initializes weights and biases for each layer
+ self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
+ self.b1 = np.zeros((1, hidden_size1))
+ self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale
+ self.b2 = np.zeros((1, hidden_size2))
+ self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale
+ self.b3 = np.zeros((1, output_size))
+
+ def forward(self, x):
+        # forward pass through the network
+ self.x = x # input for backpropagation
+ self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1
+ self.a1 = self.relu(self.z1) # ReLU activation
+ self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2
+ self.a2 = self.relu(self.z2) # ReLU activation
+ self.z3 = self.a2 @ self.W3 + self.b3 # linear transformation for layer 3
+ self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities
+ return self.a3 # output of the network
+
+ def backward(self, y, lr):
+        # backward pass for weight updates using gradient descent
+ m = y.shape[0]
+ y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding
+
+ # computes gradients for each layer
+ dz3 = self.a3 - y_one_hot # gradient for output layer
+ dw3 = (self.a2.T @ dz3) / m
+ db3 = np.sum(dz3, axis=0, keepdims=True) / m
+
+ dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # gradient for layer 2
+ dw2 = (self.a1.T @ dz2) / m
+ db2 = np.sum(dz2, axis=0, keepdims=True) / m
+
+ dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # gradient for layer 1
+ dw1 = (self.x.T @ dz1) / m
+ db1 = np.sum(dz1, axis=0, keepdims=True) / m
+
+        # adds the L2 regularization (weight decay) gradient to each weight gradient
+        dw3 += self.l2 * self.W3
+        dw2 += self.l2 * self.W2
+        dw1 += self.l2 * self.W1
+
+ # updates weights and biases using gradient descent
+ self.W3 -= lr * dw3
+ self.b3 -= lr * db3
+ self.W2 -= lr * dw2
+ self.b2 -= lr * db2
+ self.W1 -= lr * dw1
+ self.b1 -= lr * db1
+
+ @staticmethod
+ def relu(x):
+ # ReLU activation
+ return np.maximum(0, x)
+
+ @staticmethod
+ def relu_deriv(x):
+        # derivative of ReLU activation for backpropagation
+ return (x > 0).astype(float)
+
+ @staticmethod
+ def softmax(x):
+ # softmax function normalizes outputs to probabilities
+ e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # exponentiates inputs
+ return e_x / np.sum(e_x, axis=1, keepdims=True) # normalizes to get probabilities
+
+ @staticmethod
+ def one_hot_encode(y, num_classes):
+ # converts labels to one-hot encoded format
+ return np.eye(num_classes)[y]
+
+ @staticmethod
+ def cross_entropy_loss(y, y_hat):
+ # computes cross-entropy loss between true labels and predicted probabilities
+        m = y.shape[0]
+ eps = 1e-12
+ y_hat_clipped = np.clip(y_hat, eps, 1. - eps)
+ log_probs = -np.log(y_hat_clipped[np.arange(m), y])
+ return np.mean(log_probs)
+
+ def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size):
+ train_losses = []
+ val_accuracies = []
+
+ for epoch in range(1, epochs + 1):
+ perm = np.random.permutation(x_train.shape[0]) # Shuffle the training data
+ x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm]
+
+ epoch_loss = 0.0
+ num_batches = int(np.ceil(x_train.shape[0] / batch_size))
+
+ for i in range(num_batches):
+ start = i * batch_size
+ end = start + batch_size
+ x_batch = x_train_shuffled[start:end] # batch of inputs
+ y_batch = y_train_shuffled[start:end] # batch of labels
+
+ # Forward pass, backward pass, and weight update
+ self.forward(x_batch)
+ self.backward(y_batch, lr)
+
+ epoch_loss += self.cross_entropy_loss(y_batch, self.a3) # updating the epoch loss
+
+            epoch_loss /= num_batches # average loss over the epoch
+            train_losses.append(epoch_loss)
+
+            val_pred = self.predict(x_val)
+            val_acc = np.mean(val_pred == y_val)
+            val_accuracies.append(val_acc)
+
+            print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Validation Accuracy: {val_acc:.4f}")
+
+ self.plot_graph(train_losses, val_accuracies)
+ return val_accuracies[-1]
+
+ def plot_graph(self, train_losses, val_accuracies):
+ if not os.path.exists('results'):
+            os.makedirs('results') # creates results directory
+
+ fig, ax1 = plt.subplots() # initializes the plot
+
+ ax1.set_xlabel('Epochs')
+ ax1.set_ylabel('Training Loss', color='tab:blue')
+ ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss')
+ ax1.tick_params(axis='y', labelcolor='tab:blue') # defines loss subplot
+
+ ax2 = ax1.twinx()
+ ax2.set_ylabel('Validation Accuracy', color='tab:orange')
+ ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy')
+ ax2.tick_params(axis='y', labelcolor='tab:orange') # defines accuracy subplot
+
+ plt.title('Training Loss and Validation Accuracy over Epochs')
+
+ result_path = 'results/experiment-3-l2.png' # defines the file name
+ fig.savefig(result_path)
+ print(f"Graph saved to: {result_path}")
+
+ def predict(self, x): # predicts class labels for the input data
+ probs = self.forward(x) # forwards pass to get probabilities
+ return np.argmax(probs, axis=1) # returns the class with highest probability
+
+# acquiring the FashionMNIST dataset
+train_set = datasets.FashionMNIST(root='.', train=True, download=True)
+test_set = datasets.FashionMNIST(root='.', train=False, download=True)
+
+# preprocessing the data by flattening images and normalizing them.
+x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0
+y_train = train_set.targets.numpy()
+
+x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0
+y_test = test_set.targets.numpy()
+
+# MLP Initialization
+mlp = MLP(
+ input_size=28 * 28,
+ hidden_size1=256,
+ hidden_size2=256,
+ output_size=10,
+ weight_scale=1e-2,
+    l2=1e-4
+)
+
+# trains the model
+mlp.fit(
+ x_train=x_train,
+ y_train=y_train,
+ x_val=x_test,
+ y_val=y_test,
+ lr=1e-2,
+ epochs=10,
+ batch_size=256
+)
+
+# tests the model
+test_pred = mlp.predict(x_test)
+test_acc = np.mean(test_pred == y_test)
+print(f"\nFinal test accuracy: {test_acc:.4f}")
\ No newline at end of file
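Review note (experiment-3-l2.py): the update dw += self.l2 * self.W is the gradient of a penalty of (l2 / 2) * sum(W ** 2), not l2 * sum(W ** 2), which matters when comparing this l2 value with weight-decay settings elsewhere. A quick finite-difference check of that factor, assuming only NumPy:

import numpy as np

l2 = 1e-4
rng = np.random.default_rng(0)
W = rng.normal(size=(3, 3))

def penalty(W):
    # the penalty implied by a gradient of l2 * W
    return 0.5 * l2 * np.sum(W ** 2)

eps = 1e-6
i, j = 1, 2
Wp, Wm = W.copy(), W.copy()
Wp[i, j] += eps
Wm[i, j] -= eps

numeric = (penalty(Wp) - penalty(Wm)) / (2 * eps)
print(numeric, l2 * W[i, j])    # the two values should agree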
diff --git a/experiment-4.py b/experiment-4.py
index e69de29..01852fd 100644
--- a/experiment-4.py
+++ b/experiment-4.py
@@ -0,0 +1,192 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from torchvision import datasets
+import os
+
+
+class MLP:
+ def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1, l2):
+ self.l1 = l1
+ self.l2 = l2
+
+ # initializes weights and biases for each layer
+ self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
+ self.b1 = np.zeros((1, hidden_size1))
+ self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale
+ self.b2 = np.zeros((1, hidden_size2))
+ self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale
+ self.b3 = np.zeros((1, output_size))
+
+ def forward(self, x):
+        # forward pass through the network
+ self.x = x # input for backpropagation
+ self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1
+ self.a1 = self.relu(self.z1) # ReLU activation
+ self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2
+ self.a2 = self.relu(self.z2) # ReLU activation
+ self.z3 = self.a2 @ self.W3 + self.b3 # linear transformation for layer 3
+ self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities
+ return self.a3 # output of the network
+
+ def backward(self, y, lr):
+        # backward pass for weight updates using gradient descent
+ m = y.shape[0]
+ y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding
+
+ # computes gradients for each layer
+ dz3 = self.a3 - y_one_hot # gradient for output layer
+ dw3 = (self.a2.T @ dz3) / m
+ db3 = np.sum(dz3, axis=0, keepdims=True) / m
+
+ dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # gradient for layer 2
+ dw2 = (self.a1.T @ dz2) / m
+ db2 = np.sum(dz2, axis=0, keepdims=True) / m
+
+ dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # gradient for layer 1
+ dw1 = (self.x.T @ dz1) / m
+ db1 = np.sum(dz1, axis=0, keepdims=True) / m
+
+        # adds the L2 (weight decay) and L1 (sparsity) regularization gradients
+        dw3 += self.l2 * self.W3
+        dw2 += self.l2 * self.W2
+        dw1 += self.l2 * self.W1
+        dw3 += self.l1 * np.sign(self.W3)
+        dw2 += self.l1 * np.sign(self.W2)
+        dw1 += self.l1 * np.sign(self.W1)
+
+ # updates weights and biases using gradient descent
+ self.W3 -= lr * dw3
+ self.b3 -= lr * db3
+ self.W2 -= lr * dw2
+ self.b2 -= lr * db2
+ self.W1 -= lr * dw1
+ self.b1 -= lr * db1
+
+ @staticmethod
+ def relu(x):
+ # ReLU activation
+ return np.maximum(0, x)
+
+ @staticmethod
+ def relu_deriv(x):
+        # derivative of ReLU activation for backpropagation
+ return (x > 0).astype(float)
+
+ @staticmethod
+ def softmax(x):
+ # softmax function normalizes outputs to probabilities
+ e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # exponentiates inputs
+ return e_x / np.sum(e_x, axis=1, keepdims=True) # normalizes to get probabilities
+
+ @staticmethod
+ def one_hot_encode(y, num_classes):
+ # converts labels to one-hot encoded format
+ return np.eye(num_classes)[y]
+
+ @staticmethod
+ def cross_entropy_loss(y, y_hat):
+ # computes cross-entropy loss between true labels and predicted probabilities
+        m = y.shape[0]
+ eps = 1e-12
+ y_hat_clipped = np.clip(y_hat, eps, 1. - eps)
+ log_probs = -np.log(y_hat_clipped[np.arange(m), y])
+ return np.mean(log_probs)
+
+ def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size):
+ train_losses = []
+ val_accuracies = []
+
+ for epoch in range(1, epochs + 1):
+ perm = np.random.permutation(x_train.shape[0]) # Shuffle the training data
+ x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm]
+
+ epoch_loss = 0.0
+ num_batches = int(np.ceil(x_train.shape[0] / batch_size))
+
+ for i in range(num_batches):
+ start = i * batch_size
+ end = start + batch_size
+ x_batch = x_train_shuffled[start:end] # batch of inputs
+ y_batch = y_train_shuffled[start:end] # batch of labels
+
+ # Forward pass, backward pass, and weight update
+ self.forward(x_batch)
+ self.backward(y_batch, lr)
+
+ epoch_loss += self.cross_entropy_loss(y_batch, self.a3) # updating the epoch loss
+
+            epoch_loss /= num_batches # average loss over the epoch
+            train_losses.append(epoch_loss)
+
+            val_pred = self.predict(x_val)
+            val_acc = np.mean(val_pred == y_val)
+            val_accuracies.append(val_acc)
+
+            print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Validation Accuracy: {val_acc:.4f}")
+
+ self.plot_graph(train_losses, val_accuracies)
+ return val_accuracies[-1]
+
+ def plot_graph(self, train_losses, val_accuracies):
+ if not os.path.exists('results'):
+            os.makedirs('results') # creates results directory
+
+ fig, ax1 = plt.subplots() # initializes the plot
+
+ ax1.set_xlabel('Epochs')
+ ax1.set_ylabel('Training Loss', color='tab:blue')
+ ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss')
+ ax1.tick_params(axis='y', labelcolor='tab:blue') # defines loss subplot
+
+ ax2 = ax1.twinx()
+ ax2.set_ylabel('Validation Accuracy', color='tab:orange')
+ ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy')
+ ax2.tick_params(axis='y', labelcolor='tab:orange') # defines accuracy subplot
+
+ plt.title('Training Loss and Validation Accuracy over Epochs')
+
+ result_path = 'results/experiment-4.png' # defines the file name
+ fig.savefig(result_path)
+ print(f"Graph saved to: {result_path}")
+
+ def predict(self, x): # predicts class labels for the input data
+ probs = self.forward(x) # forwards pass to get probabilities
+ return np.argmax(probs, axis=1) # returns the class with highest probability
+
+# acquiring the FashionMNIST dataset
+train_set = datasets.FashionMNIST(root='.', train=True, download=True)
+test_set = datasets.FashionMNIST(root='.', train=False, download=True)
+
+# preprocessing the data by flattening images without normalizing them.
+x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32)
+y_train = train_set.targets.numpy()
+
+x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32)
+y_test = test_set.targets.numpy()
+
+# MLP Initialization
+mlp = MLP(
+ input_size=28 * 28,
+ hidden_size1=256,
+ hidden_size2=256,
+ output_size=10,
+ weight_scale=1e-2,
+    l1=1e-6,
+    l2=1e-4
+)
+
+# trains the model
+mlp.fit(
+ x_train=x_train,
+ y_train=y_train,
+ x_val=x_test,
+ y_val=y_test,
+ lr=1e-2,
+ epochs=10,
+ batch_size=256
+)
+
+# tests the model
+test_pred = mlp.predict(x_test)
+test_acc = np.mean(test_pred == y_test)
+print(f"\nFinal test accuracy: {test_acc:.4f}")
\ No newline at end of file
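Review note (experiment-4.py): unlike the other scripts, the /255 normalization is deliberately skipped, so the first-layer pre-activations start out far larger and the fixed lr=1e-2 behaves very differently. A small illustration of that scale gap, assuming only NumPy; the shapes mirror the 784 -> 256 first layer and the 1e-2 weight_scale above:

import numpy as np

rng = np.random.default_rng(0)
W1 = rng.normal(size=(784, 256)) * 1e-2                             # weight_scale from the diff
x_raw = rng.integers(0, 256, size=(64, 784)).astype(np.float32)     # unnormalized pixel values
x_norm = x_raw / 255.0                                              # the scaling used elsewhere

print(np.abs(x_raw @ W1).mean())    # typically a few tens
print(np.abs(x_norm @ W1).mean())   # typically around 0.1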
diff --git a/experiment-5.py b/experiment-5.py
index e69de29..8d1bf9b 100644
--- a/experiment-5.py
+++ b/experiment-5.py
@@ -0,0 +1,197 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from torchvision import datasets
+from torchvision import transforms
+import os
+
+
+
+class MLP:
+ def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1, l2):
+
+ self.l1 = l1
+ self.l2 = l2
+
+
+ # initializes weights and biases for each layer
+ self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
+ self.b1 = np.zeros((1, hidden_size1))
+ self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale
+ self.b2 = np.zeros((1, hidden_size2))
+ self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale
+ self.b3 = np.zeros((1, output_size))
+
+ def forward(self, x):
+        # forward pass through the network
+ self.x = x # input for backpropagation
+ self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1
+ self.a1 = self.relu(self.z1) # ReLU activation
+ self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2
+ self.a2 = self.relu(self.z2) # ReLU activation
+ self.z3 = self.a2 @ self.W3 + self.b3 # linear transformation for layer 3
+ self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities
+ return self.a3 # output of the network
+
+ def backward(self, y, lr):
+        # backward pass for weight updates using gradient descent
+ m = y.shape[0]
+ y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding
+
+ # computes gradients for each layer
+ dz3 = self.a3 - y_one_hot # gradient for output layer
+ dw3 = (self.a2.T @ dz3) / m
+ db3 = np.sum(dz3, axis=0, keepdims=True) / m
+
+ dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # gradient for layer 2
+ dw2 = (self.a1.T @ dz2) / m
+ db2 = np.sum(dz2, axis=0, keepdims=True) / m
+
+ dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # gradient for layer 1
+ dw1 = (self.x.T @ dz1) / m
+ db1 = np.sum(dz1, axis=0, keepdims=True) / m
+
+        # adds the L2 (weight decay) and L1 (sparsity) regularization gradients
+        dw3 += self.l2 * self.W3
+        dw2 += self.l2 * self.W2
+        dw1 += self.l2 * self.W1
+        dw3 += self.l1 * np.sign(self.W3)
+        dw2 += self.l1 * np.sign(self.W2)
+        dw1 += self.l1 * np.sign(self.W1)
+
+ # updates weights and biases using gradient descent
+ self.W3 -= lr * dw3
+ self.b3 -= lr * db3
+ self.W2 -= lr * dw2
+ self.b2 -= lr * db2
+ self.W1 -= lr * dw1
+ self.b1 -= lr * db1
+
+ @staticmethod
+ def relu(x):
+ # ReLU activation
+ return np.maximum(0, x)
+
+ @staticmethod
+ def relu_deriv(x):
+        # derivative of ReLU activation for backpropagation
+ return (x > 0).astype(float)
+
+ @staticmethod
+ def softmax(x):
+ # softmax function normalizes outputs to probabilities
+ e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # exponentiates inputs
+ return e_x / np.sum(e_x, axis=1, keepdims=True) # normalizes to get probabilities
+
+ @staticmethod
+ def one_hot_encode(y, num_classes):
+ # converts labels to one-hot encoded format
+ return np.eye(num_classes)[y]
+
+ @staticmethod
+ def cross_entropy_loss(y, y_hat):
+ # computes cross-entropy loss between true labels and predicted probabilities
+        m = y.shape[0]
+ eps = 1e-12
+ y_hat_clipped = np.clip(y_hat, eps, 1. - eps)
+ log_probs = -np.log(y_hat_clipped[np.arange(m), y])
+ return np.mean(log_probs)
+
+ def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size):
+ train_losses = []
+ val_accuracies = []
+
+ for epoch in range(1, epochs + 1):
+ perm = np.random.permutation(x_train.shape[0]) # Shuffle the training data
+ x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm]
+
+ epoch_loss = 0.0
+ num_batches = int(np.ceil(x_train.shape[0] / batch_size))
+
+ for i in range(num_batches):
+ start = i * batch_size
+ end = start + batch_size
+ x_batch = x_train_shuffled[start:end] # batch of inputs
+ y_batch = y_train_shuffled[start:end] # batch of labels
+
+ # Forward pass, backward pass, and weight update
+ self.forward(x_batch)
+ self.backward(y_batch, lr)
+
+ epoch_loss += self.cross_entropy_loss(y_batch, self.a3) # updating the epoch loss
+
+            epoch_loss /= num_batches # average loss over the epoch
+            train_losses.append(epoch_loss)
+
+            val_pred = self.predict(x_val)
+            val_acc = np.mean(val_pred == y_val)
+            val_accuracies.append(val_acc)
+
+            print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Validation Accuracy: {val_acc:.4f}")
+
+ self.plot_graph(train_losses, val_accuracies)
+ return val_accuracies[-1]
+
+ def plot_graph(self, train_losses, val_accuracies):
+ if not os.path.exists('results'):
+            os.makedirs('results') # creates results directory
+
+ fig, ax1 = plt.subplots() # initializes the plot
+
+ ax1.set_xlabel('Epochs')
+ ax1.set_ylabel('Training Loss', color='tab:blue')
+ ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss')
+ ax1.tick_params(axis='y', labelcolor='tab:blue') # defines loss subplot
+
+ ax2 = ax1.twinx()
+ ax2.set_ylabel('Validation Accuracy', color='tab:orange')
+ ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy')
+ ax2.tick_params(axis='y', labelcolor='tab:orange') # defines accuracy subplot
+
+ plt.title('Training Loss and Validation Accuracy over Epochs')
+
+ result_path = 'results/experiment-5.png' # defines the file name
+ fig.savefig(result_path)
+ print(f"Graph saved to: {result_path}")
+
+ def predict(self, x): # predicts class labels for the input data
+ probs = self.forward(x) # forwards pass to get probabilities
+ return np.argmax(probs, axis=1) # returns the class with highest probability
+
+# acquiring the FashionMNIST dataset
+transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
+train_set = datasets.FashionMNIST(root='.', train=True, download=True, transform = transform)
+test_set = datasets.FashionMNIST(root='.', train=False, download=True, transform = transform)
+
+# preprocessing the data by flattening images; note that accessing .data bypasses the torchvision transform, so these arrays keep the raw 0-255 pixel values.
+x_train = train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32)
+y_train = train_set.targets.numpy()
+
+x_test = test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32)
+y_test = test_set.targets.numpy()
+
+# MLP Initialization
+mlp = MLP(
+ input_size=28 * 28,
+ hidden_size1=256,
+ hidden_size2=256,
+ output_size=10,
+ weight_scale=1e-2,
+    l1=1e-6,
+    l2=1e-4
+)
+
+# trains the model
+mlp.fit(
+ x_train=x_train,
+ y_train=y_train,
+ x_val=x_test,
+ y_val=y_test,
+ lr=1e-2,
+ epochs=10,
+ batch_size=256
+)
+
+# tests the model
+test_pred = mlp.predict(x_test)
+test_acc = np.mean(test_pred == y_test)
+print(f"\nFinal test accuracy: {test_acc:.4f}")
\ No newline at end of file
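Review note (experiment-5.py): the Compose(ToTensor, Normalize((0.5,), (0.5,))) transform only runs when samples are fetched through the dataset (e.g. train_set[0]); reading train_set.data directly bypasses it, so x_train and x_test still hold raw 0-255 values. If the intent is to train on the normalized data, a minimal sketch of applying the equivalent scaling to the NumPy arrays built above (a suggestion, not part of the patch):

# ToTensor scales pixels from 0-255 to 0-1; Normalize((0.5,), (0.5,)) then maps 0-1 to the range -1..1
x_train = (train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0 - 0.5) / 0.5
x_test = (test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0 - 0.5) / 0.5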
diff --git a/experiment-2-leaky-relu.py b/experiment-6-convolutional-neural-network.py
similarity index 100%
rename from experiment-2-leaky-relu.py
rename to experiment-6-convolutional-neural-network.py
diff --git a/experiment-6.py b/experiment-6.py
deleted file mode 100644
index e69de29..0000000
diff --git a/results/MLP-output.png b/results/MLP-output.png
index 6536afd..6eb97dd 100644
Binary files a/results/MLP-output.png and b/results/MLP-output.png differ
diff --git a/results/experiment-1-1.png b/results/experiment-1-1.png
index 4e6c4c4..35e26dc 100644
Binary files a/results/experiment-1-1.png and b/results/experiment-1-1.png differ
diff --git a/results/experiment-1-2.png b/results/experiment-1-2.png
index e4e8597..486de1e 100644
Binary files a/results/experiment-1-2.png and b/results/experiment-1-2.png differ
diff --git a/results/experiment-1-3.png b/results/experiment-1-3.png
index d47730b..4558c73 100644
Binary files a/results/experiment-1-3.png and b/results/experiment-1-3.png differ
diff --git a/results/experiment-2-leaky-relu.png b/results/experiment-2-leaky-relu.png
new file mode 100644
index 0000000..2eac036
Binary files /dev/null and b/results/experiment-2-leaky-relu.png differ
diff --git a/results/experiment-2-tanh.png b/results/experiment-2-tanh.png
new file mode 100644
index 0000000..9899abc
Binary files /dev/null and b/results/experiment-2-tanh.png differ
diff --git a/results/experiment-3-l1.png b/results/experiment-3-l1.png
new file mode 100644
index 0000000..056b09c
Binary files /dev/null and b/results/experiment-3-l1.png differ
diff --git a/results/experiment-3-l2.png b/results/experiment-3-l2.png
new file mode 100644
index 0000000..eb1931f
Binary files /dev/null and b/results/experiment-3-l2.png differ
diff --git a/results/experiment-4.png b/results/experiment-4.png
new file mode 100644
index 0000000..b30c60d
Binary files /dev/null and b/results/experiment-4.png differ
diff --git a/results/experiment-5.png b/results/experiment-5.png
new file mode 100644
index 0000000..6fdf148
Binary files /dev/null and b/results/experiment-5.png differ