Added experiments until 5.
52
.idea/workspace.xml
generated
|
|
@ -5,8 +5,26 @@
|
||||||
</component>
|
</component>
|
||||||
<component name="ChangeListManager">
|
<component name="ChangeListManager">
|
||||||
<list default="true" id="53d2c8fc-09f6-4596-950a-66eac2662d99" name="Changes" comment="">
|
<list default="true" id="53d2c8fc-09f6-4596-950a-66eac2662d99" name="Changes" comment="">
|
||||||
|
<change afterPath="$PROJECT_DIR$/experiment-2.py" afterDir="false" />
|
||||||
|
<change afterPath="$PROJECT_DIR$/results/experiment-2-leaky-relu.png" afterDir="false" />
|
||||||
|
<change afterPath="$PROJECT_DIR$/results/experiment-2-tanh.png" afterDir="false" />
|
||||||
|
<change afterPath="$PROJECT_DIR$/results/experiment-3-l1.png" afterDir="false" />
|
||||||
|
<change afterPath="$PROJECT_DIR$/results/experiment-3-l2.png" afterDir="false" />
|
||||||
|
<change afterPath="$PROJECT_DIR$/results/experiment-4.png" afterDir="false" />
|
||||||
|
<change afterPath="$PROJECT_DIR$/results/experiment-5.png" afterDir="false" />
|
||||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||||
<change beforePath="$PROJECT_DIR$/multilayer-perceptron.py" beforeDir="false" afterPath="$PROJECT_DIR$/multilayer-perceptron.py" afterDir="false" />
|
<change beforePath="$PROJECT_DIR$/experiment-1.py" beforeDir="false" afterPath="$PROJECT_DIR$/experiment-1.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/experiment-2-leaky-relu.py" beforeDir="false" afterPath="$PROJECT_DIR$/convolutional-neural-network-experiment-6.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/experiment-2-tanh.py" beforeDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/experiment-3-l1.py" beforeDir="false" afterPath="$PROJECT_DIR$/experiment-3-l1.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/experiment-3-l2.py" beforeDir="false" afterPath="$PROJECT_DIR$/experiment-3-l2.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/experiment-4.py" beforeDir="false" afterPath="$PROJECT_DIR$/experiment-4.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/experiment-5.py" beforeDir="false" afterPath="$PROJECT_DIR$/experiment-5.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/experiment-6.py" beforeDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/results/MLP-output.png" beforeDir="false" afterPath="$PROJECT_DIR$/results/MLP-output.png" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/results/experiment-1-1.png" beforeDir="false" afterPath="$PROJECT_DIR$/results/experiment-1-1.png" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/results/experiment-1-2.png" beforeDir="false" afterPath="$PROJECT_DIR$/results/experiment-1-2.png" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/results/experiment-1-3.png" beforeDir="false" afterPath="$PROJECT_DIR$/results/experiment-1-3.png" afterDir="false" />
|
||||||
</list>
|
</list>
|
||||||
<option name="SHOW_DIALOG" value="false" />
|
<option name="SHOW_DIALOG" value="false" />
|
||||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||||
|
|
@ -31,18 +49,28 @@
|
||||||
<option name="hideEmptyMiddlePackages" value="true" />
|
<option name="hideEmptyMiddlePackages" value="true" />
|
||||||
<option name="showLibraryContents" value="true" />
|
<option name="showLibraryContents" value="true" />
|
||||||
</component>
|
</component>
|
||||||
<component name="PropertiesComponent">{
|
<component name="PropertiesComponent"><![CDATA[{
|
||||||
"keyToString": {
|
"keyToString": {
|
||||||
"ModuleVcsDetector.initialDetectionPerformed": "true",
|
"ModuleVcsDetector.initialDetectionPerformed": "true",
|
||||||
"Python.Unnamed.executor": "Run",
|
"Python.Unnamed.executor": "Run",
|
||||||
"Python.multilayer-perceptron.executor": "Run",
|
"Python.experiment-1-1.executor": "Run",
|
||||||
"RunOnceActivity.ShowReadmeOnStart": "true",
|
"Python.experiment-1-2.executor": "Run",
|
||||||
"RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager.252": "true",
|
"Python.experiment-1.executor": "Run",
|
||||||
"RunOnceActivity.git.unshallow": "true",
|
"Python.experiment-2-leaky-relu.executor": "Run",
|
||||||
"git-widget-placeholder": "master",
|
"Python.experiment-2-tanh.executor": "Run",
|
||||||
"last_opened_file_path": "/home/arctichawk1/Desktop/Projects/Private/Classification-of-Image-Data-with-MLP-and-CNN"
|
"Python.experiment-2.executor": "Run",
|
||||||
|
"Python.experiment-3-l1.executor": "Run",
|
||||||
|
"Python.experiment-3-l2.executor": "Run",
|
||||||
|
"Python.experiment-4.executor": "Run",
|
||||||
|
"Python.experiment-5.executor": "Run",
|
||||||
|
"Python.multilayer-perceptron.executor": "Run",
|
||||||
|
"RunOnceActivity.ShowReadmeOnStart": "true",
|
||||||
|
"RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager.252": "true",
|
||||||
|
"RunOnceActivity.git.unshallow": "true",
|
||||||
|
"git-widget-placeholder": "master",
|
||||||
|
"last_opened_file_path": "/home/arctichawk1/Desktop/Projects/Private/Classification-of-Image-Data-with-MLP-and-CNN"
|
||||||
}
|
}
|
||||||
}</component>
|
}]]></component>
|
||||||
<component name="SharedIndexes">
|
<component name="SharedIndexes">
|
||||||
<attachedChunks>
|
<attachedChunks>
|
||||||
<set>
|
<set>
|
||||||
|
|
|
||||||
|
|
@ -22,43 +22,43 @@ class MLP:
|
||||||
self.b3 = np.zeros((1, output_size))
|
self.b3 = np.zeros((1, output_size))
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
# Forward pass through the network
|
# forwards pass through the network
|
||||||
self.x = x # input for backpropagation
|
self.x = x # input for backpropagation
|
||||||
self.z1 = x @ self.W1 + self.b1 # Linear transformation for first layer
|
self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1
|
||||||
self.a1 = self.relu(self.z1) # ReLU activation
|
self.a1 = self.relu(self.z1) # ReLU activation
|
||||||
|
|
||||||
if self.has_hidden_layer2:
|
if self.has_hidden_layer2:
|
||||||
self.z2 = self.a1 @ self.W2 + self.b2 # Linear transformation for second layer
|
self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2
|
||||||
self.a2 = self.relu(self.z2) # ReLU activation
|
self.a2 = self.relu(self.z2) # ReLU activation
|
||||||
self.z3 = self.a2 @ self.W3 + self.b3 # Linear transformation for output layer
|
self.z3 = self.a2 @ self.W3 + self.b3 # Linear transformation for output layer
|
||||||
else:
|
else:
|
||||||
self.z3 = self.a1 @ self.W3 + self.b3 # No second layer, directly to output
|
self.z3 = self.a1 @ self.W3 + self.b3 # No second layer, directly to output
|
||||||
|
|
||||||
self.a3 = self.softmax(self.z3) # Softmax to get class probabilities
|
self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities
|
||||||
return self.a3
|
return self.a3
|
||||||
|
|
||||||
def backward(self, y, lr):
|
def backward(self, y, lr):
|
||||||
# Backward pass for weight updates using gradient descent
|
# backwards pass for weight updates using gradient descent
|
||||||
m = y.shape[0]
|
m = y.shape[0]
|
||||||
y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # Converts labels to one-hot encoding
|
y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding
|
||||||
|
|
||||||
# Gradient for output layer
|
# computes gradients for each layer
|
||||||
dz3 = self.a3 - y_one_hot
|
dz3 = self.a3 - y_one_hot # gradient for output layer
|
||||||
dw3 = (self.a2.T if self.has_hidden_layer2 else self.a1.T) @ dz3 / m
|
dw3 = (self.a2.T if self.has_hidden_layer2 else self.a1.T) @ dz3 / m
|
||||||
db3 = np.sum(dz3, axis=0, keepdims=True) / m
|
db3 = np.sum(dz3, axis=0, keepdims=True) / m
|
||||||
|
|
||||||
if self.has_hidden_layer2:
|
if self.has_hidden_layer2:
|
||||||
dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # Gradient for second hidden layer
|
dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # gradient for second hidden layer
|
||||||
dw2 = (self.a1.T @ dz2) / m
|
dw2 = (self.a1.T @ dz2) / m
|
||||||
db2 = np.sum(dz2, axis=0, keepdims=True) / m
|
db2 = np.sum(dz2, axis=0, keepdims=True) / m
|
||||||
dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # Gradient for first hidden layer
|
dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # gradient for one hidden layer
|
||||||
else:
|
else:
|
||||||
dz1 = (dz3 @ self.W3.T) * self.relu_deriv(self.z1) # No second hidden layer
|
dz1 = (dz3 @ self.W3.T) * self.relu_deriv(self.z1) # no second hidden layer
|
||||||
|
|
||||||
dw1 = (self.x.T @ dz1) / m
|
dw1 = (self.x.T @ dz1) / m
|
||||||
db1 = np.sum(dz1, axis=0, keepdims=True) / m
|
db1 = np.sum(dz1, axis=0, keepdims=True) / m
|
||||||
|
|
||||||
# Update weights and biases using gradient descent
|
# updates weights and biases using gradient descent
|
||||||
self.W3 -= lr * dw3
|
self.W3 -= lr * dw3
|
||||||
self.b3 -= lr * db3
|
self.b3 -= lr * db3
|
||||||
if self.has_hidden_layer2:
|
if self.has_hidden_layer2:
|
||||||
|
|
|
||||||
240
experiment-2.py
Normal file
|
|
@ -0,0 +1,240 @@
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from torchvision import datasets
|
||||||
|
import os
|
||||||
|
|
||||||
|
class MLP_leaky_tanh:
    """Two-hidden-layer perceptron with a softmax output, trained by mini-batch gradient descent.

    The hidden-layer non-linearity is selected by ``activation_type`` and must be
    either ``'leaky-relu'`` or ``'tanh'``; any other value raises ``ValueError``
    the first time an activation is evaluated.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, activation_type):
        """Create the three weight matrices and bias rows.

        Weights are drawn from a standard normal distribution and multiplied by
        ``weight_scale``; biases start at zero.
        """
        self.activation_type = activation_type

        # initializes weights and biases for each layer
        self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
        self.b1 = np.zeros((1, hidden_size1))
        self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale
        self.b2 = np.zeros((1, hidden_size2))
        self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale
        self.b3 = np.zeros((1, output_size))

    def forward(self, x, alpha=0):
        """Run a forward pass and return the softmax probabilities.

        Every intermediate (``x``, ``z1``, ``a1``, ``z2``, ``a2``, ``z3``, ``a3``)
        is stored on the instance because ``backward`` reads them.
        ``alpha`` is the negative-side slope and is only used by leaky ReLU.
        """
        self.x = x  # input for backpropagation
        self.z1 = x @ self.W1 + self.b1  # linear transformation for layer 1
        self.a1 = self.activation(self.z1, alpha)  # configured hidden activation
        self.z2 = self.a1 @ self.W2 + self.b2  # linear transformation for layer 2
        self.a2 = self.activation(self.z2, alpha)  # configured hidden activation
        self.z3 = self.a2 @ self.W3 + self.b3  # linear transformation for layer 3
        self.a3 = self.softmax(self.z3)  # applies softmax to get class probabilities
        return self.a3  # output of the network

    def backward(self, y, lr, alpha=0):
        """Backpropagate from the values cached by the last ``forward`` call and update parameters in place.

        ``y`` holds integer class labels for the batch, ``lr`` is the step size,
        and ``alpha`` must match the value used in the forward pass.
        """
        m = y.shape[0]
        y_one_hot = self.one_hot_encode(y, self.W3.shape[1])  # converts labels to one-hot encoding

        # computes gradients for each layer
        dz3 = self.a3 - y_one_hot  # gradient for output layer (softmax + cross-entropy)
        dw3 = (self.a2.T @ dz3) / m
        db3 = np.sum(dz3, axis=0, keepdims=True) / m

        dz2 = (dz3 @ self.W3.T) * self.activation_deriv(self.z2, alpha)  # gradient for layer 2
        dw2 = (self.a1.T @ dz2) / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m

        dz1 = (dz2 @ self.W2.T) * self.activation_deriv(self.z1, alpha)  # gradient for layer 1
        dw1 = (self.x.T @ dz1) / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m

        # updates weights and biases using gradient descent
        self.W3 -= lr * dw3
        self.b3 -= lr * db3
        self.W2 -= lr * dw2
        self.b2 -= lr * db2
        self.W1 -= lr * dw1
        self.b1 -= lr * db1

    def activation(self, x, alpha=0):
        """Apply the activation named by ``self.activation_type``; raise ``ValueError`` if it is unknown."""
        if self.activation_type == 'leaky-relu':
            return self.Lrelu(x, alpha)
        elif self.activation_type == 'tanh':
            return self.tanh(x)
        else:
            raise ValueError("Invalid activation type")

    def activation_deriv(self, x, alpha):
        """Derivative of the configured activation at pre-activation ``x``; raise ``ValueError`` if it is unknown."""
        if self.activation_type == 'leaky-relu':
            return self.Lrelu_deriv(x, alpha)
        elif self.activation_type == 'tanh':
            return self.tanh_deriv(x)
        else:
            raise ValueError("Invalid activation type")

    @staticmethod
    def Lrelu(x, alpha=0):
        """Leaky ReLU: ``x`` where ``x > 0``, otherwise ``alpha * x``."""
        return np.where(x > 0, x, alpha * x)

    @staticmethod
    def Lrelu_deriv(x, alpha=0):
        """Derivative of leaky ReLU: ``1`` where ``x > 0``, otherwise ``alpha``."""
        return np.where(x > 0, 1, alpha)

    @staticmethod
    def tanh(x):
        """Hyperbolic tangent.

        Uses ``np.tanh`` rather than the explicit exponential quotient, which
        overflows to ``inf / inf = nan`` once ``|x|`` is large.
        """
        return np.tanh(x)

    @staticmethod
    def tanh_deriv(x):
        """Derivative of tanh, ``1 - tanh(x)**2``, computed without overflowing exponentials."""
        return 1 - np.tanh(x) ** 2

    @staticmethod
    def softmax(x):
        """Row-wise softmax; the row maximum is subtracted before exponentiating."""
        e_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # exponentiates shifted inputs
        return e_x / np.sum(e_x, axis=1, keepdims=True)  # normalizes to get probabilities

    @staticmethod
    def one_hot_encode(y, num_classes):
        """Convert integer labels to one-hot rows by indexing an identity matrix."""
        return np.eye(num_classes)[y]

    @staticmethod
    def cross_entropy_loss(y, y_hat):
        """Mean negative log-probability assigned to the true labels.

        ``y`` holds integer labels used as column indices into ``y_hat``.
        Probabilities are clipped to ``[1e-12, 1 - 1e-12]`` before the log.
        """
        m = y.shape[0]
        eps = 1e-12
        y_hat_clipped = np.clip(y_hat, eps, 1. - eps)
        log_probs = -np.log(y_hat_clipped[np.arange(m), y])
        return np.mean(log_probs)

    def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size, activation_type, alpha=0):
        """Train for ``epochs`` epochs, print per-epoch metrics, save a plot and return the last validation accuracy.

        ``activation_type`` here is only used to name the saved plot file; the
        activation actually applied is the one given to the constructor.
        Indexing the last accuracy raises ``IndexError`` when ``epochs`` is 0.
        """
        train_losses = []
        val_accuracies = []

        for epoch in range(1, epochs + 1):
            perm = np.random.permutation(x_train.shape[0])  # Shuffle the training data
            x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm]

            epoch_loss = 0.0
            num_batches = int(np.ceil(x_train.shape[0] / batch_size))

            for i in range(num_batches):
                start = i * batch_size
                end = start + batch_size
                x_batch = x_train_shuffled[start:end]  # batch of inputs
                y_batch = y_train_shuffled[start:end]  # batch of labels

                # Forward pass, backward pass, and weight update
                self.forward(x_batch, alpha)
                self.backward(y_batch, lr, alpha)

                # self.a3 still holds the pre-update predictions for this batch
                epoch_loss += self.cross_entropy_loss(y_batch, self.a3)

            epoch_loss /= num_batches  # average of the per-batch losses
            train_losses.append(epoch_loss)

            val_pred = self.predict(x_val, alpha)
            val_acc = np.mean(val_pred == y_val)
            val_accuracies.append(val_acc)

            print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Value Accuracy: {val_acc:.4f}")

        self.plot_graph(train_losses, val_accuracies, activation_type)
        return val_accuracies[-1]

    def plot_graph(self, train_losses, val_accuracies, activation_type):
        """Save loss and accuracy curves to ``results/experiment-2-<activation_type>.png``."""
        os.makedirs('results', exist_ok=True)  # creates the results directory if missing

        fig, ax1 = plt.subplots()  # initializes the plot

        # training loss on the left axis
        ax1.set_xlabel('Epochs')
        ax1.set_ylabel('Training Loss', color='tab:blue')
        ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss')
        ax1.tick_params(axis='y', labelcolor='tab:blue')

        # validation accuracy on a second y-axis sharing the same x-axis
        ax2 = ax1.twinx()
        ax2.set_ylabel('Validation Accuracy', color='tab:orange')
        ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy')
        ax2.tick_params(axis='y', labelcolor='tab:orange')

        plt.title('Training Loss and Validation Accuracy over Epochs')

        result_path = 'results/experiment-2-' + activation_type + '.png'  # defines the file name
        fig.savefig(result_path)
        plt.close(fig)  # release the figure so repeated runs do not accumulate open figures
        print(f"Graph saved to: {result_path}")

    def predict(self, x, alpha=0):
        """Return the index of the highest-probability class for each row of ``x``."""
        probs = self.forward(x, alpha)  # forwards pass to get probabilities
        return np.argmax(probs, axis=1)  # returns the class with highest probability
|
||||||
|
|
||||||
|
# Load the FashionMNIST train and test splits (downloaded into the current directory if absent).
train_set = datasets.FashionMNIST(root='.', train=True, download=True)
test_set = datasets.FashionMNIST(root='.', train=False, download=True)

# Flatten every image to a 784-element row and divide the pixel values by 255.
x_train, y_train = (
    train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0,
    train_set.targets.numpy(),
)
x_test, y_test = (
    test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0,
    test_set.targets.numpy(),
)

# Architecture and optimisation settings shared by both runs below.
_layer_config = dict(input_size=28 * 28, hidden_size1=256, hidden_size2=256, output_size=10, weight_scale=1e-2)
_fit_config = dict(x_train=x_train, y_train=y_train, x_val=x_test, y_val=y_test, lr=1e-2, epochs=10, batch_size=256)

# --- Run 1: tanh in place of ReLU ---
mlp_tanh = MLP_leaky_tanh(activation_type='tanh', **_layer_config)

# trains the model (the test split doubles as the validation set here)
mlp_tanh.fit(activation_type='tanh', **_fit_config)

# tests the model
test_pred_tanh = mlp_tanh.predict(x_test)
test_acc_tanh = np.mean(test_pred_tanh == y_test)
print(f"\nFinal test accuracy: {test_acc_tanh:.4f}")

# --- Run 2: leaky ReLU in place of ReLU ---
mlp_Lrelu = MLP_leaky_tanh(activation_type='leaky-relu', **_layer_config)
alpha = 0.01  # slope applied to negative pre-activations

# trains the model
mlp_Lrelu.fit(activation_type='leaky-relu', alpha=alpha, **_fit_config)

# tests the model
test_pred_Lrelu = mlp_Lrelu.predict(x_test, alpha)
test_acc_Lrelu = np.mean(test_pred_Lrelu == y_test)
print(f"\nFinal test accuracy: {test_acc_Lrelu:.4f}")
|
||||||
|
|
@ -0,0 +1,186 @@
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from torchvision import datasets
|
||||||
|
import os
|
||||||
|
|
||||||
|
class MLP:
    """Two-hidden-layer ReLU perceptron with a softmax output and an L1 penalty on the weights.

    Trained by mini-batch gradient descent; ``l1`` is the coefficient that scales
    ``sign(W)`` when it is added to each weight gradient.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1):
        """Create the three weight matrices and bias rows.

        Weights are drawn from a standard normal distribution and multiplied by
        ``weight_scale``; biases start at zero.
        """
        self.l1 = l1

        # initializes weights and biases for each layer
        self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
        self.b1 = np.zeros((1, hidden_size1))
        self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale
        self.b2 = np.zeros((1, hidden_size2))
        self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale
        self.b3 = np.zeros((1, output_size))

    def forward(self, x):
        """Run a forward pass and return the softmax probabilities.

        Every intermediate is stored on the instance because ``backward`` reads them.
        """
        self.x = x  # input for backpropagation
        self.z1 = x @ self.W1 + self.b1  # linear transformation for layer 1
        self.a1 = self.relu(self.z1)  # ReLU activation
        self.z2 = self.a1 @ self.W2 + self.b2  # linear transformation for layer 2
        self.a2 = self.relu(self.z2)  # ReLU activation
        self.z3 = self.a2 @ self.W3 + self.b3  # linear transformation for layer 3
        self.a3 = self.softmax(self.z3)  # applies softmax to get class probabilities
        return self.a3  # output of the network

    def backward(self, y, lr):
        """Backpropagate from the values cached by the last ``forward`` call and update parameters in place.

        ``y`` holds integer class labels for the batch and ``lr`` is the step size.
        """
        m = y.shape[0]
        y_one_hot = self.one_hot_encode(y, self.W3.shape[1])  # converts labels to one-hot encoding

        # computes gradients for each layer
        dz3 = self.a3 - y_one_hot  # gradient for output layer (softmax + cross-entropy)
        dw3 = (self.a2.T @ dz3) / m
        db3 = np.sum(dz3, axis=0, keepdims=True) / m

        dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2)  # gradient for layer 2
        dw2 = (self.a1.T @ dz2) / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m

        dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1)  # gradient for layer 1
        dw1 = (self.x.T @ dz1) / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m

        # L1 regularization term: applied to the weights only, biases are not penalized
        dw3 += self.l1 * np.sign(self.W3)
        dw2 += self.l1 * np.sign(self.W2)
        dw1 += self.l1 * np.sign(self.W1)

        # updates weights and biases using gradient descent
        self.W3 -= lr * dw3
        self.b3 -= lr * db3
        self.W2 -= lr * dw2
        self.b2 -= lr * db2
        self.W1 -= lr * dw1
        self.b1 -= lr * db1

    @staticmethod
    def relu(x):
        """ReLU activation: element-wise ``max(0, x)``."""
        return np.maximum(0, x)

    @staticmethod
    def relu_deriv(x):
        """Derivative of ReLU: ``1.0`` where ``x > 0``, otherwise ``0.0``."""
        return (x > 0).astype(float)

    @staticmethod
    def softmax(x):
        """Row-wise softmax; the row maximum is subtracted before exponentiating."""
        e_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # exponentiates shifted inputs
        return e_x / np.sum(e_x, axis=1, keepdims=True)  # normalizes to get probabilities

    @staticmethod
    def one_hot_encode(y, num_classes):
        """Convert integer labels to one-hot rows by indexing an identity matrix."""
        return np.eye(num_classes)[y]

    @staticmethod
    def cross_entropy_loss(y, y_hat):
        """Mean negative log-probability assigned to the true labels.

        ``y`` holds integer labels used as column indices into ``y_hat``.
        Probabilities are clipped to ``[1e-12, 1 - 1e-12]`` before the log.
        The L1 penalty is not included in this value.
        """
        m = y.shape[0]
        eps = 1e-12
        y_hat_clipped = np.clip(y_hat, eps, 1. - eps)
        log_probs = -np.log(y_hat_clipped[np.arange(m), y])
        return np.mean(log_probs)

    def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size):
        """Train for ``epochs`` epochs, print per-epoch metrics, save a plot and return the last validation accuracy.

        Indexing the last accuracy raises ``IndexError`` when ``epochs`` is 0.
        """
        train_losses = []
        val_accuracies = []

        for epoch in range(1, epochs + 1):
            perm = np.random.permutation(x_train.shape[0])  # Shuffle the training data
            x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm]

            epoch_loss = 0.0
            num_batches = int(np.ceil(x_train.shape[0] / batch_size))

            for i in range(num_batches):
                start = i * batch_size
                end = start + batch_size
                x_batch = x_train_shuffled[start:end]  # batch of inputs
                y_batch = y_train_shuffled[start:end]  # batch of labels

                # Forward pass, backward pass, and weight update
                self.forward(x_batch)
                self.backward(y_batch, lr)

                # self.a3 still holds the pre-update predictions for this batch
                epoch_loss += self.cross_entropy_loss(y_batch, self.a3)

            epoch_loss /= num_batches  # average of the per-batch losses
            train_losses.append(epoch_loss)

            val_pred = self.predict(x_val)
            val_acc = np.mean(val_pred == y_val)
            val_accuracies.append(val_acc)

            print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Value Accuracy: {val_acc:.4f}")

        self.plot_graph(train_losses, val_accuracies)
        return val_accuracies[-1]

    def plot_graph(self, train_losses, val_accuracies):
        """Save loss and accuracy curves to ``results/experiment-3-l1.png``."""
        os.makedirs('results', exist_ok=True)  # creates the results directory if missing

        fig, ax1 = plt.subplots()  # initializes the plot

        # training loss on the left axis
        ax1.set_xlabel('Epochs')
        ax1.set_ylabel('Training Loss', color='tab:blue')
        ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss')
        ax1.tick_params(axis='y', labelcolor='tab:blue')

        # validation accuracy on a second y-axis sharing the same x-axis
        ax2 = ax1.twinx()
        ax2.set_ylabel('Validation Accuracy', color='tab:orange')
        ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy')
        ax2.tick_params(axis='y', labelcolor='tab:orange')

        plt.title('Training Loss and Validation Accuracy over Epochs')

        result_path = 'results/experiment-3-l1.png'  # defines the file name
        fig.savefig(result_path)
        plt.close(fig)  # release the figure so repeated runs do not accumulate open figures
        print(f"Graph saved to: {result_path}")

    def predict(self, x):
        """Return the index of the highest-probability class for each row of ``x``."""
        probs = self.forward(x)  # forwards pass to get probabilities
        return np.argmax(probs, axis=1)  # returns the class with highest probability
|
||||||
|
|
||||||
|
# Load the FashionMNIST train and test splits (downloaded into the current directory if absent).
train_set = datasets.FashionMNIST(root='.', train=True, download=True)
test_set = datasets.FashionMNIST(root='.', train=False, download=True)

# Flatten every image to a 784-element row and divide the pixel values by 255.
x_train, y_train = (
    train_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0,
    train_set.targets.numpy(),
)
x_test, y_test = (
    test_set.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0,
    test_set.targets.numpy(),
)

# Build the network: 784 -> 256 -> 256 -> 10 with an L1 coefficient of 1e-6.
mlp = MLP(input_size=28 * 28, hidden_size1=256, hidden_size2=256, output_size=10, weight_scale=1e-2, l1=1e-6)

# trains the model (the test split doubles as the validation set here)
mlp.fit(x_train=x_train, y_train=y_train, x_val=x_test, y_val=y_test, lr=1e-2, epochs=10, batch_size=256)

# tests the model
test_pred = mlp.predict(x_test)
test_acc = np.mean(test_pred == y_test)
print(f"\nFinal test accuracy: {test_acc:.4f}")
|
||||||
|
|
@ -0,0 +1,187 @@
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from torchvision import datasets
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
class MLP:
|
||||||
|
def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l2):
|
||||||
|
self.l2 = l2
|
||||||
|
|
||||||
|
# initializes weights and biases for each layer
|
||||||
|
self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
|
||||||
|
self.b1 = np.zeros((1, hidden_size1))
|
||||||
|
self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale
|
||||||
|
self.b2 = np.zeros((1, hidden_size2))
|
||||||
|
self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale
|
||||||
|
self.b3 = np.zeros((1, output_size))
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
# forwards pass through the network
|
||||||
|
self.x = x # input for backpropagation
|
||||||
|
self.z1 = x @ self.W1 + self.b1 # linear transformation for layer 1
|
||||||
|
self.a1 = self.relu(self.z1) # ReLU activation
|
||||||
|
self.z2 = self.a1 @ self.W2 + self.b2 # linear transformation for layer 2
|
||||||
|
self.a2 = self.relu(self.z2) # ReLU activation
|
||||||
|
self.z3 = self.a2 @ self.W3 + self.b3 # linear transformation for layer 3
|
||||||
|
self.a3 = self.softmax(self.z3) # applies softmax to get class probabilities
|
||||||
|
return self.a3 # output of the network
|
||||||
|
|
||||||
|
def backward(self, y, lr):
|
||||||
|
# backwards pass for weight updates using gradient descent
|
||||||
|
m = y.shape[0]
|
||||||
|
y_one_hot = self.one_hot_encode(y, self.W3.shape[1]) # converts labels to one-hot encoding
|
||||||
|
|
||||||
|
# computes gradients for each layer
|
||||||
|
dz3 = self.a3 - y_one_hot # gradient for output layer
|
||||||
|
dw3 = (self.a2.T @ dz3) / m
|
||||||
|
db3 = np.sum(dz3, axis=0, keepdims=True) / m
|
||||||
|
|
||||||
|
dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2) # gradient for layer 2
|
||||||
|
dw2 = (self.a1.T @ dz2) / m
|
||||||
|
db2 = np.sum(dz2, axis=0, keepdims=True) / m
|
||||||
|
|
||||||
|
dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1) # gradient for layer 1
|
||||||
|
dw1 = (self.x.T @ dz1) / m
|
||||||
|
db1 = np.sum(dz1, axis=0, keepdims=True) / m
|
||||||
|
|
||||||
|
dw3 += self.l2 * self.W3
|
||||||
|
dw2 += self.l2 * self.W2
|
||||||
|
dw1 += self.l2 * self.W1
|
||||||
|
|
||||||
|
# updates weights and biases using gradient descent
|
||||||
|
self.W3 -= lr * dw3
|
||||||
|
self.b3 -= lr * db3
|
||||||
|
self.W2 -= lr * dw2
|
||||||
|
self.b2 -= lr * db2
|
||||||
|
self.W1 -= lr * dw1
|
||||||
|
self.b1 -= lr * db1
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def relu(x):
|
||||||
|
# ReLU activation
|
||||||
|
return np.maximum(0, x)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def relu_deriv(x):
|
||||||
|
# derivation of ReLU activation for backpropagation
|
||||||
|
return (x > 0).astype(float)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def softmax(x):
|
||||||
|
# softmax function normalizes outputs to probabilities
|
||||||
|
e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) # exponentiates inputs
|
||||||
|
return e_x / np.sum(e_x, axis=1, keepdims=True) # normalizes to get probabilities
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def one_hot_encode(y, num_classes):
|
||||||
|
# converts labels to one-hot encoded format
|
||||||
|
return np.eye(num_classes)[y]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def cross_entropy_loss(y, y_hat):
|
||||||
|
# computes cross-entropy loss between true labels and predicted probabilities
|
||||||
|
m = y.shape[0]
|
||||||
|
m = y.shape[0]
|
||||||
|
eps = 1e-12
|
||||||
|
y_hat_clipped = np.clip(y_hat, eps, 1. - eps)
|
||||||
|
log_probs = -np.log(y_hat_clipped[np.arange(m), y])
|
||||||
|
return np.mean(log_probs)
|
||||||
|
|
||||||
|
def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size):
    """Train with mini-batch gradient descent and report per-epoch metrics.

    Each epoch reshuffles the training data, runs forward/backward on every
    mini-batch, records the mean training loss and the validation accuracy,
    and prints both. After the last epoch the curves are passed to
    ``plot_graph``.

    Args:
        x_train, y_train: training inputs and integer labels.
        x_val, y_val: inputs and labels used for the per-epoch accuracy.
        lr: learning rate forwarded to ``backward``.
        epochs: number of passes over the training data (must be >= 1,
            otherwise there is no accuracy to return).
        batch_size: number of samples per mini-batch.

    Returns:
        The validation accuracy measured after the final epoch.
    """
    train_losses = []
    val_accuracies = []
    n_samples = x_train.shape[0]

    for epoch in range(1, epochs + 1):
        perm = np.random.permutation(n_samples)  # shuffle the training data
        x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm]

        loss_sum = 0.0
        for start in range(0, n_samples, batch_size):
            end = start + batch_size
            x_batch = x_train_shuffled[start:end]  # batch of inputs
            y_batch = y_train_shuffled[start:end]  # batch of labels

            # Forward pass, backward pass, and weight update
            self.forward(x_batch)
            self.backward(y_batch, lr)

            # self.a3 still holds the probabilities from the forward pass above.
            # Weight by the batch size so a shorter final batch does not skew
            # the epoch average.
            loss_sum += self.cross_entropy_loss(y_batch, self.a3) * y_batch.shape[0]

        epoch_loss = loss_sum / n_samples  # per-sample mean loss for the epoch
        train_losses.append(epoch_loss)

        val_pred = self.predict(x_val)
        val_acc = np.mean(val_pred == y_val)
        val_accuracies.append(val_acc)

        print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Validation Accuracy: {val_acc:.4f}")

    self.plot_graph(train_losses, val_accuracies)
    return val_accuracies[-1]
|
||||||
|
|
||||||
|
def plot_graph(self, train_losses, val_accuracies, result_path='results/experiment-3-l2.png'):
    """Save a twin-axis plot of training loss and validation accuracy per epoch.

    Args:
        train_losses: one loss value per epoch (left y-axis, blue).
        val_accuracies: one accuracy value per epoch (right y-axis, orange).
        result_path: image file to write; defaults to this experiment's
            result file. Its parent directory is created when missing.
    """
    out_dir = os.path.dirname(result_path)
    if out_dir:
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(out_dir, exist_ok=True)

    fig, ax1 = plt.subplots()  # initializes the plot
    try:
        # loss curve on the left axis
        ax1.set_xlabel('Epochs')
        ax1.set_ylabel('Training Loss', color='tab:blue')
        ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss')
        ax1.tick_params(axis='y', labelcolor='tab:blue')

        # accuracy curve on a second y-axis sharing the same x-axis
        ax2 = ax1.twinx()
        ax2.set_ylabel('Validation Accuracy', color='tab:orange')
        ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy')
        ax2.tick_params(axis='y', labelcolor='tab:orange')

        plt.title('Training Loss and Validation Accuracy over Epochs')

        fig.savefig(result_path)
    finally:
        # release the figure so repeated calls do not accumulate open figures
        plt.close(fig)

    print(f"Graph saved to: {result_path}")
|
||||||
|
|
||||||
|
def predict(self, x):
    """Return, for every row of ``x``, the index of the most probable class."""
    class_probabilities = self.forward(x)  # forward pass yields per-class probabilities
    predicted_labels = np.argmax(class_probabilities, axis=1)
    return predicted_labels
|
||||||
|
|
||||||
|
# Download (if needed) and load both FashionMNIST splits.
train_set = datasets.FashionMNIST(root='.', train=True, download=True)
test_set = datasets.FashionMNIST(root='.', train=False, download=True)

# Flatten every 28x28 image into a 784-vector and scale pixels by 1/255.
train_pixels = train_set.data.numpy().reshape(-1, 28 * 28)
x_train = train_pixels.astype(np.float32) / 255.0
y_train = train_set.targets.numpy()

test_pixels = test_set.data.numpy().reshape(-1, 28 * 28)
x_test = test_pixels.astype(np.float32) / 255.0
y_test = test_set.targets.numpy()

# Build the network: two hidden layers of 256 units, L2 penalty of 1e-4.
mlp = MLP(input_size=28 * 28, hidden_size1=256, hidden_size2=256,
          output_size=10, weight_scale=1e-2, l2=1e-4)

# Train; the test split doubles as the validation set for per-epoch accuracy.
mlp.fit(x_train=x_train, y_train=y_train, x_val=x_test, y_val=y_test,
        lr=1e-2, epochs=10, batch_size=256)

# Final evaluation on the test split.
test_pred = mlp.predict(x_test)
test_acc = (test_pred == y_test).mean()
print(f"\nFinal test accuracy: {test_acc:.4f}")
|
||||||
198
experiment-4.py
|
|
@ -0,0 +1,198 @@
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from torchvision import datasets
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
class MLP:
    """Fully connected classifier with two ReLU hidden layers and a softmax output.

    Trained with mini-batch gradient descent. The weight gradients carry an
    L2 penalty (``l2 * W``) and an L1 penalty (``l1 * sign(W)``); biases are
    not regularized, and the reported training loss is the cross-entropy
    term only.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1, l2):
        """Create the three layers.

        Args:
            input_size: number of input features.
            hidden_size1, hidden_size2: widths of the two hidden layers.
            output_size: number of classes.
            weight_scale: factor applied to standard-normal initial weights.
            l1: L1 regularization strength.
            l2: L2 regularization strength.
        """
        self.l1 = l1
        self.l2 = l2

        # weights are scaled Gaussian draws; biases start at zero
        self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
        self.b1 = np.zeros((1, hidden_size1))
        self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale
        self.b2 = np.zeros((1, hidden_size2))
        self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale
        self.b3 = np.zeros((1, output_size))

    def forward(self, x):
        """Run a forward pass and cache the intermediates needed by ``backward``.

        Returns:
            The softmax class probabilities, one row per row of ``x``.
        """
        self.x = x                              # input, kept for backpropagation
        self.z1 = x @ self.W1 + self.b1         # linear transformation for layer 1
        self.a1 = self.relu(self.z1)            # ReLU activation
        self.z2 = self.a1 @ self.W2 + self.b2   # linear transformation for layer 2
        self.a2 = self.relu(self.z2)            # ReLU activation
        self.z3 = self.a2 @ self.W3 + self.b3   # linear transformation for layer 3
        self.a3 = self.softmax(self.z3)         # class probabilities
        return self.a3

    def backward(self, y, lr):
        """Backpropagate from the last ``forward`` call and update all parameters in place.

        Args:
            y: integer labels for the batch most recently passed to ``forward``.
            lr: learning rate for the gradient-descent step.
        """
        m = y.shape[0]
        y_one_hot = self.one_hot_encode(y, self.W3.shape[1])  # one column per output class

        # output layer: softmax + cross-entropy gradient is (probabilities - targets)
        dz3 = self.a3 - y_one_hot
        dw3 = (self.a2.T @ dz3) / m
        db3 = np.sum(dz3, axis=0, keepdims=True) / m

        # hidden layer 2
        dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2)
        dw2 = (self.a1.T @ dz2) / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m

        # hidden layer 1
        dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1)
        dw1 = (self.x.T @ dz1) / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m

        # L2 penalty gradient (weights only)
        dw3 += self.l2 * self.W3
        dw2 += self.l2 * self.W2
        dw1 += self.l2 * self.W1

        # L1 penalty (sub)gradient (weights only)
        dw3 += self.l1 * np.sign(self.W3)
        dw2 += self.l1 * np.sign(self.W2)
        dw1 += self.l1 * np.sign(self.W1)

        # gradient-descent update
        self.W3 -= lr * dw3
        self.b3 -= lr * db3
        self.W2 -= lr * dw2
        self.b2 -= lr * db2
        self.W1 -= lr * dw1
        self.b1 -= lr * db1

    @staticmethod
    def relu(x):
        """ReLU activation: element-wise max(0, x)."""
        return np.maximum(0, x)

    @staticmethod
    def relu_deriv(x):
        """Derivative of ReLU for backpropagation: 1.0 where x > 0, else 0.0."""
        return (x > 0).astype(float)

    @staticmethod
    def softmax(x):
        """Row-wise softmax; the row maximum is subtracted first to avoid overflow."""
        e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return e_x / np.sum(e_x, axis=1, keepdims=True)

    @staticmethod
    def one_hot_encode(y, num_classes):
        """Convert integer labels to one-hot rows by indexing an identity matrix."""
        return np.eye(num_classes)[y]

    @staticmethod
    def cross_entropy_loss(y, y_hat):
        """Mean of ``-log(p)`` where ``p`` is the probability of each sample's true class.

        Args:
            y: integer class labels, one per row of ``y_hat``.
            y_hat: 2-D array of predicted probabilities.
        """
        m = y.shape[0]
        eps = 1e-12
        # clip only the selected probabilities so log(0) can never occur
        true_class_probs = np.clip(y_hat[np.arange(m), y], eps, 1. - eps)
        return np.mean(-np.log(true_class_probs))

    def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size):
        """Train with mini-batch gradient descent and report per-epoch metrics.

        Each epoch reshuffles the training data, updates on every mini-batch,
        records the mean training loss and the validation accuracy, and
        prints both. The curves are passed to ``plot_graph`` at the end.

        Args:
            x_train, y_train: training inputs and integer labels.
            x_val, y_val: inputs and labels used for the per-epoch accuracy.
            lr: learning rate.
            epochs: number of passes over the training data (must be >= 1).
            batch_size: number of samples per mini-batch.

        Returns:
            The validation accuracy measured after the final epoch.
        """
        train_losses = []
        val_accuracies = []
        n_samples = x_train.shape[0]

        for epoch in range(1, epochs + 1):
            perm = np.random.permutation(n_samples)  # shuffle the training data
            x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm]

            loss_sum = 0.0
            for start in range(0, n_samples, batch_size):
                end = start + batch_size
                x_batch = x_train_shuffled[start:end]
                y_batch = y_train_shuffled[start:end]

                self.forward(x_batch)
                self.backward(y_batch, lr)

                # self.a3 still holds the probabilities from the forward pass;
                # weight by batch size so a short final batch does not skew the mean
                loss_sum += self.cross_entropy_loss(y_batch, self.a3) * y_batch.shape[0]

            epoch_loss = loss_sum / n_samples  # per-sample mean loss for the epoch
            train_losses.append(epoch_loss)

            val_pred = self.predict(x_val)
            val_acc = np.mean(val_pred == y_val)
            val_accuracies.append(val_acc)

            print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Validation Accuracy: {val_acc:.4f}")

        self.plot_graph(train_losses, val_accuracies)
        return val_accuracies[-1]

    def plot_graph(self, train_losses, val_accuracies, result_path='results/experiment-4.png'):
        """Save a twin-axis plot of training loss and validation accuracy per epoch.

        Args:
            train_losses: one loss value per epoch (left y-axis, blue).
            val_accuracies: one accuracy value per epoch (right y-axis, orange).
            result_path: image file to write; its parent directory is created
                when missing.
        """
        out_dir = os.path.dirname(result_path)
        if out_dir:
            # exist_ok avoids the check-then-create race of exists() + makedirs()
            os.makedirs(out_dir, exist_ok=True)

        fig, ax1 = plt.subplots()
        try:
            # loss curve on the left axis
            ax1.set_xlabel('Epochs')
            ax1.set_ylabel('Training Loss', color='tab:blue')
            ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss')
            ax1.tick_params(axis='y', labelcolor='tab:blue')

            # accuracy curve on a second y-axis sharing the same x-axis
            ax2 = ax1.twinx()
            ax2.set_ylabel('Validation Accuracy', color='tab:orange')
            ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy')
            ax2.tick_params(axis='y', labelcolor='tab:orange')

            plt.title('Training Loss and Validation Accuracy over Epochs')

            fig.savefig(result_path)
        finally:
            # release the figure so repeated calls do not accumulate open figures
            plt.close(fig)

        print(f"Graph saved to: {result_path}")

    def predict(self, x):
        """Return the index of the most probable class for every row of ``x``."""
        probs = self.forward(x)
        return np.argmax(probs, axis=1)
|
||||||
|
|
||||||
|
# Download (if needed) and load both FashionMNIST splits.
train_set = datasets.FashionMNIST(root='.', train=True, download=True)
test_set = datasets.FashionMNIST(root='.', train=False, download=True)

# Flatten every 28x28 image into a 784-vector; pixel values are deliberately
# left unnormalized (only cast to float32) for this experiment.
train_pixels = train_set.data.numpy().reshape(-1, 28 * 28)
x_train = train_pixels.astype(np.float32)
y_train = train_set.targets.numpy()

test_pixels = test_set.data.numpy().reshape(-1, 28 * 28)
x_test = test_pixels.astype(np.float32)
y_test = test_set.targets.numpy()

# Build the network: two hidden layers of 256 units with L1 and L2 penalties.
mlp = MLP(input_size=28 * 28, hidden_size1=256, hidden_size2=256,
          output_size=10, weight_scale=1e-2, l1=1e-6, l2=1e-4)

# Train; the test split doubles as the validation set for per-epoch accuracy.
mlp.fit(x_train=x_train, y_train=y_train, x_val=x_test, y_val=y_test,
        lr=1e-2, epochs=10, batch_size=256)

# Final evaluation on the test split.
test_pred = mlp.predict(x_test)
test_acc = (test_pred == y_test).mean()
print(f"\nFinal test accuracy: {test_acc:.4f}")
|
||||||
203
experiment-5.py
|
|
@ -0,0 +1,203 @@
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from torchvision import datasets
|
||||||
|
from torchvision import transforms
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class MLP:
    """Fully connected classifier with two ReLU hidden layers and a softmax output.

    Trained with mini-batch gradient descent. The weight gradients carry an
    L2 penalty (``l2 * W``) and an L1 penalty (``l1 * sign(W)``); biases are
    not regularized, and the reported training loss is the cross-entropy
    term only.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, weight_scale, l1, l2):
        """Create the three layers.

        Args:
            input_size: number of input features.
            hidden_size1, hidden_size2: widths of the two hidden layers.
            output_size: number of classes.
            weight_scale: factor applied to standard-normal initial weights.
            l1: L1 regularization strength.
            l2: L2 regularization strength.
        """
        self.l1 = l1
        self.l2 = l2

        # weights are scaled Gaussian draws; biases start at zero
        self.W1 = np.random.randn(input_size, hidden_size1) * weight_scale
        self.b1 = np.zeros((1, hidden_size1))
        self.W2 = np.random.randn(hidden_size1, hidden_size2) * weight_scale
        self.b2 = np.zeros((1, hidden_size2))
        self.W3 = np.random.randn(hidden_size2, output_size) * weight_scale
        self.b3 = np.zeros((1, output_size))

    def forward(self, x):
        """Run a forward pass and cache the intermediates needed by ``backward``.

        Returns:
            The softmax class probabilities, one row per row of ``x``.
        """
        self.x = x                              # input, kept for backpropagation
        self.z1 = x @ self.W1 + self.b1         # linear transformation for layer 1
        self.a1 = self.relu(self.z1)            # ReLU activation
        self.z2 = self.a1 @ self.W2 + self.b2   # linear transformation for layer 2
        self.a2 = self.relu(self.z2)            # ReLU activation
        self.z3 = self.a2 @ self.W3 + self.b3   # linear transformation for layer 3
        self.a3 = self.softmax(self.z3)         # class probabilities
        return self.a3

    def backward(self, y, lr):
        """Backpropagate from the last ``forward`` call and update all parameters in place.

        Args:
            y: integer labels for the batch most recently passed to ``forward``.
            lr: learning rate for the gradient-descent step.
        """
        m = y.shape[0]
        y_one_hot = self.one_hot_encode(y, self.W3.shape[1])  # one column per output class

        # output layer: softmax + cross-entropy gradient is (probabilities - targets)
        dz3 = self.a3 - y_one_hot
        dw3 = (self.a2.T @ dz3) / m
        db3 = np.sum(dz3, axis=0, keepdims=True) / m

        # hidden layer 2
        dz2 = (dz3 @ self.W3.T) * self.relu_deriv(self.z2)
        dw2 = (self.a1.T @ dz2) / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m

        # hidden layer 1
        dz1 = (dz2 @ self.W2.T) * self.relu_deriv(self.z1)
        dw1 = (self.x.T @ dz1) / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m

        # L2 penalty gradient (weights only)
        dw3 += self.l2 * self.W3
        dw2 += self.l2 * self.W2
        dw1 += self.l2 * self.W1

        # L1 penalty (sub)gradient (weights only)
        dw3 += self.l1 * np.sign(self.W3)
        dw2 += self.l1 * np.sign(self.W2)
        dw1 += self.l1 * np.sign(self.W1)

        # gradient-descent update
        self.W3 -= lr * dw3
        self.b3 -= lr * db3
        self.W2 -= lr * dw2
        self.b2 -= lr * db2
        self.W1 -= lr * dw1
        self.b1 -= lr * db1

    @staticmethod
    def relu(x):
        """ReLU activation: element-wise max(0, x)."""
        return np.maximum(0, x)

    @staticmethod
    def relu_deriv(x):
        """Derivative of ReLU for backpropagation: 1.0 where x > 0, else 0.0."""
        return (x > 0).astype(float)

    @staticmethod
    def softmax(x):
        """Row-wise softmax; the row maximum is subtracted first to avoid overflow."""
        e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return e_x / np.sum(e_x, axis=1, keepdims=True)

    @staticmethod
    def one_hot_encode(y, num_classes):
        """Convert integer labels to one-hot rows by indexing an identity matrix."""
        return np.eye(num_classes)[y]

    @staticmethod
    def cross_entropy_loss(y, y_hat):
        """Mean of ``-log(p)`` where ``p`` is the probability of each sample's true class.

        Args:
            y: integer class labels, one per row of ``y_hat``.
            y_hat: 2-D array of predicted probabilities.
        """
        m = y.shape[0]
        eps = 1e-12
        # clip only the selected probabilities so log(0) can never occur
        true_class_probs = np.clip(y_hat[np.arange(m), y], eps, 1. - eps)
        return np.mean(-np.log(true_class_probs))

    def fit(self, x_train, y_train, x_val, y_val, lr, epochs, batch_size):
        """Train with mini-batch gradient descent and report per-epoch metrics.

        Each epoch reshuffles the training data, updates on every mini-batch,
        records the mean training loss and the validation accuracy, and
        prints both. The curves are passed to ``plot_graph`` at the end.

        Args:
            x_train, y_train: training inputs and integer labels.
            x_val, y_val: inputs and labels used for the per-epoch accuracy.
            lr: learning rate.
            epochs: number of passes over the training data (must be >= 1).
            batch_size: number of samples per mini-batch.

        Returns:
            The validation accuracy measured after the final epoch.
        """
        train_losses = []
        val_accuracies = []
        n_samples = x_train.shape[0]

        for epoch in range(1, epochs + 1):
            perm = np.random.permutation(n_samples)  # shuffle the training data
            x_train_shuffled, y_train_shuffled = x_train[perm], y_train[perm]

            loss_sum = 0.0
            for start in range(0, n_samples, batch_size):
                end = start + batch_size
                x_batch = x_train_shuffled[start:end]
                y_batch = y_train_shuffled[start:end]

                self.forward(x_batch)
                self.backward(y_batch, lr)

                # self.a3 still holds the probabilities from the forward pass;
                # weight by batch size so a short final batch does not skew the mean
                loss_sum += self.cross_entropy_loss(y_batch, self.a3) * y_batch.shape[0]

            epoch_loss = loss_sum / n_samples  # per-sample mean loss for the epoch
            train_losses.append(epoch_loss)

            val_pred = self.predict(x_val)
            val_acc = np.mean(val_pred == y_val)
            val_accuracies.append(val_acc)

            print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f} | Validation Accuracy: {val_acc:.4f}")

        self.plot_graph(train_losses, val_accuracies)
        return val_accuracies[-1]

    def plot_graph(self, train_losses, val_accuracies, result_path='results/experiment-5.png'):
        """Save a twin-axis plot of training loss and validation accuracy per epoch.

        Args:
            train_losses: one loss value per epoch (left y-axis, blue).
            val_accuracies: one accuracy value per epoch (right y-axis, orange).
            result_path: image file to write; its parent directory is created
                when missing.
        """
        out_dir = os.path.dirname(result_path)
        if out_dir:
            # exist_ok avoids the check-then-create race of exists() + makedirs()
            os.makedirs(out_dir, exist_ok=True)

        fig, ax1 = plt.subplots()
        try:
            # loss curve on the left axis
            ax1.set_xlabel('Epochs')
            ax1.set_ylabel('Training Loss', color='tab:blue')
            ax1.plot(range(1, len(train_losses) + 1), train_losses, color='tab:blue', label='Training Loss')
            ax1.tick_params(axis='y', labelcolor='tab:blue')

            # accuracy curve on a second y-axis sharing the same x-axis
            ax2 = ax1.twinx()
            ax2.set_ylabel('Validation Accuracy', color='tab:orange')
            ax2.plot(range(1, len(val_accuracies) + 1), val_accuracies, color='tab:orange', label='Validation Accuracy')
            ax2.tick_params(axis='y', labelcolor='tab:orange')

            plt.title('Training Loss and Validation Accuracy over Epochs')

            fig.savefig(result_path)
        finally:
            # release the figure so repeated calls do not accumulate open figures
            plt.close(fig)

        print(f"Graph saved to: {result_path}")

    def predict(self, x):
        """Return the index of the most probable class for every row of ``x``."""
        probs = self.forward(x)
        return np.argmax(probs, axis=1)
|
||||||
|
|
||||||
|
# acquiring the FashionMNIST dataset
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_set = datasets.FashionMNIST(root='.', train=True, download=True, transform=transform)
test_set = datasets.FashionMNIST(root='.', train=False, download=True, transform=transform)


def _flatten_and_normalize(images):
    """Flatten raw image tensors to 784-vectors and apply the same scaling as `transform`.

    A dataset's `transform` only runs when samples are fetched through
    indexing or a DataLoader; the `.data` attribute used below is the raw
    pixel tensor, so the normalization has to be reproduced here by hand.
    """
    pixels = images.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0  # ToTensor: scale to [0, 1]
    return (pixels - 0.5) / 0.5  # Normalize((0.5,), (0.5,)): shift/scale to [-1, 1]


# preprocessing the data by flattening images and normalizing them.
x_train = _flatten_and_normalize(train_set.data)
y_train = train_set.targets.numpy()

x_test = _flatten_and_normalize(test_set.data)
y_test = test_set.targets.numpy()

# MLP Initialization
mlp = MLP(
    input_size=28 * 28,
    hidden_size1=256,
    hidden_size2=256,
    output_size=10,
    weight_scale=1e-2,
    l1=1e-6,
    l2=1e-4
)

# trains the model (the test split doubles as the validation set)
mlp.fit(
    x_train=x_train,
    y_train=y_train,
    x_val=x_test,
    y_val=y_test,
    lr=1e-2,
    epochs=10,
    batch_size=256
)

# tests the model
test_pred = mlp.predict(x_test)
test_acc = np.mean(test_pred == y_test)
print(f"\nFinal test accuracy: {test_acc:.4f}")
|
||||||
|
Before Width: | Height: | Size: 37 KiB After Width: | Height: | Size: 37 KiB |
|
Before Width: | Height: | Size: 39 KiB After Width: | Height: | Size: 42 KiB |
|
Before Width: | Height: | Size: 38 KiB After Width: | Height: | Size: 38 KiB |
|
Before Width: | Height: | Size: 37 KiB After Width: | Height: | Size: 37 KiB |
BIN
results/experiment-2-leaky-relu.png
Normal file
|
After Width: | Height: | Size: 37 KiB |
BIN
results/experiment-2-tanh.png
Normal file
|
After Width: | Height: | Size: 38 KiB |
BIN
results/experiment-3-l1.png
Normal file
|
After Width: | Height: | Size: 37 KiB |
BIN
results/experiment-3-l2.png
Normal file
|
After Width: | Height: | Size: 38 KiB |
BIN
results/experiment-4.png
Normal file
|
After Width: | Height: | Size: 42 KiB |
BIN
results/experiment-5.png
Normal file
|
After Width: | Height: | Size: 44 KiB |