import torch

def activation(x, func=None):
    """
    Arguments
    ---------
    x: torch.Tensor
    func: None for the sigmoid (default), 'softmax' for a row-wise softmax.
    """
    if func == 'softmax':
        #dim=1 sums each row (across its columns); view(-1, 1) makes the row sums broadcastable.
        return torch.exp(x)/torch.sum(torch.exp(x), dim=1).view(-1, 1)
    #Sigmoid
    return 1/(1+torch.exp(-x))
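As a quick sanity check of both branches (my own illustrative tensor, not from the notebook): the sigmoid branch squashes every entry into (0, 1), while the softmax branch returns rows that sum to 1.
#Illustrative 2x3 input for the activation function defined above
t = torch.tensor([[1.0, 2.0, 3.0], [0.0, 0.0, 0.0]])
print(activation(t))                              # element-wise sigmoid, every value in (0, 1)
print(activation(t, func='softmax'))              # row-wise softmax
print(activation(t, func='softmax').sum(dim=1))   # tensor([1., 1.])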
Below explains view(): it only changes how the data is viewed (its shape), while the underlying memory is unchanged. A value of -1 means that dimension is inferred from the tensor's size.
In [21]: a
Out[21]:
tensor([[ 1, 0, 0],
[ 1, 1, 1],
[-1, 1, 1],
[ 2, 3, 4]])
In [22]: a.sum(dim=1).view(-1, 1)
Out[22]:
tensor([[1],
[3],
[1],
[9]])
In [23]: a/a.sum(dim=1).view(-1, 1)
Out[23]:
tensor([[ 1.0000, 0.0000, 0.0000],
[ 0.3333, 0.3333, 0.3333],
[-1.0000, 1.0000, 1.0000],
[ 0.2222, 0.3333, 0.4444]])
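As a side note, the same row-wise normalization can be written without view() by passing keepdim=True to sum(), which keeps the reduced dimension as size 1; a minimal equivalent sketch using the tensor from the session above:
a = torch.tensor([[1, 0, 0], [1, 1, 1], [-1, 1, 1], [2, 3, 4]])
#Same result as a/a.sum(dim=1).view(-1, 1)
a/a.sum(dim=1, keepdim=True)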
#Generate data for example.
torch.manual_seed(7)
#features
features = torch.randn((1, 5))
#weights
weights = torch.randn_like(features)
#bias
bias = torch.randn((1, 1))
print(f"F:{features},\nW:{weights},\nb:{bias}")
F:tensor([[-0.1468,  0.7861,  0.9468, -1.1143,  1.6908]]),
W:tensor([[-0.8948, -0.3556,  1.2324,  0.1382, -1.6822]]),
b:tensor([[0.3177]])
#For matrix multiplication, reshape the second tensor so it matches the number of columns of features.
#Functions that can be used: torch.reshape(), Tensor.resize_(), Tensor.view()
activation((torch.matmul(features, weights.T))+bias)
tensor([[0.1595]])
weights.shape
torch.Size([1, 5])
activation(torch.sum(features*weights)+bias)
tensor([[0.1595]])
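The comment above also mentions reshape() and view(); a minimal sketch (same tensors as above) showing that turning the weights into a (5, 1) column gives the same result as transposing:
#weights has shape (1, 5); either call turns it into shape (5, 1)
activation(torch.matmul(features, weights.view(5, 1))+bias)      # tensor([[0.1595]])
activation(torch.matmul(features, weights.reshape(5, 1))+bias)   # tensor([[0.1595]])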
#Torch has a function torch.from_numpy() to convert numpy array to tensors.
import numpy as np
np.random.seed(13)
torch.from_numpy(np.random.randint(10,size=(5, 2)))
tensor([[2, 0], [0, 6], [2, 4], [9, 3], [4, 2]])
NOTE: the NumPy array and the tensor share the same memory, so an in-place change to one is reflected in the other.
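A minimal sketch of that shared-memory behaviour (the array here is my own illustrative example):
arr = np.ones(3)
t = torch.from_numpy(arr)
arr[0] = 10      # in-place change to the NumPy array is visible in the tensor
print(t)         # tensor([10.,  1.,  1.], dtype=torch.float64)
t[1] = -1        # and an in-place change to the tensor is visible in the array
print(arr)       # [10. -1.  1.]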
We will use the MNIST dataset of handwritten digits, which comes pre-processed and formatted. The dataset can be loaded with torchvision in PyTorch.
from torchvision import datasets, transforms
#Normalizing the data using transform
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
#Download and load the MNIST dataset
trainset = datasets.MNIST('MNIST_data/',download=True, train=True, transform = transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
(output: the four MNIST archives (train/test images and labels) are downloaded from http://yann.lecun.com/exdb/mnist/ and extracted to MNIST_data/MNIST/raw; torchvision also prints a UserWarning about converting a non-writeable NumPy array.)
Each batch from the loader is a tensor of size (64, 1, 28, 28), that is, a batch of 64 images with 1 channel of 28 x 28 pixels. Note that a $(d_1, d_2, \dots, d_n)$-dimensional array/tensor can be read as nested lists:
each $d_i$ gives the length of one level of nesting, so we have a list of $d_1$ lists, each containing $d_2$ lists, each containing $d_3$ lists, and so on.
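For example, a hypothetical tensor of shape (2, 3, 4) is a list of 2 lists, each holding 3 lists of 4 numbers:
t = torch.zeros(2, 3, 4)
print(t.shape)     # torch.Size([2, 3, 4])
print(len(t))      # 2 outer lists
print(t[0].shape)  # each of them is a (3, 4) block: torch.Size([3, 4])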
iter_data = iter(trainloader)
images, labels = next(iter_data)
print(images.shape)
print(labels.shape)
torch.Size([64, 1, 28, 28])
torch.Size([64])
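Since ToTensor() scales pixels to [0, 1] and Normalize((0.5,), (0.5,)) then subtracts 0.5 and divides by 0.5, the batch values should land in [-1, 1]; a quick check (the exact extremes depend on the batch):
#ToTensor maps pixels to [0, 1]; Normalize((0.5,), (0.5,)) maps that range to [-1, 1]
print(images.min().item(), images.max().item())   # approximately -1.0 and 1.0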
import matplotlib.pyplot as plt
%matplotlib inline
plt.imshow(images[1].numpy().squeeze(), cmap='Greys_r')
<matplotlib.image.AxesImage at 0x7f8dbd63f510>
#Flatten the input images into a 64 x 784 matrix, where each row is a 28*28 = 784 pixel image.
torch.manual_seed(13)
img_input = torch.flatten(images, start_dim=1)
#Assigning weights to the first layer: 784 weights for each of the 256 hidden units.
W_1 = torch.randn((784, 256))
bias = torch.randn((256))
#Assigning weights to the output layer
W_2 = torch.randn((256, 10))
bias1 = torch.randn((10))
#Using sigmoid as activation and output function
out = activation(torch.matmul(activation(torch.matmul(img_input, W_1)+bias), W_2)+bias1)
#Using softmax for activation and output function
out1 = activation(torch.matmul(activation(torch.matmul(img_input, W_1)+bias, func='softmax'), W_2)+bias1, func='softmax')
#Using sigmoid for activation and softmax for the final output.
out2 = activation(torch.matmul(activation(torch.matmul(img_input, W_1)+bias), W_2)+bias1, func='softmax')
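The difference between these outputs shows up in how each row is normalized; a quick check (row sums of the pure-sigmoid output are arbitrary, row sums of the softmax output are 1):
print(out.sum(dim=1)[:3])    # sigmoid output: row sums are not 1
print(out2.sum(dim=1)[:3])   # softmax output: every row sums to 1 (see also the full check below)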
torch.sum(torch.matmul(img_input, W_1)+bias, dim=0).shape
torch.Size([256])
torch.flatten(images, start_dim=1).shape
torch.Size([64, 784])
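The same flattening can also be done with the reshaping functions mentioned earlier; a minimal sketch showing they all yield a (64, 784) matrix:
print(images.view(images.shape[0], -1).shape)   # torch.Size([64, 784])
print(images.reshape(64, -1).shape)             # torch.Size([64, 784])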
import seaborn as sns
Plotting the class probabilities produced by the network above, which has not been trained.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(16, 8))
#One of the 64 input images (index 1, the same image shown above).
ax[0].imshow(img_input[1].numpy().reshape((28, 28)), cmap='Reds')
#Proba class of network without training
sns.barplot(y=list(range(10)), x=out[:1,:].numpy().squeeze(), ax=ax[1], orient='h')
ax[1].set_title('Class Probability --- Logistic')
ax[1].set_xlabel('Probability')
ax[1].set_ylabel('Class')
#Proba class using softmax.
sns.barplot(y=list(range(10)), x=out1[:1, :].numpy().squeeze(), ax=ax[2], orient='h')
ax[2].set_title('Class Probability --- Softmax')
ax[2].set_xlabel('Probability')
ax[2].set_ylabel('Class')
#Proba class using sigmoid activations with a softmax output.
sns.barplot(y=list(range(10)), x=out2[:1, :].numpy().squeeze(), ax=ax[3], orient='h')
ax[3].set_title('Class Probability --- Logistic-Softmax')
ax[3].set_xlabel('Probability')
ax[3].set_ylabel('Class')
Text(0, 0.5, 'Class')
out2.sum(dim=1)
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])
from torch import nn
#Build a feed-forward network.
#Note that the input is a batch of 64 flattened images, that is, a matrix of size (64, 784).
#Each column of a weight matrix holds the weights feeding one neuron of that layer.
#(64, 784) x (784, 128) x (128, 64) x (64, 10) = (64, 10)
#The result is a (log-)probability for each class, for each image in the batch.
model = nn.Sequential(nn.Linear(784, 128), nn.ReLU(),
                      nn.Linear(128, 64), nn.ReLU(),
                      nn.Linear(64, 10), nn.LogSoftmax(dim=1))
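For reference, the same architecture can also be written as an nn.Module subclass; this is a sketch of an equivalent model, not the notebook's code:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        #log-softmax over the class dimension, as in the Sequential model above
        return torch.log_softmax(self.fc3(x), dim=1)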
#Loss function
criterion = nn.NLLLoss()
#data
images, labels = next(iter(trainloader))
#Flatten images
images = images.view(images.shape[0], -1)
logits = model(images)
#loss calculation
loss = criterion(logits, labels)
print(loss)
tensor(2.3218, grad_fn=<NllLossBackward>)
logits.shape
torch.Size([64, 10])
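As an aside, nn.CrossEntropyLoss combines LogSoftmax and NLLLoss in one step; a sketch of an equivalent setup (model_raw and criterion_ce are hypothetical names) that feeds the raw scores straight into the loss:
#Drop nn.LogSoftmax from the model and use CrossEntropyLoss instead
model_raw = nn.Sequential(nn.Linear(784, 128), nn.ReLU(),
                          nn.Linear(128, 64), nn.ReLU(),
                          nn.Linear(64, 10))
criterion_ce = nn.CrossEntropyLoss()
loss_ce = criterion_ce(model_raw(images), labels)   # same kind of loss, computed from raw scores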
Autograd is a reverse-mode automatic differentiation system, and a good introduction is provided on the PyTorch page.
We need the error associated with the output of each neuron, and to correct it we need certain partial derivatives; autograd simplifies this task by keeping track of the operations performed on tensors. In simple terms, autograd works with the Jacobian $J$, which is just the derivative (a linear transformation) of a multivariable function: given any vector $v$ (the direction of the derivative), it produces the product $Jv$.
Note that the chain of partial derivatives can be represented by a tree.
So if we want to calculate $\partial z/\partial s$, where $z$ depends on $s$ through intermediate variables $x$ and $y$, we add up the contributions along the paths of the tree: \begin{equation} \frac{\partial z}{\partial s}= \frac{\partial z}{\partial x} \frac{\partial x}{\partial s} + \frac{\partial z}{\partial y} \frac{\partial y}{\partial s} \end{equation} In a similar manner, autograd calculates the partial derivatives of the required function. From the documentation:
Autograd relies on the user to write thread safe C++ hooks. If you want the hook to be correctly applied in multithreading environment, you will need to write proper thread locking code to ensure the hooks are thread safe.
In Python we don't need to worry about this because of the GIL.
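Returning to the chain-rule sum above, here is a minimal sketch that verifies it with autograd; the concrete functions $x = s^2$, $y = 3s$ and $z = xy$ are my own illustrative choices:
s = torch.tensor(2.0, requires_grad=True)
x = s**2     # dx/ds = 2s
y = 3*s      # dy/ds = 3
z = x*y      # dz/dx = y, dz/dy = x
z.backward()
#Chain rule: dz/ds = y*2s + x*3 = 6*4 + 4*3 = 36
print(s.grad)   # tensor(36.)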
In the following example we use the flag requires_grad=True to keep track of the operations on the tensor.
#Autograd
#To keep a track of operations that created the tensor we have to set requires_grad = True.
x = torch.randn(2, 2, requires_grad=True)
x
tensor([[ 0.9017, -0.4343], [ 0.1861, 0.1828]], requires_grad=True)
y = x**2
y
tensor([[0.8131, 0.1886], [0.0346, 0.0334]], grad_fn=<PowBackward0>)
In the following we see that the grad_fn is <PowBackward0 at 0x7f8dadefa7d0>, the function recorded because we raised the original tensor to the power 2.
#The operation that created y can be seen as follows
y.grad_fn
<PowBackward0 at 0x7f8dadefa7d0>
The above shows that it is the PowBackward function.
z = torch.exp(y)
z.grad_fn
<ExpBackward at 0x7f8dadf0d390>
The above shows that it is the ExpBackward function.
z1 = z.mean()
z1
tensor(1.3829, grad_fn=<MeanBackward0>)
#No value has been assigned to the grad attribute yet, as we haven't called the backward method on the output.
print(x.grad)
None
z1.backward(retain_graph=True)
print(x.grad)
tensor([[ 1.0166, -0.2622], [ 0.0964, 0.0945]])
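We can verify this gradient by hand: z1 = mean(exp(x^2)), so each entry of the gradient is exp(x^2)*2x divided by the 4 elements of the mean; a quick check:
#Analytic gradient of z1 = mean(exp(x**2)) with respect to x
print(torch.allclose(x.grad, torch.exp(x**2)*2*x/4))   # True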
x = torch.tensor([1., 4.], requires_grad=True)
y = x**2
y.backward()
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-44-ab75bb780f4c> in <module>()
----> 1 y.backward()
...
RuntimeError: grad can be implicitly created only for scalar outputs
The above error was thrown because y is not a scalar, so in this case we need to supply the direction for the gradient evaluation. Suppose we want to evaluate it in the direction of $v = (1, 4)^{T}$; the result is $Jv$, where $J$ is the Jacobian. Hence, we have
\begin{equation} Jv = \begin{pmatrix} 2 & 0\\ 0 & 8 \end{pmatrix} \begin{pmatrix} 1\\ 4 \end{pmatrix} = \begin{pmatrix} 2\\ 32 \end{pmatrix} \end{equation}
y.backward(gradient=torch.tensor([1., 4.]), retain_graph=True)
print(x.grad)
tensor([ 2., 32.])
t = torch.tensor([1., 2.], requires_grad=True)
#function (x1, x2) --> (x1^3, x2^3)
z = t**3
#gradient in the direction of (1, 1)
z.backward(gradient=torch.tensor([1., 1.]), retain_graph=True)
#check that it is correct: the gradient must be (3, 12)
print(t.grad==torch.tensor([3., 12.]))
tensor([True, True])