I am participating in Kaggle's Isolated Sign Language Recognition challenge, where we are given a dataset consisting of "landmark" points of someone's hand as a temporal sequence, and the goal is to identify what sign is being made.
This notebook presents a "toy problem" I came up with to get my feet wet using RNNs to solve classification problems where the input is temporal data.
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import torch
import torch.nn as nn
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using %s device' % device)
Using cpu device
def condition(T):
T.requires_grad_()
return T.to(device)
Consider a bead on a hoop rotating with constant angular velocity $\omega$. Assuming that the hoop lies in the $x, y$ plane and the bead starts at $(1, 0, 0)$ at time $t = 0$, its coordinates at time $t$ are given by the vector $$ \textbf{p}(t) = (\cos(\omega t), \sin(\omega t), 0). $$ For example, after a quarter period, at $t = \pi/(2\omega)$, the bead has moved to $(0, 1, 0)$.
We will generate "signs" by first rotating this hoop about the $z$ axis by $\psi$ radians and then about the $x$ axis by $\phi$ radians. The associated rotation matrices are given by \begin{align*} R_x^\phi := \begin{pmatrix} 1 & 0 & 0 \\ 0 & \cos \phi & -\sin \phi \\ 0 & \sin \phi & \cos \phi \end{pmatrix}, \qquad R_z^\psi := \begin{pmatrix} \cos \psi & \sin \psi & 0 \\ -\sin \psi & \cos \psi & 0 \\ 0 & 0 & 1 \end{pmatrix}, \end{align*} respectively. We then project the result onto the $x,y$ plane to create a "shadow" using the projection matrix \begin{align*} P_{x,y} = \begin{pmatrix} 1 & 0 & 0 \\ 0 & 1 & 0 \\ 0 & 0 & 0 \end{pmatrix}. \end{align*}
To construct our dataset we choose $N$ vectors $\mathcal{P}_n = (\phi_n, \psi_n, \alpha_n)$ for $n \in 1, \cdots, N$ with $\alpha_n > 0$, then generate sequences \begin{align*} \mathbf{p}_{\mathcal{P}}(t) := P_{x, y} R_x^\phi R_z^\psi \mathbf{p}(t)^T + \mathcal{N}(0, \alpha) \end{align*} where $\mathcal{N}(0, \alpha)$ denotes Gaussian noise with mean $0$ and standard deviation $\alpha$ added independently to each coordinate.
We begin by generating a sequence of time-steps and constructing the base vector $\mathbf{p}(t)$:
t = np.linspace(0, 2*np.pi, 100)
p = lambda t, omega: (np.cos(omega * t), np.sin(omega * t), 0)
Prior to applying any linear transformations, the path traced by the bead looks like this:
P = p(t, 1)
plt.plot(P[0], P[1])
plt.title('Bead on a Hoop')
plt.show()
To ease computation we compute
\begin{align*}
A^{\phi, \psi} :&= P_{x, y} R_x^\phi R_z^\psi \\
&= \begin{pmatrix}
\cos\psi & \sin \psi & 0 \\
-\cos \phi \sin\psi & \cos \phi \cos \psi & -\sin \phi \\
0 & 0 & 0
\end{pmatrix},
\end{align*}
and thus
\begin{align*}
A^{\phi, \psi}p(t)^T = \begin{pmatrix}
p_1(t)\cos \psi + p_2(t) \sin \psi \\
-p_1(t)\cos \phi \sin \psi + p_2(t) \cos \phi \cos \psi \\
0
\end{pmatrix},
\end{align*}
which we implement as a lambda function:
A = lambda phi, psi, p: [
    p[0]*np.cos(psi) + p[1]*np.sin(psi),
    -p[0]*np.cos(phi)*np.sin(psi) + p[1]*np.cos(phi)*np.cos(psi)
]
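As a quick sanity check (our own addition, not part of the original pipeline), we can verify that this closed form agrees with the explicit matrix product $P_{x,y} R_x^\phi R_z^\psi$; the helper names R_x, R_z, and P_xy below are ours:
# Sanity check: the closed-form lambda should match the explicit product
# P_xy @ R_x(phi) @ R_z(psi) applied to a point in the x, y plane.
R_x = lambda phi: np.array([[1, 0, 0],
                            [0, np.cos(phi), -np.sin(phi)],
                            [0, np.sin(phi), np.cos(phi)]])
R_z = lambda psi: np.array([[np.cos(psi), np.sin(psi), 0],
                            [-np.sin(psi), np.cos(psi), 0],
                            [0, 0, 1]])
P_xy = np.diag([1.0, 1.0, 0.0])

phi_test, psi_test = 0.7, -1.2                     # arbitrary test angles
point = np.array([np.cos(0.3), np.sin(0.3), 0.0])  # a point on the unrotated hoop
explicit = P_xy @ R_x(phi_test) @ R_z(psi_test) @ point
assert np.allclose(explicit[:2], A(phi_test, psi_test, point))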
We then write a parameterized function to generate a sequence, and a helper function to convert a given sequence into a tensor. The latter function will be important later on.
def generate_sequence(phi, psi, noise, t_noise=0.2, n_samples=100, t_max=2*np.pi):
    # Jitter the sample times, rotate and project the hoop, then add spatial noise.
    t = np.linspace(0, t_max, n_samples) + np.random.normal(scale=t_noise, size=(n_samples,))
    P = p(t, 1)
    a = A(phi, psi, P)
    a[0] = a[0] + np.random.normal(loc=0.0, scale=noise, size=t.shape)
    a[1] = a[1] + np.random.normal(loc=0.0, scale=noise, size=t.shape)
    return a
def seq2tensor(seq):
    # Pack a pair of coordinate arrays into a (sequence_length, 2) float tensor,
    # with column 0 holding x values and column 1 holding y values.
    length = seq[0].shape[0]
    tensor = torch.zeros((length, 2))
    for ii, s in enumerate(seq):
        for jj, v in enumerate(s):
            tensor[jj, ii] = v
    return tensor
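As a quick usage sketch (ours, not in the original notebook), a single generated sequence becomes a $100 \times 2$ tensor of $(x, y)$ coordinates:
# One sequence -> a (n_samples, 2) tensor: 100 time steps of (x, y) coordinates.
example_seq = generate_sequence(np.pi/3, np.pi/16, 0.08)
print(seq2tensor(example_seq).shape)  # torch.Size([100, 2])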
We add Gaussian noise to the spatial coordinates and also jitter the time samples. We then choose a few representative $(\phi, \psi)$ pairs and generate the following noisy data:
pairs = [
(np.pi/3, np.pi/16),
(np.pi/2 + 0.3, np.pi/4 + 0.1),
(np.pi/5, np.pi/8),
(np.pi/3 - 0.25, -np.pi/8)
]
colors = ['C0', 'C1', 'C2', 'C4']
training_data = []
for ii, pa in enumerate(pairs):
for _ in range(20):
seq = generate_sequence(pa[0], pa[1], 0.08)
training_data.append((seq, ii))
plt.plot(seq[0], seq[1], colors[ii], linewidth=0.2, alpha=0.35)
plt.plot(A(pa[0], pa[1], P)[0], A(pa[0], pa[1], P)[1], colors[ii])
plt.title('Noisy Data')
plt.xticks([])
plt.yticks([])
plt.show()
Finally, before we start building a model and training it, we wrap our data generation process in a torch.utils.data.Dataset subclass to streamline the training pipeline.
class SequenceDataset(torch.utils.data.Dataset):
def __init__(self,
pairs,
n_samples=20,
t_samples=100,
t_max=2*np.pi,
t_noise=0.2,
s_noise=0.04):
self.data = []
for ii, pa in enumerate(pairs):
for _ in range(n_samples):
seq = generate_sequence(
pa[0], pa[1], s_noise,
t_noise=t_noise,
n_samples=t_samples,
t_max=t_max
)
self.data.append((condition(seq2tensor(seq)), ii))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return self.data[idx]
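As a minimal usage sketch (ours; the names _ds, _seq, and _label are hypothetical), each dataset item is a (sequence tensor, class index) pair, so the four $(\phi, \psi)$ pairs above with n_samples=2 yield eight items:
# Each dataset item is a (sequence_tensor, class_index) pair.
_ds = SequenceDataset(pairs, n_samples=2)
_seq, _label = _ds[0]
print(len(_ds), _seq.shape, _label)  # 8 torch.Size([100, 2]) 0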
A simple GRU model is sufficient for this problem; we do not need more sophisticated machinery such as attention.
We use a single-layer GRU followed by a fully connected linear layer with a log-softmax output. The output size is the number of classes we wish to distinguish.
class GRUEncoder(nn.Module):
def __init__(self,
input_size,
hidden_size,
output_size,
dropout):
super(GRUEncoder, self).__init__()
self.hidden_size = hidden_size
self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)  # log-probabilities over the class dimension
def forward(self, x, h):
out, h = self.gru(x, h)
out = self.fc(h)
out = self.softmax(out)
return out
def init_hidden(self, batch_size):
hidden = condition(torch.zeros(1, batch_size, self.hidden_size))
return hidden
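Before training, a quick shape check (ours; the names _m, _x, and _h are hypothetical) confirms how tensors flow through the model: with batch_first=True the GRU expects input of shape (batch, seq_len, input_size), its final hidden state has shape (1, batch, hidden_size), and the network therefore emits log-probabilities of shape (1, batch, output_size).
# Feed one random batch through an untrained model and inspect the output shape.
_m = GRUEncoder(2, 10, 4, 0.1).to(device)
_x = condition(torch.randn(8, 100, 2))  # 8 sequences of 100 (x, y) points
_h = _m.init_hidden(8)
print(_m(_x, _h).shape)  # torch.Size([1, 8, 4])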
batch_size = 16
dataset = SequenceDataset(pairs, n_samples=100)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Two input dimensions (x, y) in; log-softmax over the four classes out
model = GRUEncoder(2, 10, 4, 0.1).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.01)
losses = []
epochs = 100
print_every = 10
loss_fn = nn.NLLLoss()
for e in range(1, epochs + 1):
avg_loss = 0.0
for ii, batch in enumerate(dataloader):
data, target = batch
h = model.init_hidden(batch_size)
opt.zero_grad()
out = model(data, h).reshape((batch_size, -1))
loss = loss_fn(out, target)
loss.backward()
opt.step()
avg_loss += loss.item()
losses.append(avg_loss / batch_size)
if e == 1 or (e % print_every) == 0:
print("%d: \t%.5f" % (e, avg_loss / batch_size))
1: 	4.05293
10: 	2.40914
20: 	2.35928
30: 	2.34807
40: 	2.29171
50: 	2.29706
60: 	2.30091
70: 	2.34789
80: 	2.29383
90: 	2.35367
100: 	2.31696
Let's take a look at the learning curve:
plt.plot(range(len(losses)), losses)
plt.title('Learning Curve')
plt.show()
We measure classification accuracy by drawing fresh samples from a SequenceDataset. On this simple problem our network attains 100% classification accuracy.
test_dataset = SequenceDataset(pairs, n_samples=10)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)
for ii, batch in enumerate(test_dataloader):
x, truth = batch
h = model.init_hidden(len(test_dataset))
out = model(x, h)
v = truth - torch.argmax(out, dim=2)
print("Classification Accuracy: %.2f%%" % (np.count_nonzero(v==0)/len(v[0])*100))
Classification Accuracy: 100.00%
Our previous model attained 100% accuracy, which means either (a) we are overfitting the dataset or (b) the data is easily separable; since the test sequences are freshly generated, (b) is the more likely explanation. We will now make a dataset with more targets and more noise, so that the various classes are "closer" to one another and more difficult to distinguish.
We begin by generating a new set of training data.
colors = ['C%d' % ii for ii in range(100)]
training_data = []
p1 = np.linspace(0, np.pi/2, 5)
p2 = np.linspace(0, np.pi/2, 5)
pairs = [(a + np.random.normal(scale=1.3), b + np.random.normal(scale=1.3)) for b in p2 for a in p1]
for ii, pa in enumerate(pairs):
for _ in range(20):
seq = generate_sequence(pa[0], pa[1], 0.1)
training_data.append((seq, ii))
plt.plot(seq[0], seq[1], colors[ii], linewidth=0.2, alpha=0.35)
plt.plot(A(pa[0], pa[1], P)[0], A(pa[0], pa[1], P)[1], colors[ii])
plt.title('Noisy Data with %d Classes' % len(pairs))
plt.xticks([])
plt.yticks([])
plt.show()
As before, we build our dataset, data loader, and model:
batch_size = 16
dataset = SequenceDataset(pairs, n_samples=100)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Two input dimensions (x, y) in; log-softmax over len(pairs) classes out
model = GRUEncoder(2, 100, len(pairs), 0.1).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.01)
losses = []
Let's train!
epochs = 100
print_every = 10
loss_fn = nn.NLLLoss()
for e in range(1, epochs + 1):
avg_loss = 0.0
for ii, batch in enumerate(dataloader):
data, target = batch
_batch_size = data.shape[0]
h = model.init_hidden(_batch_size)
opt.zero_grad()
out = model(data, h).reshape((_batch_size, -1))
loss = loss_fn(out, target)
loss.backward()
opt.step()
avg_loss += loss.item()
losses.append(avg_loss / batch_size)
if e == 1 or (e % print_every) == 0:
print("%d: \t%.5f" % (e, avg_loss / batch_size))
1: 	8.95451
10: 	4.72466
20: 	3.90430
30: 	3.79116
40: 	3.48710
50: 	4.44861
60: 	4.03913
70: 	3.87881
80: 	3.67685
90: 	3.69464
100: 	4.26656
plt.plot(range(len(losses)), losses)
plt.title('Learning Curve')
plt.show()
The model again attains near-perfect accuracy on freshly drawn test data, so even with 25 noisier classes the trajectories remain separable.
test_dataset = SequenceDataset(pairs, n_samples=10)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)
for ii, batch in enumerate(test_dataloader):
x, truth = batch
h = model.init_hidden(len(test_dataset))
out = model(x, h)
v = truth - torch.argmax(out, dim=2)
print("Classification Accuracy: %.2f%%" % (np.count_nonzero(v==0)/len(v[0])*100))
Classification Accuracy: 100.00%