Python: the principle of an RNN implemented with NumPy

This post implements the forward pass of a multi-layer RNN from scratch. In the first version below, I have tweaked the code slightly so that PyTorch autograd can flow gradients through it (a small gradient check follows that listing); the second version uses only NumPy.

import numpy as np
import torch
from torch import nn

class Rnn(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, bidirectional=False):
        super(Rnn, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional

    def forward(self, x):
        '''

        :param x: [seq, batch_size, embedding]
        :return: out, hidden
        '''

        # x.shape [seq, batch, feature]
        # hidden[l].shape [hidden_size, batch]
        # Whh0.shape [hidden_size, hidden_size]  Wih0.shape [hidden_size, feature]
        # Whh1.shape [hidden_size, hidden_size]  Wih1.shape [hidden_size, hidden_size]

        out = []
        x = np.array(x)
        # one hidden state per layer, shape [hidden_size, batch]; kept in a Python
        # list so that each update rebinds the entry instead of writing into one
        # big tensor in place (in-place writes would break autograd and make every
        # timestep output alias the final hidden state)
        hidden = [np.zeros((self.hidden_size, x.shape[1])) for _ in range(self.num_layers)]
        # input-to-hidden weights: layer 0 consumes the input features,
        # layers 1..num_layers-1 consume the hidden state of the layer below
        Wih0 = np.random.random((self.hidden_size, x.shape[2]))
        Wih = [np.random.random((self.hidden_size, self.hidden_size)) for _ in range(1, self.num_layers)]
        # hidden-to-hidden weights, one per layer (bias terms are omitted)
        Whh = [np.random.random((self.hidden_size, self.hidden_size)) for _ in range(self.num_layers)]

        x = torch.from_numpy(x)
        hidden = [torch.from_numpy(h) for h in hidden]
        Wih0 = torch.tensor(Wih0, requires_grad=True)
        Wih = [torch.tensor(w, requires_grad=True) for w in Wih]
        Whh = [torch.tensor(w, requires_grad=True) for w in Whh]

        time = x.shape[0]
        for t in range(time):
            # layer 0: h_t[0] = tanh(Wih0 @ x_t + Whh[0] @ h_{t-1}[0])
            hidden[0] = torch.tanh(torch.matmul(Wih0, torch.transpose(x[t, ...], 1, 0)) +
                                   torch.matmul(Whh[0], hidden[0]))

            # layers l >= 1: h_t[l] = tanh(Wih[l-1] @ h_t[l-1] + Whh[l] @ h_{t-1}[l])
            for layer in range(1, self.num_layers):
                hidden[layer] = torch.tanh(torch.matmul(Wih[layer - 1], hidden[layer - 1]) +
                                           torch.matmul(Whh[layer], hidden[layer]))

            # the output at each timestep is the hidden state of the top layer
            out.append(hidden[self.num_layers - 1])
        # out and hidden are lists of tensors, so use torch.stack();
        # torch.tensor() cannot convert a list whose elements are tensors
        return torch.stack(out), torch.stack(hidden)


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


if __name__ == '__main__':
    a = torch.tensor([1, 2, 3])
    print(torch.cuda.is_available(), type(a))
    rnn = Rnn(1, 5, 4)
    input = np.random.random((6, 2, 1))
    out, h = rnn(input)
    print(f'seq is {input.shape[0]}, batch_size is {input.shape[1]} ', 'out.shape ', out.shape, ' h.shape ', h.shape)
    # print(sigmoid(np.random.random((2, 3))))
    #
    # element-wise multiplication
    # print(np.array([1, 2])*np.array([2, 1]))
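As a quick sanity check of the gradient claim (a minimal sketch, assuming the torch-based Rnn class above is in scope): because the weights are created with requires_grad=True and the hidden states are kept in a Python list, autograd can traverse the graph and backward() runs on a dummy loss. The weight tensors still live inside forward(), so their gradients are not reachable from the caller; actual training would store them as nn.Parameter in __init__ and pass them to an optimizer.

import numpy as np

rnn = Rnn(1, 5, 4)
out, h = rnn(np.random.random((6, 2, 1)))
loss = out.pow(2).mean()   # dummy scalar loss, only to exercise autograd
loss.backward()            # runs because the graph was built from requires_grad weights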

--------------------------------------------------------------------------------

The version below is for understanding only: the gradient-descent part is not written, and the weights are just fixed random values, which does not matter for understanding. It implements the principle of an RNN using only the NumPy library, so it cannot use GPU acceleration. (A small shape check follows the listing.)

import numpy as np


class Rnn():

    def __init__(self, input_size, hidden_size, num_layers, bidirectional=False):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional

    def feed(self, x):
        '''

        :param x: [seq, batch_size, embedding]
        :return: out, hidden
        '''

        # x.shape [seq, batch, feature]
        # hidden[l].shape [hidden_size, batch]
        # Whh0.shape [hidden_size, hidden_size]  Wih0.shape [hidden_size, feature]
        # Whh1.shape [hidden_size, hidden_size]  Wih1.shape [hidden_size, hidden_size]

        out = []
        # one hidden state per layer, initialised to zeros, shape [hidden_size, batch]
        x, hidden = np.array(x), [np.zeros((self.hidden_size, x.shape[1])) for _ in range(self.num_layers)]
        # input-to-hidden weights: Wih[0] consumes the input features,
        # Wih[1:] consume the hidden state of the layer below
        Wih = [np.random.random((self.hidden_size, self.hidden_size)) for _ in range(1, self.num_layers)]
        Wih.insert(0, np.random.random((self.hidden_size, x.shape[2])))
        # hidden-to-hidden weights, one per layer (bias terms are omitted)
        Whh = [np.random.random((self.hidden_size, self.hidden_size)) for _ in range(self.num_layers)]

        time = x.shape[0]
        for t in range(time):
            # layer 0: h_t[0] = tanh(Wih[0] @ x_t + Whh[0] @ h_{t-1}[0])
            hidden[0] = np.tanh(np.dot(Wih[0], np.transpose(x[t, ...], (1, 0))) +
                                np.dot(Whh[0], hidden[0]))

            # layers l >= 1: h_t[l] = tanh(Wih[l] @ h_t[l-1] + Whh[l] @ h_{t-1}[l])
            for layer in range(1, self.num_layers):
                hidden[layer] = np.tanh(np.dot(Wih[layer], hidden[layer - 1]) +
                                        np.dot(Whh[layer], hidden[layer]))

            # the output at each timestep is the hidden state of the top layer
            out.append(hidden[self.num_layers - 1])

        return np.array(out), np.array(hidden)


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


if __name__ == '__main__':
    rnn = Rnn(1, 5, 4)
    input = np.random.random((6, 2, 1))
    out, h = rnn.feed(input)
    print(f'seq is {input.shape[0]}, batch_size is {input.shape[1]} ', 'out.shape ', out.shape, ' h.shape ', h.shape)
    # print(sigmoid(np.random.random((2, 3))))
    #
    # element-wise multiplication
    # print(np.array([1, 2])*np.array([2, 1]))
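For reference, a small shape check (a minimal sketch, assuming the NumPy-only Rnn class above is in scope). Because the hidden state is stored as [hidden_size, batch], the results come back with the last two axes swapped relative to torch.nn.RNN, whose convention is (seq, batch, hidden_size) for the output and (num_layers, batch, hidden_size) for the final hidden state.

import numpy as np

rnn = Rnn(input_size=1, hidden_size=5, num_layers=4)
out, h = rnn.feed(np.random.random((6, 2, 1)))   # input: [seq=6, batch=2, feature=1]
assert out.shape == (6, 5, 2)   # [seq, hidden_size, batch]
assert h.shape == (4, 5, 2)     # [num_layers, hidden_size, batch]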
