# Copyright 2023 Wieger Wesselink.
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE or http://www.boost.org/LICENSE_1_0.txt)
"""Neural network layers used by the MultilayerPerceptron class.
Layers in this module operate on matrices with row layout (each row is a sample).
They expose a minimal interface with feedforward, backpropagate and optimize.
"""
import numpy as np
from nerva_numpy.activation_functions import ActivationFunction, SReLUActivation, parse_activation
from nerva_numpy.matrix_operations import column_repeat, columns_mean, columns_sum, diag, elements_sum, hadamard, \
identity, ones, inv_sqrt, row_repeat, rows_sum, vector_size, zeros, Matrix
from nerva_numpy.optimizers import CompositeOptimizer, parse_optimizer
from nerva_numpy.softmax_functions import log_softmax, softmax
from nerva_numpy.weight_initializers import set_layer_weights
class Layer(object):
"""
Base class for layers of a neural network with data in row layout
(each row is a sample: shape (N, D)).
"""
def __init__(self):
self.X = None
self.DX = None
self.optimizer = None
def feedforward(self, X: Matrix) -> Matrix:
raise NotImplementedError
def backpropagate(self, Y: Matrix, DY: Matrix) -> None:
raise NotImplementedError
def optimize(self, eta):
if self.optimizer:
self.optimizer.update(eta)
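
# Illustrative sketch (not part of the original module): how a stack of Layer
# objects is typically driven for one training step by code such as the
# MultilayerPerceptron class mentioned in the module docstring. The loss
# gradient DY and the learning rate eta are assumed to be supplied by the caller.
def _example_layer_stack(layers, X, DY, eta=0.01):
    # Forward pass: feed the output of each layer into the next one.
    Y = X
    for layer in layers:
        Y = layer.feedforward(Y)
    output = Y
    # Backward pass: walk the layers in reverse; layer.DX holds the gradient
    # with respect to that layer's input, which becomes the next DY.
    for layer in reversed(layers):
        layer.backpropagate(Y, DY)
        Y, DY = layer.X, layer.DX
    # Parameter update with learning rate eta (a no-op for layers without an optimizer).
    for layer in layers:
        layer.optimize(eta)
    return output
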
class LinearLayer(Layer):
"""Linear layer: Y = X W^T + b.
Shapes: X (N, D) -> Y (N, K), W (K, D), b (K,).
"""
def __init__(self, D: int, K: int):
super().__init__()
self.W = zeros(K, D)
self.DW = zeros(K, D)
self.b = zeros(K)
self.Db = zeros(K)
self.optimizer = None
def feedforward(self, X: Matrix) -> Matrix:
self.X = X
N, D = X.shape
W = self.W
b = self.b
Y = X @ W.T + row_repeat(b, N)
return Y
def backpropagate(self, Y: Matrix, DY: Matrix) -> None:
X = self.X
W = self.W
DW = DY.T @ X
Db = columns_sum(DY)
DX = DY @ W
self.DW[:] = DW
self.Db[:] = Db
self.DX = DX
def output_size(self) -> int:
return self.W.shape[0]
def set_optimizer(self, optimizer: str):
make_optimizer = parse_optimizer(optimizer)
self.optimizer = CompositeOptimizer([make_optimizer(self.W, self.DW), make_optimizer(self.b, self.Db)])
def set_weights(self, weight_initializer):
set_layer_weights(self, weight_initializer)
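
# Illustrative sketch (not part of the original module): one forward/backward
# pass through a LinearLayer with small made-up dimensions. The weights are
# initialized by hand here; normally set_weights and set_optimizer are used.
def _example_linear_layer():
    rng = np.random.default_rng(0)
    layer = LinearLayer(D=4, K=3)
    layer.W[:] = 0.1 * rng.standard_normal((3, 4))  # ad-hoc initialization
    X = rng.standard_normal((8, 4))                 # batch of N=8 samples with D=4 features
    Y = layer.feedforward(X)                        # Y = X W^T + b, shape (8, 3)
    DY = rng.standard_normal((8, 3))                # pretend loss gradient w.r.t. Y
    layer.backpropagate(Y, DY)                      # fills layer.DW (3, 4), layer.Db (3,), layer.DX (8, 4)
    return layer.DX
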
class ActivationLayer(LinearLayer):
"""Linear layer followed by a pointwise activation function."""
def __init__(self, D: int, K: int, act: ActivationFunction):
super().__init__(D, K)
self.Z = None
self.DZ = None
self.act = act
def feedforward(self, X: Matrix) -> Matrix:
self.X = X
N, D = X.shape
W = self.W
b = self.b
act = self.act
Z = X @ W.T + row_repeat(b, N)
Y = act(Z)
self.Z = Z
return Y
def backpropagate(self, Y: Matrix, DY: Matrix) -> None:
X = self.X
W = self.W
Z = self.Z
act = self.act
DZ = hadamard(DY, act.gradient(Z))
DW = DZ.T @ X
Db = columns_sum(DZ)
DX = DZ @ W
self.DZ = DZ
self.DW[:] = DW
self.Db[:] = Db
self.DX = DX
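
# Illustrative sketch (not part of the original module): an ActivationLayer is a
# LinearLayer whose output is passed through a pointwise activation; its backward
# pass first applies the chain rule DZ = DY * act.gradient(Z). The activation
# object is taken as an argument here because the concrete names accepted by
# parse_activation are defined elsewhere.
def _example_activation_layer(act: ActivationFunction):
    rng = np.random.default_rng(1)
    layer = ActivationLayer(D=5, K=2, act=act)
    layer.W[:] = 0.1 * rng.standard_normal((2, 5))
    Y = layer.feedforward(rng.standard_normal((3, 5)))   # Y = act(X W^T + b), shape (3, 2)
    layer.backpropagate(Y, rng.standard_normal((3, 2)))  # caches DZ = DY * act.gradient(Z)
    return layer.DZ
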
class SReLULayer(ActivationLayer):
"""Activation layer with SReLU and trainable activation parameters.
In addition to W and b, this layer optimizes SReLU's (al, tl, ar, tr).
"""
def __init__(self, D: int, K: int, act: SReLUActivation):
super().__init__(D, K, act)
self.Dal = 0
self.Dtl = 0
self.Dar = 0
self.Dtr = 0
def backpropagate(self, Y: Matrix, DY: Matrix) -> None:
super().backpropagate(Y, DY)
Z = self.Z
al, tl, ar, tr = self.act.x
Al = lambda Z: np.where(Z <= tl, Z - tl, 0)
Tl = lambda Z: np.where(Z <= tl, 1 - al, 0)
Ar = lambda Z: np.where((Z <= tl) | (Z < tr), 0, Z - tr)
Tr = lambda Z: np.where((Z <= tl) | (Z < tr), 0, 1 - ar)
Dal = elements_sum(hadamard(DY, Al(Z)))
Dtl = elements_sum(hadamard(DY, Tl(Z)))
Dar = elements_sum(hadamard(DY, Ar(Z)))
Dtr = elements_sum(hadamard(DY, Tr(Z)))
self.act.Dx[:] = np.array([Dal, Dtl, Dar, Dtr])
def set_optimizer(self, optimizer: str):
make_optimizer = parse_optimizer(optimizer)
self.optimizer = CompositeOptimizer([make_optimizer(self.W, self.DW),
make_optimizer(self.b, self.Db),
make_optimizer(self.act.x, self.act.Dx)
])
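
# Illustrative sketch (not part of the original module): the SReLU activation
# implied by the parameter gradients above is the piecewise-linear function
#   srelu(z) = tl + al * (z - tl)   if z <= tl
#            = z                    if tl < z < tr
#            = tr + ar * (z - tr)   if z >= tr
# with (al, tl, ar, tr) stored in act.x and their gradients in act.Dx. The
# SReLUActivation instance is taken as an argument because its constructor is
# defined elsewhere.
def _example_srelu_layer(act: SReLUActivation):
    rng = np.random.default_rng(2)
    layer = SReLULayer(D=4, K=3, act=act)
    layer.W[:] = 0.1 * rng.standard_normal((3, 4))
    Y = layer.feedforward(rng.standard_normal((6, 4)))
    layer.backpropagate(Y, rng.standard_normal((6, 3)))  # fills act.Dx = (Dal, Dtl, Dar, Dtr)
    return act.Dx
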
class SoftmaxLayer(LinearLayer):
"""Linear layer followed by softmax over the last dimension."""
def __init__(self, D: int, K: int):
super().__init__(D, K)
self.Z = None
self.DZ = None
def feedforward(self, X: Matrix) -> Matrix:
self.X = X
N, D = X.shape
W = self.W
b = self.b
Z = X @ W.T + row_repeat(b, N)
Y = softmax(Z)
self.Z = Z
return Y
def backpropagate(self, Y: Matrix, DY: Matrix) -> None:
N, K = self.Z.shape
X = self.X
W = self.W
DZ = hadamard(Y, DY - column_repeat(diag(DY @ Y.T), K))
DW = DZ.T @ X
Db = columns_sum(DZ)
DX = DZ @ W
self.DZ = DZ
self.DW[:] = DW
self.Db[:] = Db
self.DX = DX
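
# Illustrative sketch (not part of the original module): each row of a
# SoftmaxLayer output is a probability distribution over the K outputs, and the
# backward pass computes, per row i, DZ_i = Y_i * (DY_i - <DY_i, Y_i>), where
# the per-row inner products are obtained above via diag(DY @ Y.T).
def _example_softmax_layer():
    rng = np.random.default_rng(3)
    layer = SoftmaxLayer(D=4, K=3)
    layer.W[:] = 0.1 * rng.standard_normal((3, 4))
    Y = layer.feedforward(rng.standard_normal((5, 4)))
    assert np.allclose(Y.sum(axis=1), 1.0)               # every row sums to one
    layer.backpropagate(Y, rng.standard_normal((5, 3)))
    return layer.DZ
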
class LogSoftmaxLayer(LinearLayer):
"""Linear layer followed by log_softmax over the last dimension."""
def __init__(self, D: int, K: int):
super().__init__(D, K)
self.Z = None
self.DZ = None
def feedforward(self, X: Matrix) -> Matrix:
self.X = X
N, D = X.shape
W = self.W
b = self.b
Z = X @ W.T + row_repeat(b, N)
Y = log_softmax(Z)
self.Z = Z
return Y
def backpropagate(self, Y: Matrix, DY: Matrix) -> None:
N, K = self.Z.shape
X = self.X
W = self.W
Z = self.Z
DZ = DY - hadamard(softmax(Z), column_repeat(rows_sum(DY), K))
DW = DZ.T @ X
Db = columns_sum(DZ)
DX = DZ @ W
self.DZ = DZ
self.DW[:] = DW
self.Db[:] = Db
self.DX = DX
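
# Illustrative sketch (not part of the original module): for the same weights,
# a LogSoftmaxLayer produces the elementwise logarithm of the corresponding
# SoftmaxLayer output, which the check below verifies on random data.
def _example_log_softmax_layer():
    rng = np.random.default_rng(4)
    layer = LogSoftmaxLayer(D=4, K=3)
    layer.W[:] = 0.1 * rng.standard_normal((3, 4))
    Y = layer.feedforward(rng.standard_normal((5, 4)))   # Y = log_softmax(X W^T + b)
    assert np.allclose(np.exp(Y), softmax(layer.Z))      # exp(log_softmax(Z)) == softmax(Z)
    layer.backpropagate(Y, rng.standard_normal((5, 3)))
    return layer.DX
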
class BatchNormalizationLayer(Layer):
"""Batch normalization layer with per-feature gamma and beta.
Normalizes inputs across the batch using per-feature statistics.
Shapes: X (N, D) -> Y (N, D), gamma/beta (D,).
"""
def __init__(self, D: int):
super().__init__()
self.Z = None
self.DZ = None
self.gamma = ones(D)
self.Dgamma = zeros(D)
self.beta = zeros(D)
self.Dbeta = zeros(D)
self.inv_sqrt_Sigma = zeros(D)
self.optimizer = None
def feedforward(self, X: Matrix) -> Matrix:
self.X = X
N, D = X.shape
gamma = self.gamma
beta = self.beta
R = X - row_repeat(columns_mean(X), N)
Sigma = diag(R.T @ R).T / N
inv_sqrt_Sigma = inv_sqrt(Sigma)
Z = hadamard(row_repeat(inv_sqrt_Sigma, N), R)
Y = hadamard(row_repeat(gamma, N), Z) + row_repeat(beta, N)
self.inv_sqrt_Sigma[:] = inv_sqrt_Sigma
self.Z = Z
return Y
def backpropagate(self, Y: Matrix, DY: Matrix) -> None:
"""Compute gradients for gamma/beta and propagate DX through BN."""
N, D = self.X.shape
Z = self.Z
gamma = self.gamma
inv_sqrt_Sigma = self.inv_sqrt_Sigma
DZ = hadamard(row_repeat(gamma, N), DY)
Dbeta = columns_sum(DY)
Dgamma = columns_sum(hadamard(DY, Z))
DX = hadamard(row_repeat(inv_sqrt_Sigma / N, N), (N * identity(N) - ones(N, N)) @ DZ - hadamard(Z, row_repeat(diag(Z.T @ DZ).T, N)))
self.DZ = DZ
self.Dbeta[:] = Dbeta
self.Dgamma[:] = Dgamma
self.DX = DX
def output_size(self) -> int:
return vector_size(self.gamma)
def set_optimizer(self, optimizer: str):
make_optimizer = parse_optimizer(optimizer)
self.optimizer = CompositeOptimizer([make_optimizer(self.beta, self.Dbeta), make_optimizer(self.gamma, self.Dgamma)])
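
# Illustrative sketch (not part of the original module): with the default
# gamma = 1 and beta = 0, the output of a BatchNormalizationLayer has
# (approximately) zero mean and unit variance in every feature column.
def _example_batch_normalization_layer():
    rng = np.random.default_rng(5)
    layer = BatchNormalizationLayer(D=3)
    X = 2.0 + 5.0 * rng.standard_normal((100, 3))          # data with non-zero mean and variance 25
    Y = layer.feedforward(X)
    assert np.allclose(Y.mean(axis=0), 0.0, atol=1e-6)
    assert np.allclose(Y.var(axis=0), 1.0, atol=1e-6)
    layer.backpropagate(Y, rng.standard_normal((100, 3)))  # fills layer.Dgamma, layer.Dbeta, layer.DX
    return layer.Dgamma, layer.Dbeta
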
def parse_linear_layer(text: str,
D: int,
K: int,
optimizer: str,
weight_initializer: str
) -> Layer:
"""Parse a textual layer spec and create a configured Layer instance.
Supports Linear, Softmax, LogSoftmax, activation names (e.g. ReLU), and
SReLU(...). The optimizer and weight initializer are applied.
"""
if text == 'Linear':
layer = LinearLayer(D, K)
elif text == 'Softmax':
layer = SoftmaxLayer(D, K)
elif text == 'LogSoftmax':
layer = LogSoftmaxLayer(D, K)
elif text.startswith('SReLU'):
act = parse_activation(text)
layer = SReLULayer(D, K, act)
else:
act = parse_activation(text)
layer = ActivationLayer(D, K, act)
layer.set_optimizer(optimizer)
layer.set_weights(weight_initializer)
return layer
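
# Illustrative sketch (not part of the original module): building a small two-layer
# network from textual specifications. The activation spec 'ReLU' (mentioned in the
# docstring above), the optimizer string and the weight initializer name are
# assumptions; the accepted values are determined by parse_activation,
# parse_optimizer and set_layer_weights respectively.
def _example_parse_linear_layer(optimizer='GradientDescent', weight_initializer='Xavier'):
    hidden = parse_linear_layer('ReLU', D=784, K=128,
                                optimizer=optimizer, weight_initializer=weight_initializer)
    output = parse_linear_layer('LogSoftmax', D=128, K=10,
                                optimizer=optimizer, weight_initializer=weight_initializer)
    return [hidden, output]
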