{-# LANGUAGE ScopedTypeVariables, FlexibleContexts, BangPatterns #-}
{-# OPTIONS -Wall #-}
-- |
-- Module      : Network
-- Copyright   : (c) 2017 Christian Merten
-- Maintainer  : c.merten@gmx.net
-- Stability   : experimental
-- Portability : GHC
--
-- An implementation of artifical feed-forward neural networks in pure Haskell.
--
-- An example is added in /XOR.hs/
module Network
  ( -- * Network
    Network(..)
  , Layer(..)
  , newNetwork
  , output
    -- * Learning functions
  , trainShuffled
  , trainNTimes
  , CostFunction(..)
  , getDelta
  , LearningRate
  , Lambda
  , TrainingDataLength
  , Sample
  , Samples
  , (-->)
    -- * Activation functions
  , ActivationFunction
  , ActivationFunctionDerivative
  , sigmoid
  , sigmoid'
    -- * Network serialization
  , saveNetwork
  , loadNetwork
  ) where

import           Control.Monad         (zipWithM, forM)
import           Data.Array.IO
import           Data.List             (foldl')
import           Data.Maybe            (fromMaybe)
import           Debug.Trace           (trace)
import           Text.Read             (readMaybe)

import           Data.Binary
import           Data.List.Split       (chunksOf)
import           Numeric.LinearAlgebra
import           System.Directory
import           System.Random
import           Text.Regex.PCRE

-- | The generic feedforward network type, a binary instance is implemented.
-- It takes a list of layers -- with a minimum of one (output layer).
-- It is usually constructed using the `newNetwork` function.
data Network a = Network { layers :: [Layer a] }
  deriving (Show)

-- | One layer of a network, storing the weights matrix and the biases vector
-- of this layer.
data Layer a = Layer { weights :: Matrix a, biases :: Vector a }
  deriving (Show)

instance (Element a, Binary a) => Binary (Network a) where
  put (Network ls) = put ls
  get = Network `fmap` get

instance (Element a, Binary a) => Binary (Layer a) where
  -- Matrices and vectors are serialized through their list representations.
  put (Layer ws bs) = do
    put (toLists ws)
    put (toList bs)
  get = do
    wss <- get
    bss <- get
    return (Layer (fromLists wss) (fromList bss))

-- | Cost Function Enum
data CostFunction = QuadraticCost | CrossEntropyCost
  deriving (Show, Eq)

-- | getDelta based on the raw input, the activated input and the desired output
-- results in different values depending on the CostFunction type.
getDelta :: Floating a => CostFunction -> a -> a -> a -> a
getDelta QuadraticCost    z a y = (a - y) * sigmoid' z
getDelta CrossEntropyCost _ a y = a - y

-- | Activation function used to calculate the actual output of a neuron.
-- Usually the 'sigmoid' function.
type ActivationFunction a = a -> a

-- | The derivative of an activation function.
type ActivationFunctionDerivative a = a -> a

-- | Training sample that can be used for the training functions.
--
-- > trainingData :: Samples Double
-- > trainingData = [ fromList [0, 0] --> fromList [0],
-- >                  fromList [0, 1] --> fromList [1],
-- >                  fromList [1, 0] --> fromList [1],
-- >                  fromList [1, 1] --> fromList [0]]
type Sample a = (Vector a, Vector a)

-- | A list of 'Sample's
type Samples a = [Sample a]

-- | A simple synonym for the (,) operator, used to create samples very
-- intuitively.
(-->) :: Vector a -> Vector a -> Sample a
(-->) = (,)

-- | The learning rate, affects the learning speed, lower learning rate results
-- in slower learning, but usually better results after more epochs.
type LearningRate a = a

-- | Lambda value affecting the regularization while learning.
type Lambda a = a

-- | Wrapper around the training data length.
type TrainingDataLength = Int

-- | Size of one mini batch, subset of training set used to update the weights
-- averaged over this batch.
type MiniBatchSize = Int

-- | Initializes a new network with random values for weights and biases
-- in all layers.
--
-- > net <- newNetwork [2, 3, 4]
newNetwork :: [Int] -> IO (Network Double)
newNetwork layerSizes
  | length layerSizes < 2 = error "Network too small!"
  | otherwise             =
      Network <$> zipWithM mkLayer (init layerSizes) (tail layerSizes)
  where
    -- Build one layer between two adjacent sizes: weights are Gaussian
    -- scaled by 1 / sqrt(fan-in), biases are drawn from a Gaussian.
    mkLayer :: Int -> Int -> IO (Layer Double)
    mkLayer inputSize outputSize = do
      ws   <- (/ sqrt (fromIntegral inputSize)) <$> randn outputSize inputSize
      seed <- randomIO
      let bs = randomVector seed Gaussian outputSize
      return (Layer ws bs)

-- | Calculate the output of the network based on the network, a given
-- 'ActivationFunction' and the input vector.
output :: (Numeric a, Num (Vector a))
  => Network a -> ActivationFunction a -> Vector a -> Vector a
output net act input = foldl' feed input (layers net)
  where
    feed vec layer = cmap act ((weights layer #> vec) + biases layer)

-- Like 'output', but keeps every intermediate (raw, activated) pair,
-- seeded with the input itself; consumed by backpropagation.
rawOutputs :: (Numeric a, Num (Vector a))
  => Network a -> ActivationFunction a -> Vector a -> [(Vector a, Vector a)]
rawOutputs net act input = scanl step (input, input) (layers net)
  where
    step (_, a) layer =
      let z' = (weights layer #> a) + biases layer
      in (z', cmap act z')

-- | The most used training function, randomly shuffling the training set before
-- every training epoch
--
-- > trainShuffled 30 (\n e -> "") net CrossEntropyCost 0.5 trainData 10 0.1
trainShuffled :: forall a. (Numeric a, Floating a, Floating (Vector a))
  => Int -> (Network a -> Int -> String) -> Network a -> CostFunction
  -> Lambda a -> Samples a -> MiniBatchSize -> LearningRate a
  -> IO (Network a)
trainShuffled 0 _ net _ _ _ _ _ = return net
trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize eta = do
  shuffled <- shuffle trainSamples
  -- Force the epoch's result so thunks do not pile up across epochs.
  let !net' = trainSGD net costFunction lambda shuffled miniBatchSize eta
  trace (debug net' epochs) $
    trainShuffled (epochs - 1) debug net' costFunction lambda trainSamples miniBatchSize eta

-- | Pure version of 'trainShuffled', training the network /n/ times without
-- shuffling the training set, resulting in slightly worse results.
trainNTimes :: forall a. (Numeric a, Floating a, Floating (Vector a))
  => Int -> (Network a -> Int -> String) -> Network a -> CostFunction
  -> Lambda a -> Samples a -> MiniBatchSize -> LearningRate a
  -> Network a
trainNTimes 0 _ net _ _ _ _ _ = net
trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta =
    trace (debug net' epochs) $
      trainNTimes (epochs - 1) debug net' costFunction lambda trainSamples miniBatchSize eta
  where
    !net' = trainSGD net costFunction lambda trainSamples miniBatchSize eta

-- | Train the network using Stochastic Gradient Descent.
-- This is an improved version of Gradient Descent splitting the training data into
-- subsets to update the networks weights more often resulting in better accuracy.
--
-- On each mini batch, 'update' is called to calculate the improved network.
trainSGD :: forall a.
(Numeric a, Floating a, Floating (Vector a)) => Network a -> CostFunction -> Lambda a -> Samples a -> MiniBatchSize -> LearningRate a -> Network a trainSGD net costFunction lambda trainSamples miniBatchSize eta = foldl' updateMiniBatch net (chunksOf miniBatchSize trainSamples) where -- update network based on given mini batch using Gradient Descent updateMiniBatch :: Network a -> Samples a -> Network a updateMiniBatch = update eta costFunction lambda (length trainSamples) -- | Update the network using a set of samples and Gradient Descent. -- This takes one mini batch to perform GD. update :: forall a. (Numeric a, Floating a, Floating (Vector a)) => LearningRate a -> CostFunction -> Lambda a -> TrainingDataLength -> Network a -> Samples a -> Network a update eta costFunction lambda n net spls = case mayNewNablas of Nothing -> net Just newNablas -> net { layers = applyNablas newNablas } where -- calculate new nablas based on samples mayNewNablas :: Maybe [Layer a] mayNewNablas = foldl' updateNablas Nothing spls -- update nablas by calculating new nablas and adding them to the -- existing ones updateNablas :: Maybe [Layer a] -> Sample a -> Maybe [Layer a] updateNablas mayNablas sample = let -- calculate new nablas for this training sample !nablasDelta = backprop net costFunction sample -- takes an existing nabla layer and adds the delta addDelta nabla nablaDelta = nabla { weights = weights nabla + weights nablaDelta, biases = biases nabla + biases nablaDelta } in case mayNablas of -- if there are already nablas, add the new ones Just nablas -> Just $ zipWith addDelta nablas nablasDelta -- otherwise return the new ones Nothing -> Just $ nablasDelta -- apply nablas to layers of the network applyNablas :: [Layer a] -> [Layer a] applyNablas nablas = zipWith applyNabla (layers net) nablas -- apply nabla to one layer applyNabla :: Layer a -> Layer a -> Layer a applyNabla layer nabla = let w = weights layer -- weights matrix nw = weights nabla -- weights nablas matrix b = 
biases layer -- biases vector nb = biases nabla -- biases nablas vector fac = 1 - eta * (lambda / fromIntegral n) -- subtract nablas from weights w' = scale fac w - scale (eta / (fromIntegral $ length spls)) nw -- subtract nablas from biases b' = b - scale (eta / (fromIntegral $ length spls)) nb in layer { weights = w', biases = b' } -- | Backpropagate the error and calculate the partial derivatives for the -- weights and biases in each layer. -- Returns a list of layers holding the nablas accordingly. backprop :: forall a. (Numeric a, Floating a, Floating (Vector a)) => Network a -> CostFunction -> Sample a -> [Layer a] backprop net costFunction spl = finalNablas where rawFeedforward :: [(Vector a, Vector a)] rawFeedforward = reverse $ rawOutputs net sigmoid (fst spl) -- get starting activation and raw value headZ, headA :: Vector a (headZ, headA) = head rawFeedforward -- get starting delta, based on the activation of the last layer startDelta = getDelta costFunction headZ headA (snd spl) -- calculate nabla of biases lastNablaB = startDelta -- calculate nabla of weighs of last layer in advance lastNablaW = startDelta `outer` previousA where previousA | length rawFeedforward > 1 = snd $ rawFeedforward !! 1 | otherwise = fst spl lastLayer = Layer { weights = lastNablaW, biases = lastNablaB } -- reverse layers, analogy to the reversed (z, a) list layersReversed = reverse $ layers net -- calculate nablas, beginning at the end of the network (startDelta) (finalNablas, _) = foldl' calculate ([lastLayer], startDelta) [1..length layersReversed - 1] -- takes the index and updates nablas calculate (nablas, oldDelta) idx = let -- extract raw and activated value (z, _) = rawFeedforward !! idx -- apply prime derivative of sigmoid z' = cmap sigmoid' z -- calculate new delta w = weights $ layersReversed !! (idx - 1) delta = (tr w #> oldDelta) * z' -- nablaB is just the delta vector nablaB = delta -- activation in previous layer aPrevious = snd $ rawFeedforward !! 
(idx + 1) -- dot product of delta and the activation in the previous layer nablaW = delta `outer` aPrevious -- put nablas into a new layer in (Layer { weights = nablaW, biases = nablaB } : nablas, delta) -- | The sigmoid function sigmoid :: Floating a => ActivationFunction a sigmoid x = 1 / (1 + exp (-x)) -- | The derivative of the sigmoid function. sigmoid' :: Floating a => ActivationFunctionDerivative a sigmoid' x = sigmoid x * (1 - sigmoid x) -- | Shuffle a list randomly. shuffle :: [a] -> IO [a] shuffle xs = do ar <- newArr n xs forM [1..n] $ \i -> do j <- randomRIO (i,n) vi <- readArray ar i vj <- readArray ar j writeArray ar j vi return vj where n = length xs newArr :: Int -> [a] -> IO (IOArray Int a) newArr len lst = newListArray (1,len) lst -- | Saves the network as the given filename. When the file already exists, -- it looks for another filename by increasing the version, e.g -- /mnist.net/ becomes /mnist1.net/. saveNetwork :: (Element a, Binary a) => FilePath -> Network a -> IO () saveNetwork fp net = do ex <- doesFileExist fp case ex of True -> saveNetwork (newFileName fp) net False -> encodeFile fp net -- | Find a new filename by replacing the old version with the next higher one. newFileName :: FilePath -> FilePath newFileName fp = case fp =~ "(.+[a-z]){0,1}([0-9]*)(\\..*)" :: [[String]] of [[_, p, v, s]] -> p ++ show (version v + 1) ++ s _ -> fp ++ "l" where version :: String -> Int version xs = fromMaybe 0 (readMaybe xs :: Maybe Int) -- | Load the network with the given filename. loadNetwork :: (Element a, Binary a) => FilePath -> IO (Network a) loadNetwork = decodeFile