{-# LANGUAGE ScopedTypeVariables, FlexibleContexts, BangPatterns #-}
{-# OPTIONS -Wall #-}
-- |
-- Module      : Network
-- Copyright   : (c) 2017 Christian Merten
-- Maintainer  : c.merten@gmx.net
-- Stability   : experimental
-- Portability : GHC
--
-- An implementation of artificial feed-forward neural networks in pure Haskell.
--
-- An example is added in /XOR.hs/
module Network (
    -- * Network
    Network(..),
    Layer(..),
    newNetwork,
    output,
    -- * Learning functions
    trainShuffled,
    trainNTimes,
    CostFunction(..),
    getDelta,
    LearningRate,
    Lambda,
    TrainingDataLength,
    Sample,
    Samples,
    (-->),
    -- * Activation functions
    ActivationFunction,
    ActivationFunctionDerivative,
    sigmoid,
    sigmoid',
    -- * Network serialization
    saveNetwork,
    loadNetwork
  ) where

import Data.List.Split (chunksOf)
import Data.Binary
import Data.Maybe (fromMaybe)
import Text.Read (readMaybe)
import System.Directory
import System.Random
import Control.Monad (zipWithM, forM)
import Data.Array.IO
import Debug.Trace (trace)
import Text.Regex.PCRE
import Numeric.LinearAlgebra

-- | The generic feedforward network type, a binary instance is implemented.
-- It takes a list of layers with a minimum of one (output layer).
-- It is usually constructed using the `newNetwork` function.
data Network a = Network { layers :: [Layer a] } deriving (Show)

-- | One layer of a network, storing the weights matrix and the biases vector
-- of this layer.
data Layer a = Layer { weights :: Matrix a, biases :: Vector a } deriving (Show)

-- Serialize a network simply as its list of layers.
instance (Element a, Binary a) => Binary (Network a) where
  put (Network ls) = put ls
  get = Network <$> get

-- A layer is serialized as plain nested lists, so the on-disk format does
-- not depend on hmatrix internals.
instance (Element a, Binary a) => Binary (Layer a) where
  put (Layer ws bs) = put (toLists ws) >> put (toList bs)
  get = Layer <$> (fromLists <$> get) <*> (fromList <$> get)

-- | Enumeration of the cost functions supported while training.
data CostFunction = QuadraticCost | CrossEntropyCost deriving (Show, Eq)

-- | Output-layer error (delta) computed from the raw input @z@, the
-- activated output @a@ and the desired output @y@; the formula depends on
-- the chosen 'CostFunction'.
getDelta :: Floating a => CostFunction -> a -> a -> a -> a
getDelta costFn z a y =
  case costFn of
    QuadraticCost    -> (a - y) * sigmoid' z
    CrossEntropyCost -> a - y

-- | Activation function applied to a neuron's raw input; usually 'sigmoid'.
type ActivationFunction a = a -> a

-- | Derivative of an 'ActivationFunction'.
type ActivationFunctionDerivative a = a -> a

-- | One training example: the input vector paired with the expected output.
--
-- > trainingData :: Samples Double
-- > trainingData = [ fromList [0, 0] --> fromList [0],
-- >                  fromList [0, 1] --> fromList [1],
-- >                  fromList [1, 0] --> fromList [1],
-- >                  fromList [1, 1] --> fromList [0]]
type Sample a = (Vector a, Vector a)

-- | A list of 'Sample's
type Samples a = [Sample a]

-- | Synonym for the pairing constructor '(,)': builds a 'Sample' from an
-- input vector and its expected output.
(-->) :: Vector a -> Vector a -> Sample a
(-->) = (,)

-- | Gradient-descent step size; a lower rate learns more slowly but usually
-- yields better results after more epochs.
type LearningRate = Double

-- | Regularization strength applied while learning.
type Lambda = Double

-- | Wrapper around the training data length.
type TrainingDataLength = Int

-- | Initializes a new network with random values for weights and biases
-- in all layers.
--
-- > net <- newNetwork [2, 3, 4]
newNetwork :: [Int] -> IO (Network Double)
newNetwork layerSizes
  | length layerSizes < 2 = error "Network too small!"
  | otherwise =
      -- Each consecutive pair of sizes yields one layer: weights are drawn
      -- from randn, biases from a Gaussian-distributed random vector.
      Network <$> zipWithM mkLayer (init layerSizes) (tail layerSizes)
  where
    mkLayer :: Int -> Int -> IO (Layer Double)
    mkLayer nIn nOut = do
      ws <- randn nOut nIn
      seed <- randomIO
      return $ Layer ws (randomVector seed Gaussian nOut)

-- | Calculate the output of the network for the given input vector using
-- the supplied 'ActivationFunction'.
output :: (Numeric a, Num (Vector a)) => Network a -> ActivationFunction a -> Vector a -> Vector a
output net act input = foldl step input (layers net)
  where
    step v (Layer w b) = cmap act ((w #> v) + b)

-- Feed the input forward, keeping for every layer both the raw weighted
-- input and its activated value; the head pair is the (duplicated) input.
rawOutputs :: (Numeric a, Num (Vector a)) => Network a -> ActivationFunction a -> Vector a -> [(Vector a, Vector a)]
rawOutputs net act input = scanl step (input, input) (layers net)
  where
    step (_, a) (Layer w b) =
      let raw = (w #> a) + b
      in (raw, cmap act raw)

-- | The most used training function, randomly shuffling the training set
-- before every training epoch.  The @debug@ callback's result is emitted
-- via 'Debug.Trace.trace' after each epoch.
--
-- > trainShuffled 30 (\n e -> "") net CrossEntropyCost 0.5 trainData 10 0.1
trainShuffled :: Int -> (Network Double -> Int -> String) -> Network Double -> CostFunction -> Lambda -> Samples Double -> Int -> Double -> IO (Network Double)
trainShuffled 0 _ net _ _ _ _ _ = return net
trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize eta = do
  shuffled <- shuffle trainSamples
  -- Force this epoch's result before recursing so no thunk chain builds up
  -- across epochs.
  let !trained = trainSGD net costFunction lambda shuffled miniBatchSize eta
  trace (debug trained epochs) $
    trainShuffled (epochs - 1) debug trained costFunction lambda trainSamples miniBatchSize eta

-- | Pure version of 'trainShuffled', training the network /n/ times without
-- shuffling the training set, resulting in slightly worse results.
trainNTimes :: Int -> (Network Double -> Int -> String) -> Network Double -> CostFunction -> Lambda -> Samples Double -> Int -> Double -> Network Double
trainNTimes 0 _ net _ _ _ _ _ = net
trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta =
  trace (debug net' epochs)
        (trainNTimes (epochs - 1) debug net' costFunction lambda trainSamples miniBatchSize eta)
  where
    -- Bang pattern forces each epoch's network before recursing, avoiding
    -- a thunk chain across epochs.
    !net' = trainSGD net costFunction lambda trainSamples miniBatchSize eta

-- One epoch of mini-batch stochastic gradient descent: the samples are
-- split into chunks of @miniBatchSize@ and the network is updated once per
-- chunk.
-- NOTE(review): the (Numeric Double, Floating Double) constraint is on a
-- concrete type and trivially satisfied; it is what requires
-- FlexibleContexts and could be dropped.
trainSGD :: (Numeric Double, Floating Double) => Network Double -> CostFunction -> Lambda -> Samples Double -> Int -> Double -> Network Double
trainSGD net costFunction lambda trainSamples miniBatchSize eta =
  -- NOTE(review): lazy foldl; harmless for typical batch counts, but
  -- foldl' would avoid building thunks over many mini-batches.
  foldl updateMiniBatch net (chunksOf miniBatchSize trainSamples)
  where
    updateMiniBatch = update eta costFunction lambda (length trainSamples)

-- Apply one gradient-descent step to the network using the accumulated
-- backprop gradients of one mini-batch @spls@.  @n@ is the size of the
-- whole training set (used for L2 regularization).  An empty mini-batch
-- yields no gradients (Nothing) and leaves the network unchanged.
update :: LearningRate -> CostFunction -> Lambda -> TrainingDataLength -> Network Double -> Samples Double -> Network Double
update eta costFunction lambda n net spls =
  case newNablas of
    Nothing -> net
    Just x -> net { layers = layers' x }
  where
    -- Sum of per-sample gradients over the mini-batch; Nothing until the
    -- first sample has been processed.
    newNablas :: Maybe [Layer Double]
    newNablas = foldl updateNablas Nothing spls
    -- Add one sample's gradients (per layer) to the running totals.
    updateNablas :: Maybe [Layer Double] -> Sample Double -> Maybe [Layer Double]
    updateNablas mayNablas sample =
      let nablasDelta = backprop net costFunction sample
          f nabla nablaDelta = nabla { weights = weights nabla + weights nablaDelta,
                                       biases  = biases nabla + biases nablaDelta }
      in case mayNablas of
           Just nablas -> Just $ zipWith f nablas nablasDelta
           Nothing -> Just $ nablasDelta
    -- New layer list: each layer stepped by its accumulated gradient.
    layers' :: [Layer Double] -> [Layer Double]
    layers' nablas = zipWith updateLayer (layers net) nablas
    -- w' = (1 - eta*lambda/n) * w - (eta/batchSize) * nablaW  (L2 weight
    -- decay plus gradient step); biases get no regularization.
    updateLayer :: Layer Double -> Layer Double -> Layer Double
    updateLayer layer nabla =
      let w = weights layer   -- weights matrix
          nw = weights nabla
          b = biases layer    -- biases vector
          nb = biases nabla
          fac = 1 - eta * (lambda / fromIntegral n)
          w' = scale fac w - scale (eta / (fromIntegral $ length spls)) nw
          b' = b - scale (eta / (fromIntegral $ length spls)) nb
      in layer { weights = w', biases = b' }

-- Backpropagation for a single sample: returns one gradient "layer"
-- (nabla-w matrix, nabla-b vector) per network layer, in network order.
backprop :: Network Double -> CostFunction -> Sample Double -> [Layer Double]
backprop net costFunction spl = finalNablas
  where
    -- Feed-forward trace (raw z, activated a) per layer, reversed so index
    -- 0 is the output layer and the last entry is the (input, input) pair.
    rawFeedforward :: [(Vector Double, Vector Double)]
    rawFeedforward = reverse $ rawOutputs net sigmoid (fst spl)
    -- get starting activation and raw value (output layer)
    headZ, headA :: Vector Double
    (headZ, headA) = head rawFeedforward
    -- get starting delta, based on the activation of the last layer
    startDelta = getDelta costFunction headZ headA (snd spl)
    -- calculate gradients of the last layer in advance
    lastNablaB = startDelta
    lastNablaW = startDelta `outer` previousA
      where
        -- Activation feeding the output layer; for a single-layer network
        -- that is the sample input itself.
        previousA | length rawFeedforward > 1 = snd $ rawFeedforward !! 1
                  | otherwise = fst spl
    lastLayer = Layer { weights = lastNablaW, biases = lastNablaB }
    -- reverse layers, analogy to the reversed (z, a) list
    layersReversed = reverse $ layers net
    -- calculate nablas, beginning at the end of the network (startDelta);
    -- idx walks backwards through the hidden layers.
    (finalNablas, _) = foldl calculate ([lastLayer], startDelta) [1..length layersReversed - 1]
    -- takes the index and updates nablas
    calculate (nablas, oldDelta) idx =
      let -- extract raw and activated value
          (z, _) = rawFeedforward !! idx
          -- apply the derivative of sigmoid element-wise
          z' = cmap sigmoid' z
          -- calculate new delta from the *following* layer's weights
          w = weights $ layersReversed !! (idx - 1)
          delta = (tr w #> oldDelta) * z'
          -- nablaB is just the delta vector
          nablaB = delta
          -- activation in previous layer
          aPrevious = snd $ rawFeedforward !! (idx + 1)
          -- outer product of delta and the previous layer's activation
          nablaW = delta `outer` aPrevious
          -- put nablas into a new layer; prepending keeps network order
      in (Layer { weights = nablaW, biases = nablaB } : nablas, delta)

-- | The sigmoid function
sigmoid :: Floating a => ActivationFunction a
sigmoid x = 1 / (1 + exp (-x))

-- | The derivative of the sigmoid function.
sigmoid' :: Floating a => ActivationFunctionDerivative a
sigmoid' x = sigmoid x * (1 - sigmoid x)

-- Fisher-Yates shuffle of a list, performed in a mutable 'IOArray' so each
-- permutation is equally likely.  The forM result is the shuffled list.
shuffle :: [a] -> IO [a]
shuffle xs = do
  ar <- newArr n xs
  forM [1..n] $ \i -> do
    -- Pick a partner slot j >= i, emit its value and park the element
    -- from slot i there.
    j <- randomRIO (i,n)
    vi <- readArray ar i
    vj <- readArray ar j
    writeArray ar j vi
    return vj
  where
    n = length xs
    -- 1-based array holding the list elements.
    newArr :: Int -> [a] -> IO (IOArray Int a)
    newArr len lst = newListArray (1,len) lst

-- | Saves the network as the given filename. When the file already exists,
-- it looks for another filename by increasing the version, e.g
-- /mnist.net/ becomes /mnist1.net/.
saveNetwork :: (Element a, Binary a) => FilePath -> Network a -> IO ()
saveNetwork fp net = do
  ex <- doesFileExist fp
  -- Recurse with a bumped file name until a free one is found.
  if ex
    then saveNetwork (newFileName fp) net
    else encodeFile fp net

-- Derive the next candidate file name by incrementing the numeric version
-- embedded before the extension, e.g. "mnist3.net" -> "mnist4.net"
-- ("mnist.net" -> "mnist1.net", since a missing version reads as 0).
newFileName :: FilePath -> FilePath
newFileName fp =
  case fp =~ "(.+[a-z]){0,1}([0-9]*)(\\..*)" :: [[String]] of
    [[_, p, v, s]] -> p ++ show (version v + 1) ++ s
    -- NOTE(review): for names the regex cannot parse (e.g. no extension)
    -- this appends a literal "l"; that looks like a typo for "1" -- confirm
    -- the intent before changing, as existing saved files may rely on it.
    _ -> fp ++ "l"
  where
    version :: String -> Int
    version xs = fromMaybe 0 (readMaybe xs :: Maybe Int)

-- | Load the network with the given filename.
loadNetwork :: (Element a, Binary a) => FilePath -> IO (Network a)
loadNetwork = decodeFile