-- Haskell artificial neural networking library.
  1. {-# LANGUAGE ScopedTypeVariables, FlexibleContexts, BangPatterns #-}
  2. {-# OPTIONS -Wall #-}
  3. -- |
  4. -- Module : Network
  5. -- Copyright : (c) 2017 Christian Merten
  6. -- Maintainer : c.merten@gmx.net
  7. -- Stability : experimental
  8. -- Portability : GHC
  9. --
  10. -- An implementation of artifical feed-forward neural networks in pure Haskell.
  11. --
  12. -- An example is added in /XOR.hs/
  13. module Network (
  14. -- * Network
  15. Network(..),
  16. Layer(..),
  17. newNetwork,
  18. output,
  19. -- * Learning functions
  20. trainShuffled,
  21. trainNTimes,
  22. CostFunction(..),
  23. getDelta,
  24. LearningRate,
  25. Lambda,
  26. TrainingDataLength,
  27. Sample, Samples, (-->),
  28. -- * Activation functions
  29. ActivationFunction, ActivationFunctionDerivative,
  30. sigmoid,
  31. sigmoid',
  32. -- * Network serialization
  33. saveNetwork,
  34. loadNetwork
  35. ) where
import Control.Monad (forM, zipWithM)
import Data.Array.IO
import Data.Binary
import Data.List (foldl')
import Data.List.Split (chunksOf)
import Data.Maybe (fromMaybe)
import Debug.Trace (trace)
import Numeric.LinearAlgebra
import System.Directory
import System.Random
import Text.Read (readMaybe)
import Text.Regex.PCRE
  48. -- | The generic feedforward network type, a binary instance is implemented.
  49. -- It takes a list of layers
  50. -- with a minimum of one (output layer).
  51. -- It is usually constructed using the `newNetwork` function.
  52. data Network a = Network { layers :: [Layer a] }
  53. deriving (Show)
  54. -- | One layer of a network, storing the weights matrix and the biases vector
  55. -- of this layer.
  56. data Layer a = Layer { weights :: Matrix a, biases :: Vector a }
  57. deriving (Show)
  58. instance (Element a, Binary a) => Binary (Network a) where
  59. put (Network ls) = put ls
  60. get = Network `fmap` get
  61. instance (Element a, Binary a) => Binary (Layer a) where
  62. put (Layer ws bs) = do
  63. put (toLists ws)
  64. put (toList bs)
  65. get = do
  66. ws <- get
  67. bs <- get
  68. return $ Layer (fromLists ws) (fromList bs)
  69. -- | Cost Function Enum
  70. data CostFunction = QuadraticCost
  71. | CrossEntropyCost
  72. deriving (Show, Eq)
  73. -- | getDelta based on the raw input, the activated input and the desired output
  74. -- results in different values depending on the CostFunction type.
  75. getDelta :: Floating a => CostFunction -> a -> a -> a -> a
  76. getDelta QuadraticCost z a y = (a - y) * sigmoid'(z)
  77. getDelta CrossEntropyCost _ a y = a - y
  78. -- | Activation function used to calculate the actual output of a neuron.
  79. -- Usually the 'sigmoid' function.
  80. type ActivationFunction a = a -> a
  81. -- | The derivative of an activation function.
  82. type ActivationFunctionDerivative a = a -> a
  83. -- | Training sample that can be used for the training functions.
  84. --
  85. -- > trainingData :: Samples Double
  86. -- > trainingData = [ fromList [0, 0] --> fromList [0],
  87. -- > fromList [0, 1] --> fromList [1],
  88. -- > fromList [1, 0] --> fromList [1],
  89. -- > fromList [1, 1] --> fromList [0]]
  90. type Sample a = (Vector a, Vector a)
  91. -- | A list of 'Sample's
  92. type Samples a = [Sample a]
  93. -- | A simple synonym for the (,) operator, used to create samples very intuitively.
  94. (-->) :: Vector a -> Vector a -> Sample a
  95. (-->) = (,)
  96. -- | The learning rate, affects the learning speed, lower learning rate results
  97. -- in slower learning, but usually better results after more epochs.
  98. type LearningRate = Double
  99. -- | Lambda value affecting the regularization while learning.
  100. type Lambda = Double
  101. -- | Wrapper around the training data length.
  102. type TrainingDataLength = Int
  103. -- | Initializes a new network with random values for weights and biases
  104. -- in all layers.
  105. --
  106. -- > net <- newNetwork [2, 3, 4]
  107. newNetwork :: [Int] -> IO (Network Double)
  108. newNetwork layerSizes
  109. | length layerSizes < 2 = error "Network too small!"
  110. | otherwise = do
  111. lays <- zipWithM go (init layerSizes) (tail layerSizes)
  112. return $ Network lays
  113. where go :: Int -> Int -> IO (Layer Double)
  114. go inputSize outputSize = do
  115. ws <- randn outputSize inputSize
  116. seed <- randomIO
  117. let bs = randomVector seed Gaussian outputSize
  118. return $ Layer ws bs
  119. -- | Calculate the output of the network based on the network, a given
  120. -- 'ActivationFunction' and the input vector.
  121. output :: (Numeric a, Num (Vector a))
  122. => Network a
  123. -> ActivationFunction a
  124. -> Vector a
  125. -> Vector a
  126. output net act input = foldl' f input (layers net)
  127. where f vec layer = cmap act ((weights layer #> vec) + biases layer)
  128. rawOutputs :: (Numeric a, Num (Vector a))
  129. => Network a
  130. -> ActivationFunction a
  131. -> Vector a
  132. -> [(Vector a, Vector a)]
  133. rawOutputs net act input = scanl f (input, input) (layers net)
  134. where f (_, a) layer = let z' = (weights layer #> a) + biases layer in
  135. (z', cmap act z')
  136. -- | The most used training function, randomly shuffling the training set before
  137. -- every training epoch
  138. --
  139. -- > trainShuffled 30 (\n e -> "") net CrossEntropyCost 0.5 trainData 10 0.1
  140. trainShuffled :: Int
  141. -> (Network Double -> Int -> String)
  142. -> Network Double
  143. -> CostFunction
  144. -> Lambda
  145. -> Samples Double
  146. -> Int
  147. -> Double
  148. -> IO (Network Double)
  149. trainShuffled 0 _ net _ _ _ _ _ = return net
  150. trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize eta = do
  151. spls <- shuffle trainSamples
  152. let !net' = trainSGD net costFunction lambda spls miniBatchSize eta
  153. trace (debug net' epochs)
  154. (trainShuffled (epochs - 1) debug net' costFunction lambda trainSamples miniBatchSize eta)
  155. -- | Pure version of 'trainShuffled', training the network /n/ times without
  156. -- shuffling the training set, resulting in slightly worse results.
  157. trainNTimes :: Int
  158. -> (Network Double -> Int -> String)
  159. -> Network Double
  160. -> CostFunction
  161. -> Lambda
  162. -> Samples Double
  163. -> Int
  164. -> Double
  165. -> Network Double
  166. trainNTimes 0 _ net _ _ _ _ _ = net
  167. trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta =
  168. trace (debug net' epochs)
  169. (trainNTimes (epochs - 1) debug net' costFunction lambda trainSamples miniBatchSize eta)
  170. where !net' = trainSGD net costFunction lambda trainSamples miniBatchSize eta
  171. trainSGD :: (Numeric Double, Floating Double)
  172. => Network Double
  173. -> CostFunction
  174. -> Lambda
  175. -> Samples Double
  176. -> Int
  177. -> Double
  178. -> Network Double
  179. trainSGD net costFunction lambda trainSamples miniBatchSize eta =
  180. foldl' updateMiniBatch net (chunksOf miniBatchSize trainSamples)
  181. where updateMiniBatch = update eta costFunction lambda (length trainSamples)
  182. update :: LearningRate
  183. -> CostFunction
  184. -> Lambda
  185. -> TrainingDataLength
  186. -> Network Double
  187. -> Samples Double
  188. -> Network Double
  189. update eta costFunction lambda n net spls = case newNablas of
  190. Nothing -> net
  191. Just x -> net { layers = layers' x }
  192. where newNablas :: Maybe [Layer Double]
  193. newNablas = foldl' updateNablas Nothing spls
  194. updateNablas :: Maybe [Layer Double] -> Sample Double -> Maybe [Layer Double]
  195. updateNablas mayNablas sample =
  196. let !nablasDelta = backprop net costFunction sample
  197. f nabla nablaDelta =
  198. nabla { weights = weights nabla + weights nablaDelta,
  199. biases = biases nabla + biases nablaDelta }
  200. in case mayNablas of
  201. Just nablas -> Just $ zipWith f nablas nablasDelta
  202. Nothing -> Just $ nablasDelta
  203. layers' :: [Layer Double] -> [Layer Double]
  204. layers' nablas = zipWith updateLayer (layers net) nablas
  205. updateLayer :: Layer Double -> Layer Double -> Layer Double
  206. updateLayer layer nabla =
  207. let w = weights layer -- weights matrix
  208. nw = weights nabla
  209. b = biases layer -- biases vector
  210. nb = biases nabla
  211. fac = 1 - eta * (lambda / fromIntegral n)
  212. w' = scale fac w - scale (eta / (fromIntegral $ length spls)) nw
  213. b' = b - scale (eta / (fromIntegral $ length spls)) nb
  214. in layer { weights = w', biases = b' }
  215. backprop :: Network Double -> CostFunction -> Sample Double -> [Layer Double]
  216. backprop net costFunction spl = finalNablas
  217. where rawFeedforward :: [(Vector Double, Vector Double)]
  218. rawFeedforward = reverse $ rawOutputs net sigmoid (fst spl)
  219. -- get starting activation and raw value
  220. headZ, headA :: Vector Double
  221. (headZ, headA) = head rawFeedforward
  222. -- get starting delta, based on the activation of the last layer
  223. startDelta = getDelta costFunction headZ headA (snd spl)
  224. -- calculate nabla of biases
  225. lastNablaB = startDelta
  226. -- calculate nabla of weighs of last layer in advance
  227. lastNablaW = startDelta `outer` previousA
  228. where previousA
  229. | length rawFeedforward > 1 = snd $ rawFeedforward !! 1
  230. | otherwise = fst spl
  231. lastLayer = Layer { weights = lastNablaW, biases = lastNablaB }
  232. -- reverse layers, analogy to the reversed (z, a) list
  233. layersReversed = reverse $ layers net
  234. -- calculate nablas, beginning at the end of the network (startDelta)
  235. (finalNablas, _) = foldl' calculate ([lastLayer], startDelta)
  236. [1..length layersReversed - 1]
  237. -- takes the index and updates nablas
  238. calculate (nablas, oldDelta) idx =
  239. let -- extract raw and activated value
  240. (z, _) = rawFeedforward !! idx
  241. -- apply prime derivative of sigmoid
  242. z' = cmap sigmoid' z
  243. -- calculate new delta
  244. w = weights $ layersReversed !! (idx - 1)
  245. delta = (tr w #> oldDelta) * z'
  246. -- nablaB is just the delta vector
  247. nablaB = delta
  248. -- activation in previous layer
  249. aPrevious = snd $ rawFeedforward !! (idx + 1)
  250. -- dot product of delta and the activation in the previous layer
  251. nablaW = delta `outer` aPrevious
  252. -- put nablas into a new layer
  253. in (Layer { weights = nablaW, biases = nablaB } : nablas, delta)
  254. -- | The sigmoid function
  255. sigmoid :: Floating a => ActivationFunction a
  256. sigmoid x = 1 / (1 + exp (-x))
  257. -- | The derivative of the sigmoid function.
  258. sigmoid' :: Floating a => ActivationFunctionDerivative a
  259. sigmoid' x = sigmoid x * (1 - sigmoid x)
  260. shuffle :: [a] -> IO [a]
  261. shuffle xs = do
  262. ar <- newArr n xs
  263. forM [1..n] $ \i -> do
  264. j <- randomRIO (i,n)
  265. vi <- readArray ar i
  266. vj <- readArray ar j
  267. writeArray ar j vi
  268. return vj
  269. where
  270. n = length xs
  271. newArr :: Int -> [a] -> IO (IOArray Int a)
  272. newArr len lst = newListArray (1,len) lst
  273. -- | Saves the network as the given filename. When the file already exists,
  274. -- it looks for another filename by increasing the version, e.g
  275. -- /mnist.net/ becomes /mnist1.net/.
  276. saveNetwork :: (Element a, Binary a) => FilePath -> Network a -> IO ()
  277. saveNetwork fp net = do
  278. ex <- doesFileExist fp
  279. case ex of
  280. True -> saveNetwork (newFileName fp) net
  281. False -> encodeFile fp net
  282. newFileName :: FilePath -> FilePath
  283. newFileName fp = case fp =~ "(.+[a-z]){0,1}([0-9]*)(\\..*)" :: [[String]] of
  284. [[_, p, v, s]] -> p ++ show (version v + 1) ++ s
  285. _ -> fp ++ "l"
  286. where version :: String -> Int
  287. version xs = fromMaybe 0 (readMaybe xs :: Maybe Int)
  288. -- | Load the network with the given filename.
  289. loadNetwork :: (Element a, Binary a) => FilePath -> IO (Network a)
  290. loadNetwork = decodeFile