{-# LANGUAGE ScopedTypeVariables, FlexibleContexts, BangPatterns #-}
{-# OPTIONS -Wall #-}
-- |
-- Module      : Network
-- Copyright   : (c) 2017 Christian Merten
-- Maintainer  : c.merten@gmx.net
-- Stability   : experimental
-- Portability : GHC
--
-- An implementation of artificial feed-forward neural networks in pure Haskell.
--
-- An example is added in /XOR.hs/
module Network (
    -- * Network
    Network(..),
    Layer(..),
    newNetwork,
    output,
    -- * Learning functions
    trainShuffled,
    trainNTimes,
    CostFunction(..),
    getDelta,
    LearningRate,
    Lambda,
    TrainingDataLength,
    Sample,
    Samples,
    (-->),
    -- * Activation functions
    ActivationFunction,
    ActivationFunctionDerivative,
    sigmoid,
    sigmoid',
    -- * Network serialization
    saveNetwork,
    loadNetwork
  ) where

import Data.List.Split (chunksOf)
import Data.Binary
import Data.Maybe (fromMaybe)
import Text.Read (readMaybe)
import System.Directory
import System.Random
import Control.Monad (zipWithM, forM)
import Data.Array.IO
import Debug.Trace (trace)
import Text.Regex.PCRE
import Numeric.LinearAlgebra

-- | The generic feedforward network type, a binary instance is implemented.
-- It takes a list of layers with a minimum of one (output layer).
-- It is usually constructed using the `newNetwork` function.
data Network a = Network { layers :: [Layer a] } deriving (Show)

-- | One layer of a network, storing the weights matrix and the biases vector
-- of this layer.
data Layer a = Layer { weights :: Matrix a, biases :: Vector a } deriving (Show)

-- Serialize a network simply as its list of layers.
instance (Element a, Binary a) => Binary (Network a) where
  put (Network ls) = put ls
  get = Network <$> get

-- A layer is serialized as plain nested lists, so the on-disk format does
-- not depend on hmatrix internals.
instance (Element a, Binary a) => Binary (Layer a) where
  put (Layer ws bs) = put (toLists ws) >> put (toList bs)
  get = Layer <$> (fromLists <$> get) <*> (fromList <$> get)

-- | Enumeration of the cost functions supported while training.
data CostFunction = QuadraticCost | CrossEntropyCost deriving (Show, Eq)

-- | Output-layer error (delta) computed from the raw input @z@, the
-- activated output @a@ and the desired output @y@; the formula depends on
-- the chosen 'CostFunction'.
getDelta :: Floating a => CostFunction -> a -> a -> a -> a
getDelta costFn z a y =
  case costFn of
    QuadraticCost    -> (a - y) * sigmoid' z
    CrossEntropyCost -> a - y

-- | Activation function applied to a neuron's raw input; usually 'sigmoid'.
type ActivationFunction a = a -> a

-- | Derivative of an 'ActivationFunction'.
type ActivationFunctionDerivative a = a -> a

-- | One training example: the input vector paired with the expected output.
--
-- > trainingData :: Samples Double
-- > trainingData = [ fromList [0, 0] --> fromList [0],
-- >                  fromList [0, 1] --> fromList [1],
-- >                  fromList [1, 0] --> fromList [1],
-- >                  fromList [1, 1] --> fromList [0]]
type Sample a = (Vector a, Vector a)

-- | A list of 'Sample's
type Samples a = [Sample a]

-- | Synonym for the pairing constructor '(,)': builds a 'Sample' from an
-- input vector and its expected output.
(-->) :: Vector a -> Vector a -> Sample a
(-->) = (,)

-- | Gradient-descent step size; a lower rate learns more slowly but usually
-- yields better results after more epochs.
type LearningRate = Double

-- | Regularization strength applied while learning.
type Lambda = Double

-- | Wrapper around the training data length.
type TrainingDataLength = Int

-- | Initializes a new network with random values for weights and biases
-- in all layers.
--
-- > net <- newNetwork [2, 3, 4]
newNetwork :: [Int] -> IO (Network Double)
newNetwork layerSizes
  | length layerSizes < 2 = error "Network too small!"
  | otherwise =
      -- Each consecutive pair of sizes yields one layer: weights are drawn
      -- from randn, biases from a Gaussian-distributed random vector.
      Network <$> zipWithM mkLayer (init layerSizes) (tail layerSizes)
  where
    mkLayer :: Int -> Int -> IO (Layer Double)
    mkLayer nIn nOut = do
      ws <- randn nOut nIn
      seed <- randomIO
      return $ Layer ws (randomVector seed Gaussian nOut)

-- | Calculate the output of the network for the given input vector using
-- the supplied 'ActivationFunction'.
output :: (Numeric a, Num (Vector a)) => Network a -> ActivationFunction a -> Vector a -> Vector a
output net act input = foldl step input (layers net)
  where
    step v (Layer w b) = cmap act ((w #> v) + b)

-- Feed the input forward, keeping for every layer both the raw weighted
-- input and its activated value; the head pair is the (duplicated) input.
rawOutputs :: (Numeric a, Num (Vector a)) => Network a -> ActivationFunction a -> Vector a -> [(Vector a, Vector a)]
rawOutputs net act input = scanl step (input, input) (layers net)
  where
    step (_, a) (Layer w b) =
      let raw = (w #> a) + b
      in (raw, cmap act raw)

-- | The most used training function, randomly shuffling the training set
-- before every training epoch.  The @debug@ callback's result is emitted
-- via 'Debug.Trace.trace' after each epoch.
--
-- > trainShuffled 30 (\n e -> "") net CrossEntropyCost 0.5 trainData 10 0.1
trainShuffled :: Int -> (Network Double -> Int -> String) -> Network Double -> CostFunction -> Lambda -> Samples Double -> Int -> Double -> IO (Network Double)
trainShuffled 0 _ net _ _ _ _ _ = return net
trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize eta = do
  shuffled <- shuffle trainSamples
  -- Force this epoch's result before recursing so no thunk chain builds up
  -- across epochs.
  let !trained = trainSGD net costFunction lambda shuffled miniBatchSize eta
  trace (debug trained epochs) $
    trainShuffled (epochs - 1) debug trained costFunction lambda trainSamples miniBatchSize eta

-- | Pure version of 'trainShuffled', training the network /n/ times without
-- shuffling the training set, resulting in slightly worse results.
trainNTimes :: Int -> (Network Double -> Int -> String) -> Network Double -> CostFunction -> Lambda -> Samples Double -> Int -> Double -> Network Double
trainNTimes 0 _ net _ _ _ _ _ = net
trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta =
  trace (debug net' epochs)
        (trainNTimes (epochs - 1) debug net' costFunction lambda trainSamples miniBatchSize eta)
  where
    -- Bang pattern forces each epoch's network before recursing, avoiding
    -- a thunk chain across epochs.
    !net' = trainSGD net costFunction lambda trainSamples miniBatchSize eta

-- One epoch of mini-batch stochastic gradient descent: the samples are
-- split into chunks of @miniBatchSize@ and the network is updated once per
-- chunk.
-- NOTE(review): the (Numeric Double, Floating Double) constraint is on a
-- concrete type and trivially satisfied; it is what requires
-- FlexibleContexts and could be dropped.
trainSGD :: (Numeric Double, Floating Double) => Network Double -> CostFunction -> Lambda -> Samples Double -> Int -> Double -> Network Double
trainSGD net costFunction lambda trainSamples miniBatchSize eta =
  -- NOTE(review): lazy foldl; harmless for typical batch counts, but
  -- foldl' would avoid building thunks over many mini-batches.
  foldl updateMiniBatch net (chunksOf miniBatchSize trainSamples)
  where
    updateMiniBatch = update eta costFunction lambda (length trainSamples)

-- Apply one gradient-descent step to the network using the accumulated
-- backprop gradients of one mini-batch @spls@.  @n@ is the size of the
-- whole training set (used for L2 regularization).  An empty mini-batch
-- yields no gradients (Nothing) and leaves the network unchanged.
update :: LearningRate -> CostFunction -> Lambda -> TrainingDataLength -> Network Double -> Samples Double -> Network Double
update eta costFunction lambda n net spls =
  case newNablas of
    Nothing -> net
    Just x -> net { layers = layers' x }
  where
    -- Sum of per-sample gradients over the mini-batch; Nothing until the
    -- first sample has been processed.
    newNablas :: Maybe [Layer Double]
    newNablas = foldl updateNablas Nothing spls
    -- Add one sample's gradients (per layer) to the running totals.
    updateNablas :: Maybe [Layer Double] -> Sample Double -> Maybe [Layer Double]
    updateNablas mayNablas sample =
      let nablasDelta = backprop net costFunction sample
          f nabla nablaDelta = nabla { weights = weights nabla + weights nablaDelta,
                                       biases  = biases nabla + biases nablaDelta }
      in case mayNablas of
           Just nablas -> Just $ zipWith f nablas nablasDelta
           Nothing -> Just $ nablasDelta
    -- New layer list: each layer stepped by its accumulated gradient.
    layers' :: [Layer Double] -> [Layer Double]
    layers' nablas = zipWith updateLayer (layers net) nablas
    -- w' = (1 - eta*lambda/n) * w - (eta/batchSize) * nablaW  (L2 weight
    -- decay plus gradient step); biases get no regularization.
    updateLayer :: Layer Double -> Layer Double -> Layer Double
    updateLayer layer nabla =
      let w = weights layer   -- weights matrix
          nw = weights nabla
          b = biases layer    -- biases vector
          nb = biases nabla
          fac = 1 - eta * (lambda / fromIntegral n)
          w' = scale fac w - scale (eta / (fromIntegral $ length spls)) nw
          b' = b - scale (eta / (fromIntegral $ length spls)) nb
      in layer { weights = w', biases = b' }

-- Backpropagation for a single sample: returns one gradient "layer"
-- (nabla-w matrix, nabla-b vector) per network layer, in network order.
backprop :: Network Double -> CostFunction -> Sample Double -> [Layer Double]
backprop net costFunction spl = finalNablas
  where
    -- Feed-forward trace (raw z, activated a) per layer, reversed so index
    -- 0 is the output layer and the last entry is the (input, input) pair.
    rawFeedforward :: [(Vector Double, Vector Double)]
    rawFeedforward = reverse $ rawOutputs net sigmoid (fst spl)
    -- get starting activation and raw value (output layer)
    headZ, headA :: Vector Double
    (headZ, headA) = head rawFeedforward
    -- get starting delta, based on the activation of the last layer
    startDelta = getDelta costFunction headZ headA (snd spl)
    -- calculate gradients of the last layer in advance
    lastNablaB = startDelta
    lastNablaW = startDelta `outer` previousA
      where
        -- Activation feeding the output layer; for a single-layer network
        -- that is the sample input itself.
        previousA | length rawFeedforward > 1 = snd $ rawFeedforward !! 1
                  | otherwise = fst spl
    lastLayer = Layer { weights = lastNablaW, biases = lastNablaB }
    -- reverse layers, analogy to the reversed (z, a) list
    layersReversed = reverse $ layers net
    -- calculate nablas, beginning at the end of the network (startDelta);
    -- idx walks backwards through the hidden layers.
    (finalNablas, _) = foldl calculate ([lastLayer], startDelta) [1..length layersReversed - 1]
    -- takes the index and updates nablas
    calculate (nablas, oldDelta) idx =
      let -- extract raw and activated value
          (z, _) = rawFeedforward !! idx
          -- apply the derivative of sigmoid element-wise
          z' = cmap sigmoid' z
          -- calculate new delta from the *following* layer's weights
          w = weights $ layersReversed !! (idx - 1)
          delta = (tr w #> oldDelta) * z'
          -- nablaB is just the delta vector
          nablaB = delta
          -- activation in previous layer
          aPrevious = snd $ rawFeedforward !! (idx + 1)
          -- outer product of delta and the previous layer's activation
          nablaW = delta `outer` aPrevious
          -- put nablas into a new layer; prepending keeps network order
      in (Layer { weights = nablaW, biases = nablaB } : nablas, delta)

-- | The sigmoid function
sigmoid :: Floating a => ActivationFunction a
sigmoid x = 1 / (1 + exp (-x))

-- | The derivative of the sigmoid function.
sigmoid' :: Floating a => ActivationFunctionDerivative a
sigmoid' x = sigmoid x * (1 - sigmoid x)

-- Fisher-Yates shuffle of a list, performed in a mutable 'IOArray' so each
-- permutation is equally likely.  The forM result is the shuffled list.
shuffle :: [a] -> IO [a]
shuffle xs = do
  ar <- newArr n xs
  forM [1..n] $ \i -> do
    -- Pick a partner slot j >= i, emit its value and park the element
    -- from slot i there.
    j <- randomRIO (i,n)
    vi <- readArray ar i
    vj <- readArray ar j
    writeArray ar j vi
    return vj
  where
    n = length xs
    -- 1-based array holding the list elements.
    newArr :: Int -> [a] -> IO (IOArray Int a)
    newArr len lst = newListArray (1,len) lst

-- | Saves the network as the given filename. When the file already exists,
-- it looks for another filename by increasing the version, e.g
-- /mnist.net/ becomes /mnist1.net/.
saveNetwork :: (Element a, Binary a) => FilePath -> Network a -> IO ()
saveNetwork fp net = do
  ex <- doesFileExist fp
  -- Recurse with a bumped file name until a free one is found.
  if ex
    then saveNetwork (newFileName fp) net
    else encodeFile fp net

-- Derive the next candidate file name by incrementing the numeric version
-- embedded before the extension, e.g. "mnist3.net" -> "mnist4.net"
-- ("mnist.net" -> "mnist1.net", since a missing version reads as 0).
newFileName :: FilePath -> FilePath
newFileName fp =
  case fp =~ "(.+[a-z]){0,1}([0-9]*)(\\..*)" :: [[String]] of
    [[_, p, v, s]] -> p ++ show (version v + 1) ++ s
    -- NOTE(review): for names the regex cannot parse (e.g. no extension)
    -- this appends a literal "l"; that looks like a typo for "1" -- confirm
    -- the intent before changing, as existing saved files may rely on it.
    _ -> fp ++ "l"
  where
    version :: String -> Int
    version xs = fromMaybe 0 (readMaybe xs :: Maybe Int)

-- | Load the network with the given filename.
loadNetwork :: (Element a, Binary a) => FilePath -> IO (Network a)
loadNetwork = decodeFile