diff --git a/Backend.hs b/Backend.hs new file mode 100644 index 0000000..ac1143a --- /dev/null +++ b/Backend.hs @@ -0,0 +1,5 @@ +-- | Experimental sketch: evaluate reimplementing the numeric backend with Repa arrays. + +import Data.Array.Repa (Z(..), (:.)(..), DIM0(..), DIM1(..), DIM2(..), U, D, Array(..), + (!)) +import qualified Data.Array.Repa as R diff --git a/MNIST.hs b/MNIST.hs index 72d1e51..d0aaad2 100644 --- a/MNIST.hs +++ b/MNIST.hs @@ -37,7 +37,7 @@ data Arguments = Arguments { eta :: Double, lambda :: Double, arguments = Arguments { eta = 0.5 &= help "Learning rate", lambda = 5 &= help "Lambda of regularization", filePath = "" &= help "Load network from file", - costFunction = Quadratic &= help "Cost function", + costFunction = CrossEntropy &= help "Cost function", epochs = 30 &= help "Number of training epochs", miniBatchSize = 10 &= help "Mini batch size", hiddenNeurons = 30 &= help "Number of neurons in hidden layer" } diff --git a/Network.hs b/Network.hs index e0dc4e1..a3aaf65 100644 --- a/Network.hs +++ b/Network.hs @@ -115,14 +115,18 @@ type Samples a = [Sample a] -- | The learning rate, affects the learning speed, lower learning rate results -- in slower learning, but usually better results after more epochs. -type LearningRate = Double +type LearningRate a = a -- | Lambda value affecting the regularization while learning. -type Lambda = Double +type Lambda a = a -- | Wrapper around the training data length. type TrainingDataLength = Int +-- | Size of one mini batch, subset of training set used to update the weights +-- averaged over this batch. +type MiniBatchSize = Int + -- | Initializes a new network with random values for weights and biases -- in all layers. 
-- @@ -135,7 +139,7 @@ newNetwork layerSizes return $ Network lays where go :: Int -> Int -> IO (Layer Double) go inputSize outputSize = do - ws <- randn outputSize inputSize + ws <- fmap (/ (sqrt $ fromIntegral inputSize)) (randn outputSize inputSize) seed <- randomIO let bs = randomVector seed Gaussian outputSize return $ Layer ws bs @@ -163,15 +167,16 @@ rawOutputs net act input = scanl f (input, input) (layers net) -- every training epoch -- -- > trainShuffled 30 (\n e -> "") net CrossEntropyCost 0.5 trainData 10 0.1 -trainShuffled :: Int - -> (Network Double -> Int -> String) - -> Network Double +trainShuffled :: forall a. (Numeric a, Floating a, Floating (Vector a)) + => Int + -> (Network a -> Int -> String) + -> Network a -> CostFunction - -> Lambda - -> Samples Double - -> Int - -> Double - -> IO (Network Double) + -> Lambda a + -> Samples a + -> MiniBatchSize + -> LearningRate a + -> IO (Network a) trainShuffled 0 _ net _ _ _ _ _ = return net trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize eta = do spls <- shuffle trainSamples @@ -182,15 +187,16 @@ trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize et -- | Pure version of 'trainShuffled', training the network /n/ times without -- shuffling the training set, resulting in slightly worse results. -trainNTimes :: Int - -> (Network Double -> Int -> String) - -> Network Double +trainNTimes :: forall a. 
(Numeric a, Floating a, Floating (Vector a)) + => Int + -> (Network a -> Int -> String) + -> Network a -> CostFunction - -> Lambda - -> Samples Double - -> Int - -> Double - -> Network Double + -> Lambda a + -> Samples a + -> MiniBatchSize + -> LearningRate a + -> Network a trainNTimes 0 _ net _ _ _ _ _ = net trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta = trace (debug net' epochs) @@ -198,58 +204,86 @@ trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta where !net' = trainSGD net costFunction lambda trainSamples miniBatchSize eta -trainSGD :: (Numeric Double, Floating Double) - => Network Double +-- | Train the network using Stochastic Gradient Descent. +-- This is an improved version of Gradient Descent splitting the training data into +-- subsets to update the network's weights more often, resulting in better accuracy. +-- +-- On each mini batch, 'update' is called to calculate the improved network. +trainSGD :: forall a. (Numeric a, Floating a, Floating (Vector a)) + => Network a -> CostFunction - -> Lambda - -> Samples Double - -> Int - -> Double - -> Network Double + -> Lambda a + -> Samples a + -> MiniBatchSize + -> LearningRate a + -> Network a trainSGD net costFunction lambda trainSamples miniBatchSize eta = foldl' updateMiniBatch net (chunksOf miniBatchSize trainSamples) - where updateMiniBatch = update eta costFunction lambda (length trainSamples) - -update :: LearningRate + where -- update network based on given mini batch using Gradient Descent + updateMiniBatch :: Network a -> Samples a -> Network a + updateMiniBatch = update eta costFunction lambda (length trainSamples) + +-- | Update the network using a set of samples and Gradient Descent. +-- This takes one mini batch to perform GD. +update :: forall a. 
(Numeric a, Floating a, Floating (Vector a)) + => LearningRate a -> CostFunction - -> Lambda + -> Lambda a -> TrainingDataLength - -> Network Double - -> Samples Double - -> Network Double -update eta costFunction lambda n net spls = case newNablas of + -> Network a + -> Samples a + -> Network a +update eta costFunction lambda n net spls = case mayNewNablas of Nothing -> net - Just x -> net { layers = layers' x } - where newNablas :: Maybe [Layer Double] - newNablas = foldl' updateNablas Nothing spls - updateNablas :: Maybe [Layer Double] -> Sample Double -> Maybe [Layer Double] + Just newNablas -> net { layers = applyNablas newNablas } + where -- calculate new nablas based on samples + mayNewNablas :: Maybe [Layer a] + mayNewNablas = foldl' updateNablas Nothing spls + -- update nablas by calculating new nablas and adding them to the + -- existing ones + updateNablas :: Maybe [Layer a] -> Sample a -> Maybe [Layer a] updateNablas mayNablas sample = - let !nablasDelta = backprop net costFunction sample - f nabla nablaDelta = + let -- calculate new nablas for this training sample + !nablasDelta = backprop net costFunction sample + -- takes an existing nabla layer and adds the delta + addDelta nabla nablaDelta = nabla { weights = weights nabla + weights nablaDelta, biases = biases nabla + biases nablaDelta } in case mayNablas of - Just nablas -> Just $ zipWith f nablas nablasDelta + -- if there are already nablas, add the new ones + Just nablas -> Just $ zipWith addDelta nablas nablasDelta + -- otherwise return the new ones Nothing -> Just $ nablasDelta - layers' :: [Layer Double] -> [Layer Double] - layers' nablas = zipWith updateLayer (layers net) nablas - updateLayer :: Layer Double -> Layer Double -> Layer Double - updateLayer layer nabla = + -- apply nablas to layers of the network + applyNablas :: [Layer a] -> [Layer a] + applyNablas nablas = zipWith applyNabla (layers net) nablas + -- apply nabla to one layer + applyNabla :: Layer a -> Layer a -> Layer a + 
applyNabla layer nabla = let w = weights layer -- weights matrix - nw = weights nabla + nw = weights nabla -- weights nablas matrix b = biases layer -- biases vector - nb = biases nabla + nb = biases nabla -- biases nablas vector fac = 1 - eta * (lambda / fromIntegral n) + -- subtract nablas from weights w' = scale fac w - scale (eta / (fromIntegral $ length spls)) nw + -- subtract nablas from biases b' = b - scale (eta / (fromIntegral $ length spls)) nb in layer { weights = w', biases = b' } -backprop :: Network Double -> CostFunction -> Sample Double -> [Layer Double] +-- | Backpropagate the error and calculate the partial derivatives for the +-- weights and biases in each layer. +-- Returns a list of layers holding the nablas accordingly. +backprop :: forall a. (Numeric a, Floating a, Floating (Vector a)) + => Network a + -> CostFunction + -> Sample a + -> [Layer a] backprop net costFunction spl = finalNablas - where rawFeedforward :: [(Vector Double, Vector Double)] + where rawFeedforward :: [(Vector a, Vector a)] rawFeedforward = reverse $ rawOutputs net sigmoid (fst spl) -- get starting activation and raw value - headZ, headA :: Vector Double + headZ, headA :: Vector a (headZ, headA) = head rawFeedforward -- get starting delta, based on the activation of the last layer startDelta = getDelta costFunction headZ headA (snd spl) @@ -293,6 +327,7 @@ sigmoid x = 1 / (1 + exp (-x)) sigmoid' :: Floating a => ActivationFunctionDerivative a sigmoid' x = sigmoid x * (1 - sigmoid x) +-- | Shuffle a list randomly. shuffle :: [a] -> IO [a] shuffle xs = do ar <- newArr n xs @@ -317,6 +352,7 @@ saveNetwork fp net = do True -> saveNetwork (newFileName fp) net False -> encodeFile fp net +-- | Find a new filename by replacing the old version with the next higher one. newFileName :: FilePath -> FilePath newFileName fp = case fp =~ "(.+[a-z]){0,1}([0-9]*)(\\..*)" :: [[String]] of [[_, p, v, s]] -> p ++ show (version v + 1) ++ s