|
|
@@ -115,14 +115,18 @@ type Samples a = [Sample a] |
|
|
|
|
|
|
|
|
-- | The learning rate, affects the learning speed, lower learning rate results |
|
|
-- | The learning rate, affects the learning speed, lower learning rate results |
|
|
-- in slower learning, but usually better results after more epochs. |
|
|
-- in slower learning, but usually better results after more epochs. |
|
|
type LearningRate = Double |
|
|
|
|
|
|
|
|
type LearningRate a = a |
|
|
|
|
|
|
|
|
-- | Lambda value affecting the regularization while learning. |
|
|
-- | Lambda value affecting the regularization while learning. |
|
|
type Lambda = Double |
|
|
|
|
|
|
|
|
type Lambda a = a |
|
|
|
|
|
|
|
|
-- | Wrapper around the training data length. |
|
|
-- | Wrapper around the training data length. |
|
|
type TrainingDataLength = Int |
|
|
type TrainingDataLength = Int |
|
|
|
|
|
|
|
|
|
|
|
-- | Size of one mini batch, subset of training set used to update the weights |
|
|
|
|
|
-- averaged over this batch. |
|
|
|
|
|
type MiniBatchSize = Int |
|
|
|
|
|
|
|
|
-- | Initializes a new network with random values for weights and biases |
|
|
-- | Initializes a new network with random values for weights and biases |
|
|
-- in all layers. |
|
|
-- in all layers. |
|
|
-- |
|
|
-- |
|
|
@@ -135,7 +139,7 @@ newNetwork layerSizes |
|
|
return $ Network lays |
|
|
return $ Network lays |
|
|
where go :: Int -> Int -> IO (Layer Double) |
|
|
where go :: Int -> Int -> IO (Layer Double) |
|
|
go inputSize outputSize = do |
|
|
go inputSize outputSize = do |
|
|
ws <- randn outputSize inputSize |
|
|
|
|
|
|
|
|
ws <- fmap (/ (sqrt $ fromIntegral inputSize)) (randn outputSize inputSize) |
|
|
seed <- randomIO |
|
|
seed <- randomIO |
|
|
let bs = randomVector seed Gaussian outputSize |
|
|
let bs = randomVector seed Gaussian outputSize |
|
|
return $ Layer ws bs |
|
|
return $ Layer ws bs |
|
|
@@ -163,15 +167,16 @@ rawOutputs net act input = scanl f (input, input) (layers net) |
|
|
-- every training epoch |
|
|
-- every training epoch |
|
|
-- |
|
|
-- |
|
|
-- > trainShuffled 30 (\n e -> "") net CrossEntropyCost 0.5 trainData 10 0.1 |
|
|
-- > trainShuffled 30 (\n e -> "") net CrossEntropyCost 0.5 trainData 10 0.1 |
|
|
trainShuffled :: Int |
|
|
|
|
|
-> (Network Double -> Int -> String) |
|
|
|
|
|
-> Network Double |
|
|
|
|
|
|
|
|
trainShuffled :: forall a. (Numeric a, Floating a, Floating (Vector a)) |
|
|
|
|
|
=> Int |
|
|
|
|
|
-> (Network a -> Int -> String) |
|
|
|
|
|
-> Network a |
|
|
-> CostFunction |
|
|
-> CostFunction |
|
|
-> Lambda |
|
|
|
|
|
-> Samples Double |
|
|
|
|
|
-> Int |
|
|
|
|
|
-> Double |
|
|
|
|
|
-> IO (Network Double) |
|
|
|
|
|
|
|
|
-> Lambda a |
|
|
|
|
|
-> Samples a |
|
|
|
|
|
-> MiniBatchSize |
|
|
|
|
|
-> LearningRate a |
|
|
|
|
|
-> IO (Network a) |
|
|
trainShuffled 0 _ net _ _ _ _ _ = return net |
|
|
trainShuffled 0 _ net _ _ _ _ _ = return net |
|
|
trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize eta = do |
|
|
trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize eta = do |
|
|
spls <- shuffle trainSamples |
|
|
spls <- shuffle trainSamples |
|
|
@@ -182,15 +187,16 @@ trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize et |
|
|
|
|
|
|
|
|
-- | Pure version of 'trainShuffled', training the network /n/ times without |
|
|
-- | Pure version of 'trainShuffled', training the network /n/ times without |
|
|
-- shuffling the training set, resulting in slightly worse results. |
|
|
-- shuffling the training set, resulting in slightly worse results. |
|
|
trainNTimes :: Int |
|
|
|
|
|
-> (Network Double -> Int -> String) |
|
|
|
|
|
-> Network Double |
|
|
|
|
|
|
|
|
trainNTimes :: forall a. (Numeric a, Floating a, Floating (Vector a)) |
|
|
|
|
|
=> Int |
|
|
|
|
|
-> (Network a -> Int -> String) |
|
|
|
|
|
-> Network a |
|
|
-> CostFunction |
|
|
-> CostFunction |
|
|
-> Lambda |
|
|
|
|
|
-> Samples Double |
|
|
|
|
|
-> Int |
|
|
|
|
|
-> Double |
|
|
|
|
|
-> Network Double |
|
|
|
|
|
|
|
|
-> Lambda a |
|
|
|
|
|
-> Samples a |
|
|
|
|
|
-> MiniBatchSize |
|
|
|
|
|
-> LearningRate a |
|
|
|
|
|
-> Network a |
|
|
trainNTimes 0 _ net _ _ _ _ _ = net |
|
|
trainNTimes 0 _ net _ _ _ _ _ = net |
|
|
trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta = |
|
|
trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta = |
|
|
trace (debug net' epochs) |
|
|
trace (debug net' epochs) |
|
|
@@ -198,58 +204,86 @@ trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta |
|
|
where !net' = trainSGD net costFunction lambda trainSamples miniBatchSize eta |
|
|
where !net' = trainSGD net costFunction lambda trainSamples miniBatchSize eta |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
trainSGD :: (Numeric Double, Floating Double) |
|
|
|
|
|
=> Network Double |
|
|
|
|
|
|
|
|
-- | Train the network using Stochastic Gradient Descent. |
|
|
|
|
|
-- This is an improved version of Gradient Descent splitting the training data into |
|
|
|
|
|
-- subsets to update the networks weights more often resulting in better accuracy. |
|
|
|
|
|
-- |
|
|
|
|
|
-- On each mini batch, 'update' is called to calculate the improved network. |
|
|
|
|
|
trainSGD :: forall a. (Numeric a, Floating a, Floating (Vector a)) |
|
|
|
|
|
=> Network a |
|
|
-> CostFunction |
|
|
-> CostFunction |
|
|
-> Lambda |
|
|
|
|
|
-> Samples Double |
|
|
|
|
|
-> Int |
|
|
|
|
|
-> Double |
|
|
|
|
|
-> Network Double |
|
|
|
|
|
|
|
|
-> Lambda a |
|
|
|
|
|
-> Samples a |
|
|
|
|
|
-> MiniBatchSize |
|
|
|
|
|
-> LearningRate a |
|
|
|
|
|
-> Network a |
|
|
trainSGD net costFunction lambda trainSamples miniBatchSize eta = |
|
|
trainSGD net costFunction lambda trainSamples miniBatchSize eta = |
|
|
foldl' updateMiniBatch net (chunksOf miniBatchSize trainSamples) |
|
|
foldl' updateMiniBatch net (chunksOf miniBatchSize trainSamples) |
|
|
where updateMiniBatch = update eta costFunction lambda (length trainSamples) |
|
|
|
|
|
|
|
|
|
|
|
update :: LearningRate |
|
|
|
|
|
|
|
|
where -- update network based on given mini batch using Gradient Descent |
|
|
|
|
|
updateMiniBatch :: Network a -> Samples a -> Network a |
|
|
|
|
|
updateMiniBatch = update eta costFunction lambda (length trainSamples) |
|
|
|
|
|
|
|
|
|
|
|
-- | Update the network using a set of samples and Gradient Descent. |
|
|
|
|
|
-- This takes one mini batch to perform GD. |
|
|
|
|
|
update :: forall a. (Numeric a, Floating a, Floating (Vector a)) |
|
|
|
|
|
=> LearningRate a |
|
|
-> CostFunction |
|
|
-> CostFunction |
|
|
-> Lambda |
|
|
|
|
|
|
|
|
-> Lambda a |
|
|
-> TrainingDataLength |
|
|
-> TrainingDataLength |
|
|
-> Network Double |
|
|
|
|
|
-> Samples Double |
|
|
|
|
|
-> Network Double |
|
|
|
|
|
update eta costFunction lambda n net spls = case newNablas of |
|
|
|
|
|
|
|
|
-> Network a |
|
|
|
|
|
-> Samples a |
|
|
|
|
|
-> Network a |
|
|
|
|
|
update eta costFunction lambda n net spls = case mayNewNablas of |
|
|
Nothing -> net |
|
|
Nothing -> net |
|
|
Just x -> net { layers = layers' x } |
|
|
|
|
|
where newNablas :: Maybe [Layer Double] |
|
|
|
|
|
newNablas = foldl' updateNablas Nothing spls |
|
|
|
|
|
updateNablas :: Maybe [Layer Double] -> Sample Double -> Maybe [Layer Double] |
|
|
|
|
|
|
|
|
Just newNablas -> net { layers = applyNablas newNablas } |
|
|
|
|
|
where -- calculate new nablas based on samples |
|
|
|
|
|
mayNewNablas :: Maybe [Layer a] |
|
|
|
|
|
mayNewNablas = foldl' updateNablas Nothing spls |
|
|
|
|
|
-- update nablas by calculating new nablas and adding them to the |
|
|
|
|
|
-- existing ones |
|
|
|
|
|
updateNablas :: Maybe [Layer a] -> Sample a -> Maybe [Layer a] |
|
|
updateNablas mayNablas sample = |
|
|
updateNablas mayNablas sample = |
|
|
let !nablasDelta = backprop net costFunction sample |
|
|
|
|
|
f nabla nablaDelta = |
|
|
|
|
|
|
|
|
let -- calculate new nablas for this training sample |
|
|
|
|
|
!nablasDelta = backprop net costFunction sample |
|
|
|
|
|
-- takes an existing nabla layer and adds the delta |
|
|
|
|
|
addDelta nabla nablaDelta = |
|
|
nabla { weights = weights nabla + weights nablaDelta, |
|
|
nabla { weights = weights nabla + weights nablaDelta, |
|
|
biases = biases nabla + biases nablaDelta } |
|
|
biases = biases nabla + biases nablaDelta } |
|
|
in case mayNablas of |
|
|
in case mayNablas of |
|
|
Just nablas -> Just $ zipWith f nablas nablasDelta |
|
|
|
|
|
|
|
|
-- if there are already nablas, add the new ones |
|
|
|
|
|
Just nablas -> Just $ zipWith addDelta nablas nablasDelta |
|
|
|
|
|
-- otherwise return the new ones |
|
|
Nothing -> Just $ nablasDelta |
|
|
Nothing -> Just $ nablasDelta |
|
|
layers' :: [Layer Double] -> [Layer Double] |
|
|
|
|
|
layers' nablas = zipWith updateLayer (layers net) nablas |
|
|
|
|
|
updateLayer :: Layer Double -> Layer Double -> Layer Double |
|
|
|
|
|
updateLayer layer nabla = |
|
|
|
|
|
|
|
|
-- apply nablas to layers of the network |
|
|
|
|
|
applyNablas :: [Layer a] -> [Layer a] |
|
|
|
|
|
applyNablas nablas = zipWith applyNabla (layers net) nablas |
|
|
|
|
|
-- apply nabla to one layer |
|
|
|
|
|
applyNabla :: Layer a -> Layer a -> Layer a |
|
|
|
|
|
applyNabla layer nabla = |
|
|
let w = weights layer -- weights matrix |
|
|
let w = weights layer -- weights matrix |
|
|
nw = weights nabla |
|
|
|
|
|
|
|
|
nw = weights nabla -- weights nablas matrix |
|
|
b = biases layer -- biases vector |
|
|
b = biases layer -- biases vector |
|
|
nb = biases nabla |
|
|
|
|
|
|
|
|
nb = biases nabla -- biases nablas vector |
|
|
fac = 1 - eta * (lambda / fromIntegral n) |
|
|
fac = 1 - eta * (lambda / fromIntegral n) |
|
|
|
|
|
-- subtract nablas from weights |
|
|
w' = scale fac w - scale (eta / (fromIntegral $ length spls)) nw |
|
|
w' = scale fac w - scale (eta / (fromIntegral $ length spls)) nw |
|
|
|
|
|
-- subtract nablas from biases |
|
|
b' = b - scale (eta / (fromIntegral $ length spls)) nb |
|
|
b' = b - scale (eta / (fromIntegral $ length spls)) nb |
|
|
in layer { weights = w', biases = b' } |
|
|
in layer { weights = w', biases = b' } |
|
|
|
|
|
|
|
|
backprop :: Network Double -> CostFunction -> Sample Double -> [Layer Double] |
|
|
|
|
|
|
|
|
-- | Backpropagate the error and calculate the partial derivatives for the |
|
|
|
|
|
-- weights and biases in each layer. |
|
|
|
|
|
-- Returns a list of layers holding the nablas accordingly. |
|
|
|
|
|
backprop :: forall a. (Numeric a, Floating a, Floating (Vector a)) |
|
|
|
|
|
=> Network a |
|
|
|
|
|
-> CostFunction |
|
|
|
|
|
-> Sample a |
|
|
|
|
|
-> [Layer a] |
|
|
backprop net costFunction spl = finalNablas |
|
|
backprop net costFunction spl = finalNablas |
|
|
where rawFeedforward :: [(Vector Double, Vector Double)] |
|
|
|
|
|
|
|
|
where rawFeedforward :: [(Vector a, Vector a)] |
|
|
rawFeedforward = reverse $ rawOutputs net sigmoid (fst spl) |
|
|
rawFeedforward = reverse $ rawOutputs net sigmoid (fst spl) |
|
|
-- get starting activation and raw value |
|
|
-- get starting activation and raw value |
|
|
headZ, headA :: Vector Double |
|
|
|
|
|
|
|
|
headZ, headA :: Vector a |
|
|
(headZ, headA) = head rawFeedforward |
|
|
(headZ, headA) = head rawFeedforward |
|
|
-- get starting delta, based on the activation of the last layer |
|
|
-- get starting delta, based on the activation of the last layer |
|
|
startDelta = getDelta costFunction headZ headA (snd spl) |
|
|
startDelta = getDelta costFunction headZ headA (snd spl) |
|
|
@@ -293,6 +327,7 @@ sigmoid x = 1 / (1 + exp (-x)) |
|
|
sigmoid' :: Floating a => ActivationFunctionDerivative a |
|
|
sigmoid' :: Floating a => ActivationFunctionDerivative a |
|
|
sigmoid' x = sigmoid x * (1 - sigmoid x) |
|
|
sigmoid' x = sigmoid x * (1 - sigmoid x) |
|
|
|
|
|
|
|
|
|
|
|
-- | Shuffle a list randomly. |
|
|
shuffle :: [a] -> IO [a] |
|
|
shuffle :: [a] -> IO [a] |
|
|
shuffle xs = do |
|
|
shuffle xs = do |
|
|
ar <- newArr n xs |
|
|
ar <- newArr n xs |
|
|
@@ -317,6 +352,7 @@ saveNetwork fp net = do |
|
|
True -> saveNetwork (newFileName fp) net |
|
|
True -> saveNetwork (newFileName fp) net |
|
|
False -> encodeFile fp net |
|
|
False -> encodeFile fp net |
|
|
|
|
|
|
|
|
|
|
|
-- | Find a new filename by replacing the old version with the next higher one. |
|
|
newFileName :: FilePath -> FilePath |
|
|
newFileName :: FilePath -> FilePath |
|
|
newFileName fp = case fp =~ "(.+[a-z]){0,1}([0-9]*)(\\..*)" :: [[String]] of |
|
|
newFileName fp = case fp =~ "(.+[a-z]){0,1}([0-9]*)(\\..*)" :: [[String]] of |
|
|
[[_, p, v, s]] -> p ++ show (version v + 1) ++ s |
|
|
[[_, p, v, s]] -> p ++ show (version v + 1) ++ s |
|
|
|