Haskell Artificial Neural Networking library
You cannot select more than 25 topics. Topics must start with a letter or a number, can include dashes ('-') and can be up to 35 characters long.

367 linhas
14KB

  1. {-# LANGUAGE ScopedTypeVariables, FlexibleContexts, BangPatterns #-}
  2. {-# OPTIONS -Wall #-}
  3. -- |
  4. -- Module : Network
  5. -- Copyright : (c) 2017 Christian Merten
  6. -- Maintainer : c.merten@gmx.net
  7. -- Stability : experimental
  8. -- Portability : GHC
  9. --
  10. -- An implementation of artificial feed-forward neural networks in pure Haskell.
  11. --
  12. -- An example is added in /XOR.hs/
  13. module Network (
  14. -- * Network
  15. Network(..),
  16. Layer(..),
  17. newNetwork,
  18. output,
  19. -- * Learning functions
  20. trainShuffled,
  21. trainNTimes,
  22. CostFunction(..),
  23. getDelta,
  24. LearningRate,
  25. Lambda,
  26. TrainingDataLength,
  27. Sample, Samples, (-->),
  28. -- * Activation functions
  29. ActivationFunction, ActivationFunctionDerivative,
  30. sigmoid,
  31. sigmoid',
  32. -- * Network serialization
  33. saveNetwork,
  34. loadNetwork
  35. ) where
  36. import Data.List (foldl')
  37. import Data.List.Split (chunksOf)
  38. import Data.Maybe (fromMaybe)
  39. import Text.Read (readMaybe)
  40. import Data.Binary
  41. import System.Directory
  42. import System.Random
  43. import Control.Monad (zipWithM, forM)
  44. import Data.Array.IO
  45. import Debug.Trace (trace)
  46. import Text.Regex.PCRE
  47. import Numeric.LinearAlgebra
-- | The generic feedforward network type: a list of 'Layer's ordered from
-- the first hidden layer to the output layer, with a minimum of one layer
-- (the output layer).  A 'Binary' instance is implemented for
-- serialization.  It is usually constructed using the 'newNetwork'
-- function rather than directly.
data Network a = Network { layers :: [Layer a] }
  deriving (Show)
-- | One layer of a network, storing the weights matrix and the biases
-- vector of this layer.  The matrix has one row per neuron in this layer
-- and one column per input from the previous layer (see 'newNetwork',
-- which builds it as @randn outputSize inputSize@), so a forward step is
-- @weights #> input + biases@.
data Layer a = Layer { weights :: Matrix a, biases :: Vector a }
  deriving (Show)
  58. instance (Element a, Binary a) => Binary (Network a) where
  59. put (Network ls) = put ls
  60. get = Network `fmap` get
  61. instance (Element a, Binary a) => Binary (Layer a) where
  62. put (Layer ws bs) = do
  63. put (toLists ws)
  64. put (toList bs)
  65. get = do
  66. ws <- get
  67. bs <- get
  68. return $ Layer (fromLists ws) (fromList bs)
-- | Cost function enum used during training.  The choice only affects the
-- output-layer error term: with 'CrossEntropyCost' the sigmoid-derivative
-- factor is omitted (see 'getDelta').
data CostFunction = QuadraticCost
                  | CrossEntropyCost
                  deriving (Show, Eq)
  73. -- | getDelta based on the raw input, the activated input and the desired output
  74. -- results in different values depending on the CostFunction type.
  75. getDelta :: Floating a => CostFunction -> a -> a -> a -> a
  76. getDelta QuadraticCost z a y = (a - y) * sigmoid'(z)
  77. getDelta CrossEntropyCost _ a y = a - y
-- | Activation function used to calculate the actual output of a neuron.
-- Usually the 'sigmoid' function.
type ActivationFunction a = a -> a

-- | The derivative of an activation function, e.g. 'sigmoid''.
type ActivationFunctionDerivative a = a -> a

-- | One training sample: @(input vector, desired output vector)@.
--
-- > trainingData :: Samples Double
-- > trainingData = [ fromList [0, 0] --> fromList [0],
-- >                  fromList [0, 1] --> fromList [1],
-- >                  fromList [1, 0] --> fromList [1],
-- >                  fromList [1, 1] --> fromList [0]]
type Sample a = (Vector a, Vector a)

-- | A list of 'Sample's.
type Samples a = [Sample a]

-- | A simple synonym for the @(,)@ operator, so samples can be written
-- intuitively as @input --> expectedOutput@.
(-->) :: Vector a -> Vector a -> Sample a
(-->) = (,)

-- | The learning rate, affects the learning speed: a lower learning rate
-- results in slower learning, but usually better results after more
-- epochs.
type LearningRate a = a

-- | Lambda value affecting the regularization while learning.
type Lambda a = a

-- | Wrapper around the training data length (total number of samples).
type TrainingDataLength = Int

-- | Size of one mini batch: the subset of the training set over which the
-- weight updates are averaged.
type MiniBatchSize = Int
  107. -- | Initializes a new network with random values for weights and biases
  108. -- in all layers.
  109. --
  110. -- > net <- newNetwork [2, 3, 4]
  111. newNetwork :: [Int] -> IO (Network Double)
  112. newNetwork layerSizes
  113. | length layerSizes < 2 = error "Network too small!"
  114. | otherwise = do
  115. lays <- zipWithM go (init layerSizes) (tail layerSizes)
  116. return $ Network lays
  117. where go :: Int -> Int -> IO (Layer Double)
  118. go inputSize outputSize = do
  119. ws <- (/ (sqrt $ fromIntegral inputSize)) <$> randn outputSize inputSize
  120. seed <- randomIO
  121. let bs = randomVector seed Gaussian outputSize
  122. return $ Layer ws bs
  123. -- | Calculate the output of the network based on the network, a given
  124. -- 'ActivationFunction' and the input vector.
  125. output :: (Numeric a, Num (Vector a))
  126. => Network a
  127. -> ActivationFunction a
  128. -> Vector a
  129. -> Vector a
  130. output net act input = foldl' f input (layers net)
  131. where f vec layer = cmap act ((weights layer #> vec) + biases layer)
  132. rawOutputs :: (Numeric a, Num (Vector a))
  133. => Network a
  134. -> ActivationFunction a
  135. -> Vector a
  136. -> [(Vector a, Vector a)]
  137. rawOutputs net act input = scanl f (input, input) (layers net)
  138. where f (_, a) layer = let z' = (weights layer #> a) + biases layer in
  139. (z', cmap act z')
  140. -- | The most used training function, randomly shuffling the training set before
  141. -- every training epoch
  142. --
  143. -- > trainShuffled 30 (\n e -> "") net CrossEntropyCost 0.5 trainData 10 0.1
  144. trainShuffled :: forall a. (Numeric a, Floating a, Floating (Vector a))
  145. => Int
  146. -> (Network a -> Int -> String)
  147. -> Network a
  148. -> CostFunction
  149. -> Lambda a
  150. -> Samples a
  151. -> MiniBatchSize
  152. -> LearningRate a
  153. -> IO (Network a)
  154. trainShuffled 0 _ net _ _ _ _ _ = return net
  155. trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize eta = do
  156. spls <- shuffle trainSamples
  157. let !net' = trainSGD net costFunction lambda spls miniBatchSize eta
  158. trace (debug net' epochs)
  159. (trainShuffled (epochs - 1) debug net' costFunction lambda trainSamples miniBatchSize eta)
  160. -- | Pure version of 'trainShuffled', training the network /n/ times without
  161. -- shuffling the training set, resulting in slightly worse results.
  162. trainNTimes :: forall a. (Numeric a, Floating a, Floating (Vector a))
  163. => Int
  164. -> (Network a -> Int -> String)
  165. -> Network a
  166. -> CostFunction
  167. -> Lambda a
  168. -> Samples a
  169. -> MiniBatchSize
  170. -> LearningRate a
  171. -> Network a
  172. trainNTimes 0 _ net _ _ _ _ _ = net
  173. trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta =
  174. trace (debug net' epochs)
  175. (trainNTimes (epochs - 1) debug net' costFunction lambda trainSamples miniBatchSize eta)
  176. where !net' = trainSGD net costFunction lambda trainSamples miniBatchSize eta
  177. -- | Train the network using Stochastic Gradient Descent.
  178. -- This is an improved version of Gradient Descent splitting the training data into
  179. -- subsets to update the networks weights more often resulting in better accuracy.
  180. --
  181. -- On each mini batch, 'update' is called to calculate the improved network.
  182. trainSGD :: forall a. (Numeric a, Floating a, Floating (Vector a))
  183. => Network a
  184. -> CostFunction
  185. -> Lambda a
  186. -> Samples a
  187. -> MiniBatchSize
  188. -> LearningRate a
  189. -> Network a
  190. trainSGD net costFunction lambda trainSamples miniBatchSize eta =
  191. foldl' updateMiniBatch net (chunksOf miniBatchSize trainSamples)
  192. where -- update network based on given mini batch using Gradient Descent
  193. updateMiniBatch :: Network a -> Samples a -> Network a
  194. updateMiniBatch = update eta costFunction lambda (length trainSamples)
  195. -- | Update the network using a set of samples and Gradient Descent.
  196. -- This takes one mini batch to perform GD.
  197. update :: forall a. (Numeric a, Floating a, Floating (Vector a))
  198. => LearningRate a
  199. -> CostFunction
  200. -> Lambda a
  201. -> TrainingDataLength
  202. -> Network a
  203. -> Samples a
  204. -> Network a
  205. update eta costFunction lambda n net spls = case mayNewNablas of
  206. Nothing -> net
  207. Just newNablas -> net { layers = applyNablas newNablas }
  208. where -- calculate new nablas based on samples
  209. mayNewNablas :: Maybe [Layer a]
  210. mayNewNablas = foldl' updateNablas Nothing spls
  211. -- update nablas by calculating new nablas and adding them to the
  212. -- existing ones
  213. updateNablas :: Maybe [Layer a] -> Sample a -> Maybe [Layer a]
  214. updateNablas mayNablas sample =
  215. let -- calculate new nablas for this training sample
  216. !nablasDelta = backprop net costFunction sample
  217. -- takes an existing nabla layer and adds the delta
  218. addDelta nabla nablaDelta =
  219. nabla { weights = weights nabla + weights nablaDelta,
  220. biases = biases nabla + biases nablaDelta }
  221. in case mayNablas of
  222. -- if there are already nablas, add the new ones
  223. Just nablas -> Just $ zipWith addDelta nablas nablasDelta
  224. -- otherwise return the new ones
  225. Nothing -> Just $ nablasDelta
  226. -- apply nablas to layers of the network
  227. applyNablas :: [Layer a] -> [Layer a]
  228. applyNablas nablas = zipWith applyNabla (layers net) nablas
  229. -- apply nabla to one layer
  230. applyNabla :: Layer a -> Layer a -> Layer a
  231. applyNabla layer nabla =
  232. let w = weights layer -- weights matrix
  233. nw = weights nabla -- weights nablas matrix
  234. b = biases layer -- biases vector
  235. nb = biases nabla -- biases nablas vector
  236. fac = 1 - eta * (lambda / fromIntegral n)
  237. -- subtract nablas from weights
  238. w' = scale fac w - scale (eta / (fromIntegral $ length spls)) nw
  239. -- subtract nablas from biases
  240. b' = b - scale (eta / (fromIntegral $ length spls)) nb
  241. in layer { weights = w', biases = b' }
-- | Backpropagate the error and calculate the partial derivatives for the
-- weights and biases in each layer.
-- Returns a list of layers holding the nablas accordingly (same order as
-- 'layers').
backprop :: forall a. (Numeric a, Floating a, Floating (Vector a))
         => Network a
         -> CostFunction
         -> Sample a
         -> [Layer a]
backprop net costFunction spl = finalNablas
  where -- feed the sample input forward, keeping every (z, activation)
        -- pair; reversed so index 0 is the OUTPUT layer and the last
        -- entry is the (input, input) seed from 'rawOutputs'
        rawFeedforward :: [(Vector a, Vector a)]
        rawFeedforward = reverse $ rawOutputs net sigmoid (fst spl)
        -- raw and activated value of the output layer
        headZ, headA :: Vector a
        (headZ, headA) = head rawFeedforward
        -- starting delta, based on the activation of the last layer and
        -- the desired output (snd spl)
        startDelta = getDelta costFunction headZ headA (snd spl)
        -- nabla of the output layer's biases is the delta itself
        lastNablaB = startDelta
        -- nabla of the output layer's weights: outer product of the delta
        -- with the previous layer's activation
        lastNablaW = startDelta `outer` previousA
          where previousA
                  -- normally the activation one step before the output
                  | length rawFeedforward > 1 = snd $ rawFeedforward !! 1
                  -- single-layer network: the previous activation is the
                  -- raw sample input
                  | otherwise = fst spl
        lastLayer = Layer { weights = lastNablaW, biases = lastNablaB }
        -- reverse layers, analogy to the reversed (z, a) list:
        -- layersReversed !! 0 is the output layer
        layersReversed = reverse $ layers net
        -- walk back through the remaining layers, starting from the
        -- output layer's (nabla, delta); prepending keeps finalNablas in
        -- the original front-to-back layer order
        (finalNablas, _) = foldl' calculate ([lastLayer], startDelta)
                             [1..length layersReversed - 1]
        -- takes the index (into the REVERSED lists) and prepends the
        -- nablas for that layer, threading the new delta along
        calculate (nablas, oldDelta) idx =
          let -- raw (pre-activation) value of this layer
              (z, _) = rawFeedforward !! idx
              -- apply the derivative of the sigmoid
              z' = cmap sigmoid' z
              -- weights of the layer one step CLOSER to the output, whose
              -- transpose propagates the delta backwards
              w = weights $ layersReversed !! (idx - 1)
              delta = (tr w #> oldDelta) * z'
              -- nablaB is just the delta vector
              nablaB = delta
              -- activation in the previous layer (idx + 1 in the reversed
              -- list; safe because the scan includes the input seed)
              aPrevious = snd $ rawFeedforward !! (idx + 1)
              -- outer product of delta and the previous activation
              nablaW = delta `outer` aPrevious
              -- put nablas into a new layer
          in (Layer { weights = nablaW, biases = nablaB } : nablas, delta)
  288. -- | The sigmoid function
  289. sigmoid :: Floating a => ActivationFunction a
  290. sigmoid x = 1 / (1 + exp (-x))
  291. -- | The derivative of the sigmoid function.
  292. sigmoid' :: Floating a => ActivationFunctionDerivative a
  293. sigmoid' x = sigmoid x * (1 - sigmoid x)
  294. -- | Shuffle a list randomly.
  295. shuffle :: [a] -> IO [a]
  296. shuffle xs = do
  297. ar <- newArr n xs
  298. forM [1..n] $ \i -> do
  299. j <- randomRIO (i,n)
  300. vi <- readArray ar i
  301. vj <- readArray ar j
  302. writeArray ar j vi
  303. return vj
  304. where
  305. n = length xs
  306. newArr :: Int -> [a] -> IO (IOArray Int a)
  307. newArr len lst = newListArray (1,len) lst
  308. -- | Saves the network as the given filename. When the file already exists,
  309. -- it looks for another filename by increasing the version, e.g
  310. -- /mnist.net/ becomes /mnist1.net/.
  311. saveNetwork :: (Element a, Binary a) => FilePath -> Network a -> IO ()
  312. saveNetwork fp net = do
  313. ex <- doesFileExist fp
  314. case ex of
  315. True -> saveNetwork (newFileName fp) net
  316. False -> encodeFile fp net
-- | Find a new filename by replacing the old version number with the next
-- higher one.  The PCRE groups capture: a prefix ending in a lowercase
-- letter (optional), a run of digits (the version, empty parses as 0 via
-- 'readMaybe'/'fromMaybe'), and the extension including the dot — so
-- @"mnist.net"@ becomes @"mnist1.net"@ and @"mnist1.net"@ becomes
-- @"mnist2.net"@.  If the pattern does not match at all (e.g. no dot in
-- the name), a literal @l@ is appended to force a different filename.
newFileName :: FilePath -> FilePath
newFileName fp = case fp =~ "(.+[a-z]){0,1}([0-9]*)(\\..*)" :: [[String]] of
  [[_, p, v, s]] -> p ++ show (version v + 1) ++ s
  _ -> fp ++ "l"
  where -- parse the digit run, defaulting to 0 when absent/unreadable
        version :: String -> Int
        version xs = fromMaybe 0 (readMaybe xs :: Maybe Int)
-- | Load a network from the given file; the inverse of 'saveNetwork'
-- (delegates to 'decodeFile' from "Data.Binary").
loadNetwork :: (Element a, Binary a) => FilePath -> IO (Network a)
loadNetwork = decodeFile