Haskell artificial neural network library
  1. {-# LANGUAGE ScopedTypeVariables, FlexibleContexts, BangPatterns #-}
  2. {-# OPTIONS -Wall #-}
  3. -- |
  4. -- Module : Network
  5. -- Copyright : (c) 2017 Christian Merten
  6. -- Maintainer : c.merten@gmx.net
  7. -- Stability : experimental
  8. -- Portability : GHC
  9. --
-- An implementation of artificial feed-forward neural networks in pure Haskell.
  11. --
  12. -- An example is added in /XOR.hs/
  13. module Network (
  14. -- * Network
  15. Network(..),
  16. Layer(..),
  17. newNetwork,
  18. output,
  19. -- * Learning functions
  20. trainShuffled,
  21. trainNTimes,
  22. CostFunction(..),
  23. getDelta,
  24. LearningRate,
  25. Lambda,
  26. TrainingDataLength,
  27. Sample, Samples, (-->),
  28. -- * Activation functions
  29. ActivationFunction, ActivationFunctionDerivative,
  30. sigmoid,
  31. sigmoid',
  32. -- * Network serialization
  33. saveNetwork,
  34. loadNetwork
  35. ) where
  36. import Data.List.Split (chunksOf)
  37. import Data.List (foldl')
  38. import Data.Binary
  39. import Data.Maybe (fromMaybe)
  40. import Text.Read (readMaybe)
  41. import System.Directory
  42. import System.Random
  43. import Control.Monad (zipWithM, forM)
  44. import Data.Array.IO
  45. import Debug.Trace (trace)
  46. import Text.Regex.PCRE
  47. import Numeric.LinearAlgebra
-- | The generic feedforward network type; a 'Binary' instance is implemented.
-- It takes a list of layers
-- with a minimum of one (output layer).
-- It is usually constructed using the 'newNetwork' function.
data Network a = Network { layers :: [Layer a] }
  deriving (Show)
-- | One layer of a network, storing the weights matrix and the biases vector
-- of this layer.  For a layer with /m/ inputs and /n/ neurons the weights
-- matrix is /n x m/ and the biases vector has length /n/ (see 'newNetwork').
data Layer a = Layer { weights :: Matrix a, biases :: Vector a }
  deriving (Show)
  58. instance (Element a, Binary a) => Binary (Network a) where
  59. put (Network ls) = put ls
  60. get = Network `fmap` get
  61. instance (Element a, Binary a) => Binary (Layer a) where
  62. put (Layer ws bs) = do
  63. put (toLists ws)
  64. put (toList bs)
  65. get = do
  66. ws <- get
  67. bs <- get
  68. return $ Layer (fromLists ws) (fromList bs)
-- | Cost Function Enum.
-- The chosen constructor changes how 'getDelta' computes the output-layer
-- error term during backpropagation.
data CostFunction = QuadraticCost
                  | CrossEntropyCost
                  deriving (Show, Eq)
  73. -- | getDelta based on the raw input, the activated input and the desired output
  74. -- results in different values depending on the CostFunction type.
  75. getDelta :: Floating a => CostFunction -> a -> a -> a -> a
  76. getDelta QuadraticCost z a y = (a - y) * sigmoid'(z)
  77. getDelta CrossEntropyCost _ a y = a - y
-- | Activation function used to calculate the actual output of a neuron.
-- Usually the 'sigmoid' function.
type ActivationFunction a = a -> a
-- | The derivative of an activation function.
type ActivationFunctionDerivative a = a -> a
-- | Training sample that can be used for the training functions:
-- a pair of input vector and expected output vector.
--
-- > trainingData :: Samples Double
-- > trainingData = [ fromList [0, 0] --> fromList [0],
-- >                  fromList [0, 1] --> fromList [1],
-- >                  fromList [1, 0] --> fromList [1],
-- >                  fromList [1, 1] --> fromList [0]]
type Sample a = (Vector a, Vector a)
-- | A list of 'Sample's.
type Samples a = [Sample a]
  93. -- | A simple synonym for the (,) operator, used to create samples very intuitively.
  94. (-->) :: Vector a -> Vector a -> Sample a
  95. (-->) = (,)
-- | The learning rate, affects the learning speed, lower learning rate results
-- in slower learning, but usually better results after more epochs.
type LearningRate a = a
-- | Lambda value affecting the regularization while learning.
type Lambda a = a
-- | Wrapper around the training data length (size of the full training set,
-- used for the regularization factor in 'update').
type TrainingDataLength = Int
-- | Size of one mini batch, subset of training set used to update the weights
-- averaged over this batch.
type MiniBatchSize = Int
  106. -- | Initializes a new network with random values for weights and biases
  107. -- in all layers.
  108. --
  109. -- > net <- newNetwork [2, 3, 4]
  110. newNetwork :: [Int] -> IO (Network Double)
  111. newNetwork layerSizes
  112. | length layerSizes < 2 = error "Network too small!"
  113. | otherwise = do
  114. lays <- zipWithM go (init layerSizes) (tail layerSizes)
  115. return $ Network lays
  116. where go :: Int -> Int -> IO (Layer Double)
  117. go inputSize outputSize = do
  118. ws <- fmap (/ (sqrt $ fromIntegral inputSize)) (randn outputSize inputSize)
  119. seed <- randomIO
  120. let bs = randomVector seed Gaussian outputSize
  121. return $ Layer ws bs
  122. -- | Calculate the output of the network based on the network, a given
  123. -- 'ActivationFunction' and the input vector.
  124. output :: (Numeric a, Num (Vector a))
  125. => Network a
  126. -> ActivationFunction a
  127. -> Vector a
  128. -> Vector a
  129. output net act input = foldl' f input (layers net)
  130. where f vec layer = cmap act ((weights layer #> vec) + biases layer)
  131. rawOutputs :: (Numeric a, Num (Vector a))
  132. => Network a
  133. -> ActivationFunction a
  134. -> Vector a
  135. -> [(Vector a, Vector a)]
  136. rawOutputs net act input = scanl f (input, input) (layers net)
  137. where f (_, a) layer = let z' = (weights layer #> a) + biases layer in
  138. (z', cmap act z')
  139. -- | The most used training function, randomly shuffling the training set before
  140. -- every training epoch
  141. --
  142. -- > trainShuffled 30 (\n e -> "") net CrossEntropyCost 0.5 trainData 10 0.1
  143. trainShuffled :: forall a. (Numeric a, Floating a, Floating (Vector a))
  144. => Int
  145. -> (Network a -> Int -> String)
  146. -> Network a
  147. -> CostFunction
  148. -> Lambda a
  149. -> Samples a
  150. -> MiniBatchSize
  151. -> LearningRate a
  152. -> IO (Network a)
  153. trainShuffled 0 _ net _ _ _ _ _ = return net
  154. trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize eta = do
  155. spls <- shuffle trainSamples
  156. let !net' = trainSGD net costFunction lambda spls miniBatchSize eta
  157. trace (debug net' epochs)
  158. (trainShuffled (epochs - 1) debug net' costFunction lambda trainSamples miniBatchSize eta)
  159. -- | Pure version of 'trainShuffled', training the network /n/ times without
  160. -- shuffling the training set, resulting in slightly worse results.
  161. trainNTimes :: forall a. (Numeric a, Floating a, Floating (Vector a))
  162. => Int
  163. -> (Network a -> Int -> String)
  164. -> Network a
  165. -> CostFunction
  166. -> Lambda a
  167. -> Samples a
  168. -> MiniBatchSize
  169. -> LearningRate a
  170. -> Network a
  171. trainNTimes 0 _ net _ _ _ _ _ = net
  172. trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta =
  173. trace (debug net' epochs)
  174. (trainNTimes (epochs - 1) debug net' costFunction lambda trainSamples miniBatchSize eta)
  175. where !net' = trainSGD net costFunction lambda trainSamples miniBatchSize eta
  176. -- | Train the network using Stochastic Gradient Descent.
  177. -- This is an improved version of Gradient Descent splitting the training data into
  178. -- subsets to update the networks weights more often resulting in better accuracy.
  179. --
  180. -- On each mini batch, 'update' is called to calculate the improved network.
  181. trainSGD :: forall a. (Numeric a, Floating a, Floating (Vector a))
  182. => Network a
  183. -> CostFunction
  184. -> Lambda a
  185. -> Samples a
  186. -> MiniBatchSize
  187. -> LearningRate a
  188. -> Network a
  189. trainSGD net costFunction lambda trainSamples miniBatchSize eta =
  190. foldl' updateMiniBatch net (chunksOf miniBatchSize trainSamples)
  191. where -- update network based on given mini batch using Gradient Descent
  192. updateMiniBatch :: Network a -> Samples a -> Network a
  193. updateMiniBatch = update eta costFunction lambda (length trainSamples)
-- | Update the network using a set of samples and Gradient Descent.
-- This takes one mini batch to perform GD.
--
-- Gradients ("nablas") from 'backprop' are summed over the whole batch and
-- then applied once, averaged over the batch size, with L2 weight decay.
-- An empty batch produces no nablas and leaves the network unchanged.
update :: forall a. (Numeric a, Floating a, Floating (Vector a))
  => LearningRate a      -- ^ eta, step size of the descent
  -> CostFunction        -- ^ cost whose gradient is backpropagated
  -> Lambda a            -- ^ regularization strength
  -> TrainingDataLength  -- ^ size of the full training set (not the batch)
  -> Network a           -- ^ network to improve
  -> Samples a           -- ^ one mini batch
  -> Network a
update eta costFunction lambda n net spls = case mayNewNablas of
  Nothing -> net
  Just newNablas -> net { layers = applyNablas newNablas }
  where -- calculate new nablas based on samples
        mayNewNablas :: Maybe [Layer a]
        mayNewNablas = foldl' updateNablas Nothing spls
        -- update nablas by calculating new nablas and adding them to the
        -- existing ones (Nothing marks "no sample processed yet")
        updateNablas :: Maybe [Layer a] -> Sample a -> Maybe [Layer a]
        updateNablas mayNablas sample =
          let -- calculate new nablas for this training sample
              !nablasDelta = backprop net costFunction sample
              -- takes an existing nabla layer and adds the delta
              addDelta nabla nablaDelta =
                nabla { weights = weights nabla + weights nablaDelta,
                        biases = biases nabla + biases nablaDelta }
          in case mayNablas of
            -- if there are already nablas, add the new ones
            Just nablas -> Just $ zipWith addDelta nablas nablasDelta
            -- otherwise return the new ones
            Nothing -> Just $ nablasDelta
        -- apply nablas to layers of the network, pairwise per layer
        applyNablas :: [Layer a] -> [Layer a]
        applyNablas nablas = zipWith applyNabla (layers net) nablas
        -- apply nabla to one layer
        applyNabla :: Layer a -> Layer a -> Layer a
        applyNabla layer nabla =
          let w = weights layer -- weights matrix
              nw = weights nabla -- weights nablas matrix
              b = biases layer -- biases vector
              nb = biases nabla -- biases nablas vector
              -- L2 weight decay: a factor < 1 shrinks the weights each update
              fac = 1 - eta * (lambda / fromIntegral n)
              -- subtract nablas from weights, averaged over the batch size
              w' = scale fac w - scale (eta / (fromIntegral $ length spls)) nw
              -- subtract nablas from biases, averaged over the batch size
              b' = b - scale (eta / (fromIntegral $ length spls)) nb
          in layer { weights = w', biases = b' }
-- | Backpropagate the error and calculate the partial derivatives for the
-- weights and biases in each layer.
-- Returns a list of layers holding the nablas accordingly (in forward order,
-- matching @layers net@).
--
-- Works on the REVERSED list of per-layer (raw, activated) values, starting
-- from the output layer's delta and walking towards the input.  The
-- activation is hard-coded to 'sigmoid' here (it must match what 'output'
-- is called with for the gradients to be meaningful).
backprop :: forall a. (Numeric a, Floating a, Floating (Vector a))
  => Network a
  -> CostFunction
  -> Sample a       -- ^ (input, expected output)
  -> [Layer a]
backprop net costFunction spl = finalNablas
  where -- per-layer (raw, activated) values, output layer FIRST
        rawFeedforward :: [(Vector a, Vector a)]
        rawFeedforward = reverse $ rawOutputs net sigmoid (fst spl)
        -- get starting activation and raw value (of the output layer)
        headZ, headA :: Vector a
        (headZ, headA) = head rawFeedforward
        -- get starting delta, based on the activation of the last layer;
        -- getDelta works elementwise thanks to the Floating (Vector a) context
        startDelta = getDelta costFunction headZ headA (snd spl)
        -- calculate nabla of biases (equals the delta itself)
        lastNablaB = startDelta
        -- calculate nabla of weighs of last layer in advance:
        -- outer product of delta with the previous layer's activation
        lastNablaW = startDelta `outer` previousA
          where previousA
                  -- single-layer networks fall back to the raw input
                  | length rawFeedforward > 1 = snd $ rawFeedforward !! 1
                  | otherwise = fst spl
        lastLayer = Layer { weights = lastNablaW, biases = lastNablaB }
        -- reverse layers, analogy to the reversed (z, a) list
        layersReversed = reverse $ layers net
        -- calculate nablas, beginning at the end of the network (startDelta);
        -- prepending to the accumulator restores forward layer order
        (finalNablas, _) = foldl' calculate ([lastLayer], startDelta)
          [1..length layersReversed - 1]
        -- takes the index (into the reversed lists) and updates nablas
        calculate (nablas, oldDelta) idx =
          let -- extract raw and activated value of the current layer
              (z, _) = rawFeedforward !! idx
              -- apply prime derivative of sigmoid
              z' = cmap sigmoid' z
              -- calculate new delta from the NEXT layer's weights (idx - 1
              -- in the reversed list) transported backwards
              w = weights $ layersReversed !! (idx - 1)
              delta = (tr w #> oldDelta) * z'
              -- nablaB is just the delta vector
              nablaB = delta
              -- activation in previous layer (idx + 1 exists because
              -- rawFeedforward also contains the input pair)
              aPrevious = snd $ rawFeedforward !! (idx + 1)
              -- outer product of delta and the activation in the previous layer
              nablaW = delta `outer` aPrevious
              -- put nablas into a new layer
          in (Layer { weights = nablaW, biases = nablaB } : nablas, delta)
  287. -- | The sigmoid function
  288. sigmoid :: Floating a => ActivationFunction a
  289. sigmoid x = 1 / (1 + exp (-x))
  290. -- | The derivative of the sigmoid function.
  291. sigmoid' :: Floating a => ActivationFunctionDerivative a
  292. sigmoid' x = sigmoid x * (1 - sigmoid x)
-- | Shuffle a list randomly.
--
-- In-place Fisher-Yates over a mutable array: for each index @i@ a random
-- partner @j@ with @i <= j <= n@ is drawn, the value at @j@ becomes the
-- @i@-th output element and the value formerly at @i@ is stored at @j@.
-- Position @i@ is never read again, so no write-back at @i@ is needed.
shuffle :: [a] -> IO [a]
shuffle xs = do
  ar <- newArr n xs
  forM [1..n] $ \i -> do
    -- pick a swap partner from the not-yet-emitted suffix
    j <- randomRIO (i,n)
    vi <- readArray ar i
    vj <- readArray ar j
    -- move the value at i into slot j; vj is emitted as element i
    writeArray ar j vi
    return vj
  where
    n = length xs
    -- 1-based mutable array holding the working copy of the list
    newArr :: Int -> [a] -> IO (IOArray Int a)
    newArr len lst = newListArray (1,len) lst
  307. -- | Saves the network as the given filename. When the file already exists,
  308. -- it looks for another filename by increasing the version, e.g
  309. -- /mnist.net/ becomes /mnist1.net/.
  310. saveNetwork :: (Element a, Binary a) => FilePath -> Network a -> IO ()
  311. saveNetwork fp net = do
  312. ex <- doesFileExist fp
  313. case ex of
  314. True -> saveNetwork (newFileName fp) net
  315. False -> encodeFile fp net
-- | Find a new filename by replacing the old version with the next higher one.
--
-- The regex splits e.g. @\"mnist12.net\"@ into prefix @\"mnist\"@, version
-- @\"12\"@ and suffix @\".net\"@; a missing version string parses as @0@,
-- so @\"mnist.net\"@ becomes @\"mnist1.net\"@.
newFileName :: FilePath -> FilePath
newFileName fp = case fp =~ "(.+[a-z]){0,1}([0-9]*)(\\..*)" :: [[String]] of
  [[_, p, v, s]] -> p ++ show (version v + 1) ++ s
  -- NOTE(review): fallback for paths the regex rejects (e.g. no extension)
  -- appends a literal 'l' — possibly a typo for '1'; confirm intent.
  _ -> fp ++ "l"
  where version :: String -> Int
        version xs = fromMaybe 0 (readMaybe xs :: Maybe Int)
-- | Load the network with the given filename.
--
-- Inverse of 'saveNetwork', using the 'Binary' instances above.
-- NOTE(review): 'decodeFile' has no error handling here — a missing or
-- corrupt file raises an exception.
loadNetwork :: (Element a, Binary a) => FilePath -> IO (Network a)
loadNetwork = decodeFile