diff --git a/Backend.hs b/Backend.hs new file mode 100644 index 0000000..ac1143a --- /dev/null +++ b/Backend.hs @@ -0,0 +1,5 @@ +-- | Experimental sketch: evaluate reimplementing the numeric backend with Repa arrays. + +import Data.Array.Repa (Z(..), (:.)(..), DIM0(..), DIM1(..), DIM2(..), U, D, Array(..), + (!)) +import qualified Data.Array.Repa as R diff --git a/MNIST.hs b/MNIST.hs index 72d1e51..d0aaad2 100644 --- a/MNIST.hs +++ b/MNIST.hs @@ -37,7 +37,7 @@ data Arguments = Arguments { eta :: Double, lambda :: Double, arguments = Arguments { eta = 0.5 &= help "Learning rate", lambda = 5 &= help "Lambda of regularization", filePath = "" &= help "Load network from file", - costFunction = Quadratic &= help "Cost function", + costFunction = CrossEntropy &= help "Cost function", epochs = 30 &= help "Number of training epochs", miniBatchSize = 10 &= help "Mini batch size", hiddenNeurons = 30 &= help "Number of neurons in hidden layer" } diff --git a/Network.hs b/Network.hs index e0dc4e1..a3aaf65 100644 --- a/Network.hs +++ b/Network.hs @@ -115,14 +115,18 @@ type Samples a = [Sample a] -- | The learning rate, affects the learning speed, lower learning rate results -- in slower learning, but usually better results after more epochs. -type LearningRate = Double +type LearningRate a = a -- | Lambda value affecting the regularization while learning. -type Lambda = Double +type Lambda a = a -- | Wrapper around the training data length. type TrainingDataLength = Int +-- | Size of one mini batch, subset of training set used to update the weights +-- averaged over this batch. +type MiniBatchSize = Int + -- | Initializes a new network with random values for weights and biases -- in all layers. 
-- @@ -135,7 +139,7 @@ newNetwork layerSizes return $ Network lays where go :: Int -> Int -> IO (Layer Double) go inputSize outputSize = do - ws <- randn outputSize inputSize + ws <- fmap (/ (sqrt $ fromIntegral inputSize)) (randn outputSize inputSize) seed <- randomIO let bs = randomVector seed Gaussian outputSize return $ Layer ws bs @@ -163,15 +167,16 @@ rawOutputs net act input = scanl f (input, input) (layers net) -- every training epoch -- -- > trainShuffled 30 (\n e -> "") net CrossEntropyCost 0.5 trainData 10 0.1 -trainShuffled :: Int - -> (Network Double -> Int -> String) - -> Network Double +trainShuffled :: forall a. (Numeric a, Floating a, Floating (Vector a)) + => Int + -> (Network a -> Int -> String) + -> Network a -> CostFunction - -> Lambda - -> Samples Double - -> Int - -> Double - -> IO (Network Double) + -> Lambda a + -> Samples a + -> MiniBatchSize + -> LearningRate a + -> IO (Network a) trainShuffled 0 _ net _ _ _ _ _ = return net trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize eta = do spls <- shuffle trainSamples @@ -182,15 +187,16 @@ trainShuffled epochs debug net costFunction lambda trainSamples miniBatchSize et -- | Pure version of 'trainShuffled', training the network /n/ times without -- shuffling the training set, resulting in slightly worse results. -trainNTimes :: Int - -> (Network Double -> Int -> String) - -> Network Double +trainNTimes :: forall a. 
(Numeric a, Floating a, Floating (Vector a)) + => Int + -> (Network a -> Int -> String) + -> Network a -> CostFunction - -> Lambda - -> Samples Double - -> Int - -> Double - -> Network Double + -> Lambda a + -> Samples a + -> MiniBatchSize + -> LearningRate a + -> Network a trainNTimes 0 _ net _ _ _ _ _ = net trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta = trace (debug net' epochs) @@ -198,58 +204,86 @@ trainNTimes epochs debug net costFunction lambda trainSamples miniBatchSize eta where !net' = trainSGD net costFunction lambda trainSamples miniBatchSize eta -trainSGD :: (Numeric Double, Floating Double) - => Network Double +-- | Train the network using Stochastic Gradient Descent. +-- This is an improved version of Gradient Descent splitting the training data into +-- subsets to update the network's weights more often, resulting in better accuracy. +-- +-- On each mini batch, 'update' is called to calculate the improved network. +trainSGD :: forall a. (Numeric a, Floating a, Floating (Vector a)) + => Network a -> CostFunction - -> Lambda - -> Samples Double - -> Int - -> Double - -> Network Double + -> Lambda a + -> Samples a + -> MiniBatchSize + -> LearningRate a + -> Network a trainSGD net costFunction lambda trainSamples miniBatchSize eta = foldl' updateMiniBatch net (chunksOf miniBatchSize trainSamples) - where updateMiniBatch = update eta costFunction lambda (length trainSamples) - -update :: LearningRate + where -- update network based on given mini batch using Gradient Descent + updateMiniBatch :: Network a -> Samples a -> Network a + updateMiniBatch = update eta costFunction lambda (length trainSamples) + +-- | Update the network using a set of samples and Gradient Descent. +-- This takes one mini batch to perform GD. +update :: forall a. 
(Numeric a, Floating a, Floating (Vector a)) + => LearningRate a -> CostFunction - -> Lambda + -> Lambda a -> TrainingDataLength - -> Network Double - -> Samples Double - -> Network Double -update eta costFunction lambda n net spls = case newNablas of + -> Network a + -> Samples a + -> Network a +update eta costFunction lambda n net spls = case mayNewNablas of Nothing -> net - Just x -> net { layers = layers' x } - where newNablas :: Maybe [Layer Double] - newNablas = foldl' updateNablas Nothing spls - updateNablas :: Maybe [Layer Double] -> Sample Double -> Maybe [Layer Double] + Just newNablas -> net { layers = applyNablas newNablas } + where -- calculate new nablas based on samples + mayNewNablas :: Maybe [Layer a] + mayNewNablas = foldl' updateNablas Nothing spls + -- update nablas by calculating new nablas and adding them to the + -- existing ones + updateNablas :: Maybe [Layer a] -> Sample a -> Maybe [Layer a] updateNablas mayNablas sample = - let !nablasDelta = backprop net costFunction sample - f nabla nablaDelta = + let -- calculate new nablas for this training sample + !nablasDelta = backprop net costFunction sample + -- takes an existing nabla layer and adds the delta + addDelta nabla nablaDelta = nabla { weights = weights nabla + weights nablaDelta, biases = biases nabla + biases nablaDelta } in case mayNablas of - Just nablas -> Just $ zipWith f nablas nablasDelta + -- if there are already nablas, add the new ones + Just nablas -> Just $ zipWith addDelta nablas nablasDelta + -- otherwise return the new ones Nothing -> Just $ nablasDelta - layers' :: [Layer Double] -> [Layer Double] - layers' nablas = zipWith updateLayer (layers net) nablas - updateLayer :: Layer Double -> Layer Double -> Layer Double - updateLayer layer nabla = + -- apply nablas to layers of the network + applyNablas :: [Layer a] -> [Layer a] + applyNablas nablas = zipWith applyNabla (layers net) nablas + -- apply nabla to one layer + applyNabla :: Layer a -> Layer a -> Layer a + 
applyNabla layer nabla = let w = weights layer -- weights matrix - nw = weights nabla + nw = weights nabla -- weights nablas matrix b = biases layer -- biases vector - nb = biases nabla + nb = biases nabla -- biases nablas vector fac = 1 - eta * (lambda / fromIntegral n) + -- subtract nablas from weights w' = scale fac w - scale (eta / (fromIntegral $ length spls)) nw + -- subtract nablas from biases b' = b - scale (eta / (fromIntegral $ length spls)) nb in layer { weights = w', biases = b' } -backprop :: Network Double -> CostFunction -> Sample Double -> [Layer Double] +-- | Backpropagate the error and calculate the partial derivatives for the +-- weights and biases in each layer. +-- Returns a list of layers holding the nablas accordingly. +backprop :: forall a. (Numeric a, Floating a, Floating (Vector a)) + => Network a + -> CostFunction + -> Sample a + -> [Layer a] backprop net costFunction spl = finalNablas - where rawFeedforward :: [(Vector Double, Vector Double)] + where rawFeedforward :: [(Vector a, Vector a)] rawFeedforward = reverse $ rawOutputs net sigmoid (fst spl) -- get starting activation and raw value - headZ, headA :: Vector Double + headZ, headA :: Vector a (headZ, headA) = head rawFeedforward -- get starting delta, based on the activation of the last layer startDelta = getDelta costFunction headZ headA (snd spl) @@ -293,6 +327,7 @@ sigmoid x = 1 / (1 + exp (-x)) sigmoid' :: Floating a => ActivationFunctionDerivative a sigmoid' x = sigmoid x * (1 - sigmoid x) +-- | Shuffle a list randomly. shuffle :: [a] -> IO [a] shuffle xs = do ar <- newArr n xs @@ -317,6 +352,7 @@ saveNetwork fp net = do True -> saveNetwork (newFileName fp) net False -> encodeFile fp net +-- | Find a new filename by replacing the old version with the next higher one. newFileName :: FilePath -> FilePath newFileName fp = case fp =~ "(.+[a-z]){0,1}([0-9]*)(\\..*)" :: [[String]] of [[_, p, v, s]] -> p ++ show (version v + 1) ++ s