-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtesi.tex
More file actions
1876 lines (1466 loc) · 130 KB
/
tesi.tex
File metadata and controls
1876 lines (1466 loc) · 130 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
\documentclass[12pt,a4paper]{report}
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage{newlfont}
\usepackage{color}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{epigraph}
\usepackage[overload]{empheq}
\usepackage[inline, shortlabels]{enumitem}
\usepackage{bm}
\usepackage[table]{xcolor}
\usepackage{color}
\usepackage{natbib}
\usepackage{float}
\usepackage{textcomp}
\usepackage{adjustbox}
\usepackage{hyperref}
\usepackage{svg}
\definecolor{grayrow}{rgb}{0.85, 0.85, 0.85}
\definecolor{lightgray}{rgb}{0.83, 0.83, 0.83}
\definecolor{darkgrayrow}{rgb}{0.7, 0.7, 0.7}
\definecolor{RoyalRed}{rgb}{0.61,0.11,0.19}
\usepackage{listings}
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}
\lstdefinestyle{snippet}{
backgroundcolor=\color{backcolour},
commentstyle=\color{codegreen},
keywordstyle=\color{magenta},
numberstyle=\tiny\color{codegray},
stringstyle=\color{codepurple},
basicstyle=\ttfamily\footnotesize,
breakatwhitespace=false,
breaklines=true,
captionpos=t,
keepspaces=true,
numbers=left,
numbersep=5pt,
showspaces=false,
showstringspaces=false,
showtabs=false,
tabsize=2
}
\usepackage{emptypage} % remove header in blanck pages
\usepackage[a4paper,top=4cm,bottom=4cm,left=3cm,right=3cm,marginparwidth=1.75cm]{geometry}
\hypersetup{
colorlinks,
citecolor=black,
filecolor=black,
linkcolor=black,
urlcolor=blue
}
\textwidth=450pt\oddsidemargin=0pt
\begin{document}
\begin{titlepage}
%
%
% ONCE YOU ARE FINISHED WITH YOUR CHANGES MODIFY "RED" WITH "BLACK" IN ALL \textcolor COMMENTS
%
%
\begin{center}
{{\Large{\textsc{Alma Mater Studiorum $\cdot$ University of Bologna}}}}
\rule[0.1cm]{15.8cm}{0.1mm}
\rule[0.5cm]{15.8cm}{0.6mm}
\\\vspace{3mm}
{\small{\bf School of Science \\
Department of Physics and Astronomy\\
Master Degree in Physics}}
\end{center}
\vspace{17mm}
\begin{center}
%
% INSERT THE TITLE OF YOUR THESIS
%
{\LARGE{\bf OPTIMIZATION AND APPLICATIONS\\
OF DEEP LEARNING ALGORITHMS\\
\vspace{3mm}
FOR SUPER-RESOLUTION IN MRI}}\\
\end{center}
\vspace{30mm} \par \noindent
\begin{minipage}[t]{0.47\textwidth}
{\large{\bf Supervisor: \vspace{2mm}\\
Prof. Gastone Castellani\\\\
\bf Co-supervisor:
\vspace{2mm}
\\
Dr. Nico Curti\\\\}}
\end{minipage}
%
\hfill
%
\begin{minipage}[t]{0.47\textwidth}\raggedleft \textcolor{black}{
{\large{\bf Submitted by:
\vspace{2mm}\\
Mattia Ceccarelli}}}
\end{minipage}
\vspace{40mm}
\begin{center}
Academic Year 2019/2020
\end{center}
\end{titlepage}
\newpage
\vspace*{125px}
\LARGE\textit{Abstract}
\normalsize
\vspace{2mm}
The increasing amount of data produced by modern infrastructures requires instruments of analysis more and more precise, quick, and efficient.
For these reasons in the last decades, Machine Learning (ML) and Deep Learning (DL) techniques saw exponential growth in publications and research from the scientific community.
In this work, two new frameworks for Deep Learning are proposed: Byron, written in C++, for fast analysis in a parallelized CPU environment, and NumPyNet, written in Python, which provides a clear and understandable interface to deep learning, tailored around readability.
Byron will be tested in the field of Single Image Super-Resolution for NMR (Nuclear Magnetic Resonance) imaging of brains, using pre-trained models for x2 and x4 upscaling which exhibit greater performance than the most common non-learning-based algorithms.
The work will show that the reconstruction ability of DL models surpasses the interpolation of a bicubic algorithm even with images totally different from the dataset on which they were trained, indicating that the {\it generalization} abilities of those deep learning models can be sufficient to perform well even on biomedical data, which contains particular shapes and textures.
Further studies will focus on how the same algorithms perform under different conditions for the input, showing a large variance between results.
\newpage
\normalsize
\tableofcontents
\chapter{Introduction}
The aim of this thesis is to evaluate the upsampling performance of pre-trained Deep Learning Single Image Super-Resolution models on biomedical images of human brains.
\\
The first chapter focuses on the fundamentals of the techniques named in this work with special emphasis on Deep Learning, Super Resolution and image analysis, essential for understanding the implementations of the two frameworks and the main methodologies applied during the study.
\\
The second chapter includes the mathematical and numerical explanations of the most important algorithms implemented in Byron and NumPyNet and a brief description of the two frameworks.
Moreover, I will report timing measurements against a popular deep learning framework called Tensorflow for the most important layers in image analysis.
\\
In the third chapter, I first describe the models in detail, focusing on the reasoning the respective authors applied during the construction of the architectures.
Then the attention is moved to the description of the datasets used for training (DIV2K) and for testing (NMR) and how the images have been fed to the networks.
\\
The final results are collected in the last chapter, divided into subsections which answer different questions: how well can DL models reconstruct a High Resolution image starting from a Low Resolution one? How well can they {\it generalize} their ``knowledge'' to new data? How does the orientation of the input influence the result? In which parts of the images do the models struggle the most?
The methods have been evaluated through common metrics in image analysis: {\it Peak Signal to Noise Ratio} (PSNR) and {\it Structural SIMilarity index} (SSIM).
Secondly, to remove possible background effects from the analysis, we introduced FSL BET (Brain Extraction Tool), a software tool frequently used in NMR studies to mask images and remove uninteresting data.
\\
In the end, I discuss the conclusions and propose possible future developments for the work.
\section{Neural Network and Deep Learning}
A neural network is an interconnected structure of simple procedural units, called nodes. Their functionality is inspired by the animal brain and by the works on learning and neural plasticity of Donald Hebb \cite{hebb-learning}. From his book:
\begin{quote}
\begin{center}
\textit{Let us assume that the persistence or repetition of a reverberatory activity (or "trace") tends to induce lasting cellular changes that add to its stability.[…] When an axon of cell A is near enough to excite a cell B and repeatedly or persistently takes part in firing it, some growth process or metabolic change takes place in one or both cells such that A's efficiency, as one of the cells firing B, is increased}
\end{center}
\end{quote}
which is an attempt to describe the change of strength in neural relations as a consequence of stimulation.
From the so-called \textit{Hebbian Theory} rose the first computational models such as the \textit{Perceptron}, \textit{Neural Networks} and the modern \textit{Deep Learning}.
The development of learning-based algorithms did not catch up with the expected results until recently, mainly due to the exponential increase in available computational resources.
From a mathematical point of view, a neural network is a composition of non-linear multi-parametric functions.
During the {\it training phase} the model tunes its parameters, starting from random ones, by minimizing the error function (called also loss or cost).
In fact, machine learning problems are just optimization problems where the solution is not given in an analytical form; therefore, through iterative techniques (generally some kind of gradient descent) we progressively approximate the correct result.
In general, there are 3 different approaches to learning:
\begin{itemize}
\item {\bf supervised} It exists a labeled dataset in which the relationship between features (input) and expected output is known.
During training, the model is presented with many examples and it corrects its answers based on the expected response.
Some problems tied to supervised algorithms are classification, regression, object detection, segmentation and super-resolution.
\item {\bf unsupervised} In this case, a labeled dataset does not exist, only the inputs data are available.
The training procedure must be tailored around the problem under study. Some examples of unsupervised algorithms are clustering, autoencoders, anomaly detection.
\item {\bf reinforced} the model interacts with a dynamic environment and tries to reach a goal (e.g. winning in a competitive game).
For each iteration of the training process we assign a reward or a punishment, relatively to the progress in reaching the objective.
\end{itemize}
This work will focus on models trained using labeled samples, therefore in a supervised environment.
\subsection*{Perceptron}
The Perceptron (also called \textit{artificial neuron}) is the fundamental unit of every neural network and it is a simple model for a biological neuron, based on the works of Rosenblatt \cite{perceptron}.
The \textit{perceptron} receives $N$ input values $x_1, x_2, ... x_N$ and the output is just a linear combination of the inputs plus a bias:
\begin{equation}
y = \sigma(\sum_{k=1}^N w_kx_k + w_0)
\end{equation}
where $\sigma$ is called \textit{activation function} and $w_0, w_1, ... w_N$ are the trainable weights.
Originally, the activation function was the \textit{Heaviside step function} whose value is zero for negative arguments and one for non-negative arguments:
\begin{equation}
H(x) :=
\begin{cases}
0 \text{ if } x < 0 \\
1 \text{ if } x \geq 0 \\
\end{cases}
\end{equation}
In this case the perceptron is a \textit{linear discriminator} and as such, it is able to learn a hyperplane which linearly separates two sets of data.
The weights are tuned during the training phase following the given update rule, usually:
\begin{equation}
\bm{w}_{n+1} = \bm{w}_n + \eta (t - y)\bm{x}
\label{eq:perceptron}
\end{equation}
where $\eta$ is the learning rate ($\eta \in [0,1]$) and $t$ is the true output.
If the input instance is correctly classified, the error $(t - y)$ would be zero and weights do not change.
Otherwise, the hyperplane is moved towards the misclassified example.
Repeating this process will lead to a convergence only if the two classes are linearly separable.
\subsection*{Fully Connected Structure}
The direct generalization of a simple perceptron is the \textit{Fully Connected Artificial Neural Network} (or {\it Multi-Layer Perceptron}).
It is composed of many Perceptron-like units called nodes, each of which performs the same computation as the perceptron described above and \textit{feeds} its output \textit{forward} to the next layer of nodes.
A typical representation of this type of network is shown in figure \ref{fig:ann}.
\begin{figure}[h]
\centering
\includegraphics[scale=0.35]{./images/neural_net.png}
\caption{{\it A common representation of a neural network: a single node works as the perceptron described before. The network is composed by the input layer, the hidden layers and the output layer. The depth of the network is determined by the number of hidden layers.}}
\label{fig:ann}
\end{figure}
While the number of nodes in the input and output layers is fixed by the data under analysis, the best configuration of hidden layers is still an open problem.
The mathematical generalization from the perceptron is simple, indeed given the $i$-th layer its output vector $\bm{y}_i$ reads:
\begin{equation}
\bm{y}_i = \sigma(W_i \bm{y}_{i-1} + \bm{b}_i)
\end{equation}
where $W_i$ is the weights matrix of layer $i$ and $\bm{b}_i$ is the $i$-th bias vector, equivalent to $w_0$ in the perceptron case.
The output of the $i$-th layer becomes the input of the next one until the output layer yields the network's answer.
As before, $\sigma$ is the activation function which can be different for every node, but it usually differs only from layer to layer.
How to choose the best activation function is yet to be understood, and most works rely on experimental results.
In a supervised environment, the model output is compared to the desired output ({\it truth}) by means of a cost function.
An example of cost function is the sum of squared error:
\begin{equation}
C(W) = \frac{1}{N} \sum_{j=1}^{N} (y_j - t_j)^2
\end{equation}
where $N$ is the dimensionality of the output space. $C$ is considered as a function of the model's weights only since input data and {\it truth labels} $t$ are fixed.
Those architectures are {\it universal approximators}: that means, given an arbitrarily complex function, there is a fully connected neural network that can approximate it.
This type of network is called {\it feed forward} because the information flows directly from the input to the output layer: however, it exists a class of models called {\it Recurrent} where this is not the case anymore and feedback loops are possible, but they are outside the scope of this work.
\subsection*{Gradient Descent}
To minimize the loss function an update rule for the weights is needed.
Given a cost function $C(w)$, the simplest one is the gradient descent:
\begin{equation}
w \leftarrow w - \eta \nabla_w C
\end{equation}
The core idea is to modify the parameters by a small step in the direction that minimizes the error function.
The length of the step is given by the {\it learning rate} $\eta$, which is a hyperparameter chosen by the user, while the direction of the step is given by $-\nabla_w C$, which points towards the steepest descent of the function landscape.
\begin{figure}[H]
\centering
\includegraphics[scale=0.35]{./images/sgd.png}
\caption{\it Visual example of gradient descent for a model with 2 weights. The idea is to modify the weights to follow the direction of the steepest descent for the landscape of the error function}
\label{fig:gd}
\end{figure}
The speed at which the algorithm converges to a solution and the precision of said solution are greatly influenced by the update rule. More complex and efficient update rules do exist, but they follow the same idea as the gradient descent.
\subsection*{Error Back Propagation}
The most common algorithm used to compute the updates to weights in the learning phase is the {\it Error Back Propagation}.
Consider a Neural Network with $L$ total layers, each with a weight matrix $W_l$ with $l = 1, 2, \dots L$.
Given a differentiable cost function $C$, which depends on $W_1, W_2,\dots W_L$, let's define:
\begin{align}
\bm{z}_l &= W_l \bm{y}_{l-1} + \bm{b}_l \\
\bm{a}_l &= \sigma(\bm{z}_l)
\end{align}
respectively the {\it de-activated} and {\it activated} output vectors of layer $l$ with $N$ neurons (or $N$ outputs.) and define:
\begin{equation}
\bm{\delta}_l = (\frac{\partial C}{\partial z_l^1}, \dots , \frac{\partial C}{\partial z_l^N})
\end{equation}
the vector of errors of layer $l$. Then we can write the 4 equations of back propagation for the fully-connected neural network \cite{neural-net-nielsen}:
\begin{align}
&\bm{\delta}_L = \nabla_a C \odot \sigma'(\bm{z}_L) & \mbox{The network error vector} \\
&\bm{\delta}_l = (W_{l+1}^T \bm{\delta}_{l+1}) \odot \sigma'(\bm{z}_l) & \mbox{The error vector for layer $l$} \\
&\frac{\partial C}{\partial b_l^j} = \delta_l^j & \mbox{The j-th bias update} \\
&\frac{\partial C}{\partial w_l^{j k}} = a_{l-1}^k \delta_{l}^{j} & \mbox{The update for the weight indexed $j,k$ }
\end{align}
where $\odot$ is the Hadamard's product.
Those equations can be generalized for others kind of layers, as I will show in the next chapters.
The full training algorithm is:
\begin{itemize}
\setlength\itemsep{-0.3em}
\item [1] Define the model with random parameters
\item [2] Compute the output for one of the inputs
\item [3] Compute the loss function $C(W)$ and the gradients $\frac{\partial C}{\partial w_l^{j k}}$ and $\frac{\partial C}{\partial b_l^j}$ for each $l$.
\item [4] Updates the parameters following the update rule,
\item [5] Iterate from step 2 until the loss is sufficiently small
\end{itemize}
\section{Super Resolution}
The term Super-Resolution (SR) refers to a class of techniques that enhance the spatial resolution of an image, thus converting a given low resolution (LR) image to a corresponding high resolution (HR) one, with better visual quality and refined details.
Image super-resolution is also called by other names like image scaling, interpolation, upsampling and zooming \cite{survey-sr}.
Super resolution can also refer to its ``hardware'' (and best-known) implementation, {\it super resolution microscopy}, whose aim is to overcome the diffraction limit: indeed, the development of super-resolved fluorescence microscopy won a Nobel prize in chemistry in 2014, though its technicalities reside outside the scope of this work, which focuses on its numerical counterpart.
As described before, the training of a supervised model happens by means of examples: in the case of classification the network is presented with many couples {\it features-label} that compose the {\it train set}. The objective is to find the correct labels for a set of samples never seen before, called the {\it test set}.
For digital images, the {\it features} are the pixels which compose a 2 dimensional or 3 dimensional (for RGB picture) grid-like structure, the label is usually represented as 1 dimensional vector as large as the binary representation of the number of classes the model is supposed to discern: a neural network produces a map between a very large {\it features space} and a smaller one.
This behaviour is slightly different for Super-Resolution: indeed, when training a SR model we are talking about {\it image-to-image} processing and as such, both the features space and the labels are images.
The dataset is built from a single series of high resolution (HR) images which are downsampled to obtain the low resolution (LR) counterpart: the couples LR-HR are fed to the network respectively as input and label just like in a classification problem; this time though, the network will map a smaller feature space into a larger one.
The models I'm going to use in this work are trained on images downsampled using the {\it bicubic interpolation}.
\subsection{Bicubic Interpolation}
The {\it Bicubic interpolation} is a common algorithm used in image analysis either to downsample or upsample an image.
This operation is also called \textit{re-scaling} and its purpose is to interpolate the pixel values after a resize of the image, respectively after shrinking or expanding it, e.g as a consequence of zooming.
The name comes from the highest order of complexity of the operation used in the algorithm, which is a cubic function.
Given a pixel, the interpolation function evaluates a $4 \times 4$ neighborhood of pixels around it by applying a filter defined as:
\begin{equation}
\hspace*{-1.cm}
k(x) = \frac{1}{6} \left\{ \begin{array}{rc}
(12 - 9B - 6C) |x|^3 + (-18 + 12B + 6C) |x|^2 + (6 - 2B) & \mbox{if } |x| < 1 \\
(-B - 6C) |x|^3 + (6B + 30C) |x|^2 + (-12B - 48C) |x| + (8B + 24C) & \mbox{if } 1 \leq |x| < 2 \\
0 & \mbox{otherwise} \\
\end{array}
\right.
\end{equation}
where $x$ identifies each pixel below the filter.
Common values used for the filter parameters are $B=0$ and $C=0.75$ (used by \textsf{OpenCV} library) or $B=0$ and $C=0.5$ used by \textsf{Matlab} and \textsf{Photoshop}.
The scale factor of the down/up sampling can assume different values according to the user needs; for this work, I used an upsampling factor of $\times 2$ and $\times 4$ and the algorithm is from the \textsf{Python} version of the library \textsf{OpenCV} \cite{OpenCV}.
The main aims of SR algorithms are to provide a better alternative to standard upsampling and obtain a better quality image both from a qualitative (visual perception) and a quantitative point of view.
\subsection{Image Quality}
While the human eye is a good qualitative evaluator, it is possible to define different quantitative measures between two images to quantify their similarities.
\subsection*{PSNR}
One of the most common in Image Analysis is the {\it Peak Signal To Noise Ratio} or PSNR.
It is usually employed to quantify the reconstruction capabilities of an algorithm after a lossy compression, w.r.t.\ the original image. The mathematical expression reads:
\begin{equation}
PSNR = 20 \cdot \log_{10} \left(\frac{max(I)}{\sqrt{MSE}}\right)
\end{equation}
where $max(I)$ is the maximum available value for the image $I$, namely $1$ for floating point representation and $255$ for an integer one.
$MSE$ is the {\it Mean Squared Error}, which is a common metrics in data analysis used to quantify the mean error of a model. It is defined as:
\begin{equation}
MSE = \frac{1}{HW}\sum_{i=1}^{H} \sum_{j=1}^W (I(i, j) - K(i,j))^2
\end{equation}
where $H$ and $W$ are the spatial dimensions of the original image $I$ and the reconstruction $K$.
The metric can be generalized to colored image by simply adding the depth (RGB channel) dimension.
Even though a higher PSNR generally means a higher reconstruction quality, this metric may perform poorly compared to other quality metrics when it comes to estimating the quality as perceived by human eyes. An increment of $0.25$ in PSNR corresponds to a visible improvement.
\subsection*{SSIM}
Another common metric is the {\it Structural SIMilarity index} or SSIM. It has been developed to evaluate the structural similarities between two images, while incorporating important perceptual phenomena, including luminance and contrast terms. For this reason, it should be more representative of the qualitative evaluation as seen by humans.
The SSIM index is defined as:
\begin{equation}
SSIM(I, K) = \frac{1}{N} \sum_{i=1}^{N} SSIM(x_i, y_i)
\end{equation}
Where $N$ is the number of windows in the images, usually of size $11 \times 11$ or $8 \times 8$. For every box, the index is:
\begin{equation}
\hspace{-0.75cm}
SSIM(x_i, y_i) = \frac{(2\mu_x\mu_y + c_1)(2\sigma_{xy} + c_2)}{ ({\mu_x}^2 + {\mu_y}^2 + c_1)({\sigma_x}^2 + {\sigma_y}^2 + c_2) },
\label{SSIM}
\end{equation}
where $x$ and $y$ are two equally-sized regions in two different images, $\mu$ is the average value in the region, $\sigma^2$ is the variance, $\sigma_{xy}$ is the covariance between the regions and $c_1$ and $c_2$ are two constants to stabilize the division.
Both SSIM and PSNR can be useful in Deep Learning applications as target functions or as post-training quality measures.
To compute PSNR and SSIM I used the functions of the Python library {\it scikit-image} \cite{scikit}, for their precision, efficiency and ease of use.
SR pre-trained models will be evaluated in their reconstruction capabilities against the bicubic interpolation using as benchmark an available dataset of NMR images of brain.
I'd like to point out that the deep learning architectures tested in this work are trained on general-purpose datasets which are very different from the available biomedical pictures: the first problem is that MRI images are single-channel (gray-scale) as opposed to the RGB images those models are trained on; however, this can be easily solved by artificially adding depth, concatenating the same image 3 times: by doing so, the models produce three different outputs that can be compared against each other.
The second issue is that the models never had a chance to learn the particular shapes contained in animals' brain: although that could be seen as a major drawback, their generalization capability should be sufficient to perform well even outside their optimal "environment".
The datasets will be discussed in later chapters.
\section{Nuclear Magnetic Resonance}
The term NMR identifies an experimental technique called Nuclear Magnetic Resonance.
It was independently developed by two research groups led by Felix Bloch and Edward Purcell, both awarded the Nobel prize in Physics in 1952.
Initially, NMR was tied to studies in fundamental physics and particularly to solid state physics: its theoretical and technological evolution in the years allowed numerous applications also in the biological and medical fields.
Differently from other invasive techniques from nuclear medicine or radiology, which employ ionising radiations dangerous for the organism, in NMR the only source of energy administration is represented by two types of electromagnetic fields: static and radio-frequency.
By applying those fields to nuclei which possess magnetic properties it is possible to analyze the macroscopic structure of the sample.
The most used nuclei are ${}^1H, {}^2H, {}^{31}P, {}^{23}Na, {}^{14}N, {}^{13}C, {}^{19}F$; this is really advantageous in the biological field, given that the subjects are most likely rich in $H_2O$, and therefore in ${}^1H$.
This allows the detection of signals of large intensity from the samples and the measurement of a high signal-to-noise ratio even with few acquisitions.
As stated before, NMR is applied to nuclei which possess magnetic properties; therefore they have an angular momentum $\bm{I}$ (spin) associated with a magnetic moment $\bm{\mu} = \gamma(\frac{h}{2\pi})\bm{I}$, where $\gamma$ is the gyromagnetic ratio, which depends on the nucleus.
Considering an ensemble of ${}^1H$ with $I = \frac{1}{2}$, if a static magnetic field $\bm{B}_0 = B_0 \hat z$ is applied, the magnetic moments of the nuclei will orient themselves along the parallel or anti-parallel direction of the field, assuming two discrete values of energy.
According to Boltzmann statistics, the majority of nuclei will orient themselves in such a way as to minimize the energy (parallel to $\bm{B}_0$).
The energy difference between the two level is given by:
\begin{equation}
\Delta E = \gamma \frac{h}{2\pi} B_0 = \frac{h}{2\pi} \omega_0
\label{eq:res-cond}
\end{equation}
where $\omega_0$ is the Larmor Frequency.
The energy that must be supplied to perturb the system from its equilibrium condition follows the resonance condition \ref{eq:res-cond}.
The perturbation is represented by a radio-frequency electromagnetic field $\bm{B}_1$ perpendicular to $\bm{B}_0$, oscillating at the Larmor frequency of the system $\omega_0$.
After removal of the perturbation, it is possible to measure the relaxation times $T_1$ and $T_2$ of the longitudinal and transverse components of the nuclear magnetization.
For the nuclei, the temporal evolution of $\bm{\mu}$ under the influence of the static field $\bm{B}_0$ is given by:
\begin{equation}
\frac{d \bm{\mu}}{d t} = \gamma \bm{\mu} \times \bm{B}_0
\end{equation}
and by separating the three components it yields:
\begin{align}
\frac{d \mu_x}{d t} &= \gamma \mu_y B_0 \\
\frac{d \mu_y}{d t} &= -\gamma \mu_x B_0 \\
\frac{d \mu_z}{d t} &= 0
\end{align}
which highlights how the magnetic moment precesses around the $z$ axis with frequency $\omega_0 = \gamma B_0$.
The radio-frequency magnetic field $\bm{B}_1$, with $B_1 \ll B_0$ and frequency $\omega$, is perpendicular to $\bm{B}_0$, lying in the $xy$ plane, and can be obtained from an oscillating field generated by a coil traversed by a radio-frequency current.
In the most general case of nuclear magnetization, if the system is influenced by the fields $\bm{B}_0$ and $\bm{B}_1$ it is in a non-equilibrium condition described by the {\it Bloch equations} for each component:
\begin{align}
\frac{d M_{z}}{d t} &= - \frac{M_z(t) - M_0}{T_1} & \mbox{longitudinal relaxation}\\
\frac{d M_{xy}}{d t} &= - \frac{M_{xy}(t)}{T_2} & \mbox{transverse relaxation}
\end{align}
where $M_0$ is the equilibrium magnetization along $\bm{B}_0$. Their integration brings:
\begin{align}
M_z(t) &= M_z(0)\exp\left(\frac{-t}{T_1}\right) + M_0\left(1 - \exp\left(\frac{-t}{T_1}\right)\right) \\
M_{xy}(t) &= M_{xy}(0)\exp\left(\frac{-t}{T_2}\right)
\end{align}
Bloch's equations are fundamental for the choice of the sequence of excitation and subsequent acquisition and elaboration of the signal.
Once the perturbation ends, it is possible to follow the de-excitation of the macroscopic magnetization $M$, which tends to realign with the field $\bm{B}_0$. The signal produced by the variation of $M$ is measured by an electromagnetic induction coil around the sample, in an orthogonal direction w.r.t.\ the static field.
The NMR signal, called FID (Free Induction Decay), is approximately monochromatic and oscillates at the Larmor frequency, decaying exponentially as a function of $T_2$.
For image formation (MRI), excitation sequences are opportunely chosen in such a way as to emphasize the dependence of the FID on three parameters: proton density $\rho$, $T_1$ and $T_2$.
\\
MRI can be differentiated in two types: T1-weighted sequences and T2-weighted sequences, which show different information.
The former are considered the most ``anatomical'' and result in images that most closely approximate the appearance of tissues: fluids have low signal intensity (black), muscle and gray-matter have an intermediate signal intensity (grey) and fat and white-matter have a high signal intensity (white).
\\
The latter instead, have a high signal intensity for fluid and fat, an intermediate intensity for muscle and grey-matter, and low intensity for white-matter, which appears dark-ish.
\\
In figure \ref{fig:diff-t1-t2} is shown a comparison between the two:
\begin{figure}[H]
\centering
\includegraphics[scale=0.4]{./images/t1_t2_diff.png}
\caption{\it Comparison between a T1-weighted slice (left) and a T2-weighted slice (right) for the same patient}
\label{fig:diff-t1-t2}
\end{figure}
\chapter{Algorithms}
A wide range of documentations and implementations have been written on the topic of Deep Learning and it is more and more difficult to move around the different sources.
In recent years, leaders in DL applications became the multiple open-source \textsf{Python} libraries available on-line as \textsf{Tensorflow}~\cite{tensorflow2015-whitepaper}, \textsf{Pytorch}~\cite{paszke2017automatic} and \textsf{Caffe}~\cite{Jia:2014:Caffe}.
Their portability and efficiency are closely related on the simplicity of the \textsf{Python} language and on the simplicity in writing complex models in a minimum number of code lines.
Only a small part of the research community uses deeper implementation in \textsf{C++} or other low-level programming languages and between them should be mentioned the \textsf{darknet project} of Redmon J. et al. which has created a sort of standard in object detection applications using a pure \textsf{Ansi-C} library.
The library was developed only for Unix OS but in its many branches (literally \emph{forks}) a complete porting for each operative system was provided.
The code is particularly optimized for GPUs using CUDA support, i.e only for NVidia GPUs.
It is particularly famous for object detection applications since its development is tightly associated to an innovative approach at multi-scale object detections called YOLO (\emph{You Only Look Once}), that recently reached its fourth release \cite{yolov4}.
The libraries built during the development of this thesis are all inspired by the efficiency and modularity of darknet and make an effort to not only replicate but expand on their work, in performances, functionalities and solved issues.
In this section I will describe the mathematical background of these models and,
alongside the theoretical explanation, discuss the associated numerical problems, tied to the development of two new libraries: \textsf{NumPyNet} \cite{NumPyNet} and \textsf{Byron} \cite{Byron}.
\section{Frameworks}
\textsf{NumPyNet} is born as an educational framework for the study of Neural Network models.
It is written trying to balance code readability and computational performances and it is enriched with a large documentation to better understand the functionality of each script.
The library is written in pure \textsf{Python} and the only external library used is \textsf{Numpy}~\cite{Numpy} (a base package for the scientific research).
As I will show in the next sections, \textsf{Numpy} allows a relatively efficient implementation of complex algorithms by keeping the code as similar as possible to the mathematic computations involved.
Despite being supplied by wide documentations, it is often difficult for novel users to move around the many hyper-links and papers cited in all common libraries.
\textsf{NumPyNet} tries to overcome this problem with a minimal mathematical documentation associated to each script and a wide range of comments inside the code.
Another ``problem'' to take into account is associated to performances.
On one hand, libraries like \textsf{Tensorflow} are certainly efficient from a computational point-of-view and the numerous wraps (like \emph{Keras}) guarantee an extremely simple user interface.
On the other hand, the deeper functionalities of the code and the implementation strategies used are unavoidably hidden behind tons of code lines.
In this way the user can perform complex computational tasks using the library as black-box package.
\textsf{NumPyNet} wants to avoid this problem using simple \textsf{Python} codes, with extreme readability also for new users, to better understand the symmetry between mathematical formulas and code.
The simplicity of this library allows us to give a first numerical analysis of the model functions and, moreover, to show the results of each function on an image to better understand the effects of their applications on real data.
Each \textsf{NumPyNet} function was tested against the equivalent \textsf{Tensorflow} implementation, using an automatic testing routine through \textsf{PyTest}~\cite{PyTest}.
The full code is open-source on the \textsf{Github} page of the project.
Its installation is guaranteed by a continuous integration framework of the code through \textsf{Travis CI} for Unix environments and \textsf{Appveyor CI} for Windows OS.
The library supports \textsf{Python} versions $\ge2.7$.
As term of comparison we discuss the more sophisticated implementation given by the \textsf{Byron} library.
\textsf{Byron} (\emph{Build YouR Own Neural network}) library is written in pure \textsf{C++} with the support of the modern standard \textsf{C++17}.
We deeply use the \textsf{C++17} functionality to reach the better performances and flexibility of our code.
What makes \textsf{Byron} an efficient alternative to the competition is the complete multi-threading environment in which it works.
Although the most common Neural Network libraries are optimized for GPU environments, there are only few implementations which exploit the full set of functionalities of a multiple CPUs architecture.
This gap discourages multiple research groups on the usage of such computational intensive models in their applications.
\textsf{Byron} works in a fully parallel section in which each single computational function is performed using the entire set of available cores.
To further reduce the time of thread spawning, and so optimize as much as possible the code performances, the library works using a single parallel section which is opened at the beginning of the computation and closed at the end.
The \textsf{Byron} library is released under \textsf{MIT} license and publicly available on the \textsf{Github} page of the project.
The project includes a list of common examples like object detection, super resolution, segmentation.
The library is also completely wrapped using \textsf{Cython} to enlarge the range of users also to the \textsf{Python} ones.
The complete guide about its installation is provided; the installation can be done using \textsf{CMake}, \textsf{Make} or \textsf{Docker} and the \textsf{Python} version is available with a simple \textsf{setup.py}.
The testing of each function is performed using \textsf{Pytest} framework against the \textsf{NumPyNet} implementation (faster and lighter to import than \textsf{Tensorflow}) \cite{nicotesi}.
\section{Layers}
As described above, a neural network can be considered as a composition of functions: for this reason every Deep Learning framework (e.g. Keras/Tensorflow, Pytorch, Darknet) implements each function as an independent object called {\it Layer}. In Byron and NumPyNet, each layer contains at least 3 methods:
\begin{itemize}
\setlength\itemsep{-0.3em}
\item {\bf forward} the forward method computes the output of the layer, given the output of the previous layer as input.
\item {\bf backward} the backward method is essential for the training phase of the model: indeed, it computes all the updates for the layer weights and backpropagates the error to the previous layers in the chain.
\item {\bf update} the update method applies the given update rules to the layer's weights.
\end{itemize}
By stacking different kind of layers one after another, it is possible to build
complex models with tens of millions of parameters.
For the purposes of this work, I'm going to describe layers used in super resolution, however, Byron is developed also for different applications (object detection, classification, segmentation, style transfer, natural language processing etc...) and as such, many more layers are available.
\subsection*{Convolutional Layer}
A Convolutional Neural Network (CNN) is a specialized kind of neural network for processing data that has known grid-like topology \cite{Goodfellow-et-al-2016}, like images, that can be considered as a grid of pixels.
The name indicates that at least one of the functions employed by the network is a convolution. In a continuous domain the convolution between two functions $f$ and $g$ is defined as:
\begin{equation}
(f * g)(t) = \int_{-\infty}^{+\infty} f(\tau)g(t-\tau)d\tau
\end{equation}
The first function $f$ is usually referred to as the input and the second function $g$ as kernel.
For Image Processing applications we can define a 2-dimensional discrete version of the convolution in a finite domain using an image $I$ as input and a 2 dimensional kernel $k$:
\begin{equation}
C[i, j] = \sum_{u=-N}^N \sum_{v=-M}^M k[u, v] \cdot I[i-u, j-v]
\label{eq:conv}
\end{equation}
where $C[i, j]$ is the pixel value of the output image and $N$, $M$ are the kernel dimensions.
Practically speaking, a convolution is performed by sliding a kernel of dimension $N \times M$ over the image, each kernel position corresponds to a single output pixel, the value of which is calculated by multiplying together the kernel value and the underlaying pixel value for each cell of the kernel and summing all the results, as shown in figure \ref{fig:convolution}:
\begin{figure}[H]
\centering
\includegraphics[scale=0.5]{./images/conv.png}
\caption{\it Visual representation of a convolution of an Image $I$ with a kernel of size 3}
\label{fig:convolution}
\end{figure}
The convolution operation is also called {\it filtering}. By choosing the right kernel (filter) it is possible to highlight different features. For this reason the convolution operation is commonly used in image analysis: some of the most common applications are denoising, edge detection and edge enhancement.
The convolutional layer (CL) object is the most used layer in DL image analysis, therefore its implementation must be as efficient as possible.
Its purpose is to perform multiple (sometimes thousands) convolution over the input to extract different high-level features, which are compositions of many low-level attributes of the image (e.g edges, simple shapes).
In the brain/neuron analogy, every entry in the output volume can also be interpreted as an output of a neuron that looks at only a small region, the neuron's {\it receptive field} in the input and shares parameters with all the neuron spatially close.
As more CLs are stacked, the receptive field of a single neuron grows and with that, the complexity of the features it is able to extract.
The local nature of the receptive field allows the models to recognize features regardless of the position in the images. In other words, it is independent from translations \cite{Goodfellow-et-al-2016}.
The difference from a traditional convolutional approach is that instead of using pre-determined filters, the network is supposed to learn its own.
A CL is defined by the following parameters:
\begin{itemize}
\setlength\itemsep{-0.2em}
\item [-] {\bf kernel size}: it is the size of the sliding filters. The depth of the filters is decided by the depth of the input images (which is the number of channels). The remaining 2 dimensions (width and height) can be independent from one another, but most implementations require squared kernels.
\item [-] {\bf strides}: defines the movement of the filters. With a low stride (e.g. unitary) the windows tends to overlap. With high stride values we have less overlap (or none) and the dimension of the output decrease.
\item [-] {\bf number of filters}: is the number of different filters to apply to the input. It also indicates the depth of the output.
\item [-] {\bf padding}: is the dimension of an artificial enlargement of the input to allow the application of filters on borders. Usually, it can be interpreted as the number of rows/columns of pixel to add to the input, however some libraries (e.g Keras) consider it only as binary: in case is true, only the minimum number of rows/columns are appended to keep the same spatial dimension.
\end{itemize}
Given the parameters, it is straightforward to compute the number of weights and bias needed for the initialization of the CL: indeed, suppose an image of dimensions $(H, W, C)$ slided by $n$ different 3-D filters of size $(k_x, k_y)$ with strides $(s_x, s_y)$ and padding $p$, then:
\begin{align}
\# weights &= n \times k_x \times k_y \times C \\
\# bias &= n
\end{align}
Note that the number of weights does not depend on the input spatial size but only on its depth. It is important because a fully convolutional network can receive images of any size as long as they have the correct depth. Moreover, using larger inputs does not require more weights, as is the case for fully connected structures.
The output dimensions are $(out\_H, out\_W, n)$ where:
\begin{align}
out\_H &= \left\lfloor\frac{H - k_x + 2p}{s_x}\right\rfloor + 1 \\
out\_W &= \left\lfloor\frac{W - k_y + 2p}{s_y}\right\rfloor + 1
\end{align}
Even if the operation can be implemented as described above in equation \ref{eq:conv}, this is never the case: it is certainly easier but also orders of magnitude slower than more common algorithms.
A huge speed up in performances is given by realising that a discrete convolution can be viewed as a single matrix multiplication. The first matrix has as rows each filter of the CL, while the second matrix has as columns every window of the image traversed by the kernels, as shown in figure \ref{fig:im2col}.
\begin{figure}[h]
\centering
\includegraphics[scale=0.35]{./images/im2col.png}
\caption{\it Scheme of the im2col algorithm using a $2 \times 2 \times 3$ filter with stride 1 on a $4 \times 4 \times 3$ image. The matrix multiplication is between an $n \times 12$ and a $12 \times 9$ matrix.}
\label{fig:im2col}
\end{figure}
This re-arrangement is commonly called im2col. The main downside is that a lot more memory is needed to store the newly arranged matrix.
The larger the number of kernels, the higher is the time gain of this implementation over a naive one.
Another important optimization comes from linear algebra considerations and is called {\it Coppersmith-Winograd algorithm}, which was designed to optimize the matrix product.
Suppose we have an input image of just 4 elements and a 1-D filter mask with size 3:
\begin{equation}
\mbox{img} = \left[\begin{array}{cccc} d0 & d1 & d2 & d3 \end{array}\right] \quad\quad \mbox{weights} = \left[\begin{array}{ccc} g0 & g1 & g2 \end{array}\right]
\end{equation}
\\
we can now use the \textsf{im2col} algorithm previously described and reshape our input image and weights into
\begin{equation}
\mbox{img} = \left[
\begin{array}{ccc}
d0 & d1 & d2 \\
d1 & d2 & d3
\end{array}
\right],
\quad\quad
\mbox{weights} = \left[
\begin{array}{c}
g0 \\
g1 \\
g2
\end{array}
\right]
\end{equation}
\\
given this data, we can simply compute the output as the matrix product of this two matrices:
\begin{equation}
\mbox{output} = \left[
\begin{array}{ccc}
d0 & d1 & d2 \\
d1 & d2 & d3
\end{array}
\right]
\left[
\begin{array}{c}
g0 \\
g1 \\
g2
\end{array}
\right] = \left[
\begin{array}{c}
d0 \cdot g0 + d1 \cdot g1 + d2 \cdot g2 \\
d1 \cdot g0 + d2 \cdot g1 + d3 \cdot g2
\end{array}
\right]
\end{equation}
\\
The Winograd algorithm rewrites this computation as follow:
\begin{equation}
\mbox{output} = \left[
\begin{array}{ccc}
d0 & d1 & d2 \\
d1 & d2 & d3
\end{array}
\right]
\left[
\begin{array}{c}
g0 \\
g1 \\
g2
\end{array}
\right] = \left[
\begin{array}{c}
m1 + m2 + m3 \\
m2 - m3 - m4
\end{array}
\right]
\end{equation}
\\
where
\begin{equation}
\begin{aligned}
m1 = (d0 - d2)g0\quad\quad m2 = (d1 + d2)\frac{g0 + g1 + g2}{2}
\\
m4 = (d1 - d3)g2\quad\quad m3 = (d2 - d1)\frac{g0 - g1 + g2}{2}
\end{aligned}
\end{equation}
The two fractions in $m2$ and $m3$ involve only the weights' values, so they can be computed once per filter. Moreover, the normal matrix multiplication is composed of 6 multiplications and 4 additions, while the Winograd algorithm reduces the number of multiplications to 4, which is very significant, considering that a single multiplication takes 7 clock-cycles and an addition only 3.
In Byron we provide the winograd algorithm for square kernels of size 3 and stride 1, since it is one of the most common combinations in Deep Learning and the generalization is not straightforward.
In the backward operation is important to remember that each weight in the filter contributes to each pixel in the output map.
Thus, any change in a weight in the filter will affect all the output pixels.
Note that the backward function can still be seen as a convolution between the input and the matrix of errors $\delta^l$ for the updates and as a full convolution between $\delta^l$ and the flipped kernel for the error $\delta^{l-1}$.
In the case the windows of kernels overlap, updates are the sum of all the contributing elements of $\delta^l$.
\subsection*{Pooling}
Pooling operations are down-sampling operations, so that the spatial dimensions of the input are reduced. Similarly to what happens in a CL, in pooling layers a 3-D kernel of size $k_x \times k_y \times C$ slides across an image of size $H \times W \times C$, however the operation performed by this kind of layers is fixed and does not change during the course of training.
The two main pooling functions are max-pooling and average-pooling: as suggested by the names, the former
returns the maximum value of every window of the images super-posed by the kernel, as shown in figure \ref{fig:maxpool}.
\begin{figure}[h]
\centering
\includegraphics[scale=0.35]{./images/maxpool.png}
\caption{\it Scheme of maxpool operations with a kernel of size $2 \times 2$ and stride $2$ over an image of size $4 \times 4$. Picture from CS231n}
\label{fig:maxpool}
\end{figure}
The latter instead, returns the average value of the window and can be seen as a convolution where every weight in the kernel is $\frac{1}{k_x \cdot k_y}$.
The results expected from an Average pooling operations are shown in figure \ref{fig:avgpool:ex}.
\begin{figure}[h]
\centering
\includegraphics[scale=0.7]{./images/avgpool_layer.png}
\caption{\it Average pooling applied to a test image: (left) the original image, (center) average pooling with a $3 \times 3$ kernel, (right) average pooling with a $30 \times 30$ kernel. The images have been obtained using NumPyNet}
\label{fig:avgpool:ex}
\end{figure}
Other popular pooling functions include the $L^2$ norm of a rectangular neighborhood or a weighted average based on the distance from the central pixel.
A typical block of a convolutional network consists of three stages: In the first stage a CL performs several convolutions in parallel, in the second stage each convolution result is run through a non-linear activation function (sometimes called {\it detector}) and in the third stage a pooling function is used to further modify the output.
The modification brought by pooling is helpful in different ways: first of all, it is a straightforward computational performance improvement, since less features also means less operations.
Moreover, in all cases, pooling helps to make the representation approximately invariant to small translations of the input and invariance to local translation can be a useful property if the objective is to decide whether a feature is present rather than where it is located \cite{Goodfellow-et-al-2016}.
The reductions of features can also prevent over-fitting problems during training, improving the general performances of the model.
A pooling layer is defined by the same parameters as a CL, minus the number of filters; moreover, also the output dimensions for Pooling layers are the same as for CLs, however, they have no weights to be trained.
Due to the similarities with the CL it is possible to implement a pooling layer through the im2col algorithm: as an example, the NumPyNet implementation shown in the snippet below makes use of the function \texttt{asStride} to create a \texttt{view} of the input array:
\lstset{style=snippet}
\begin{lstlisting}[language=Python, caption=NumPyNet version of Maxpool function, label=code:py_avgpool]
import numpy as np


class Maxpool_layer(object):
  """Max-pooling layer.

  Slides a (kx, ky) window over the input tensor (batch, w, h, c) and
  keeps the maximum value of each window.
  """

  def __init__(self, size=(3, 3), stride=(2, 2)):
    # kernel size and stride along the two spatial dimensions
    self.size = size
    self.stride = stride
    self.batch, self.w, self.h, self.c = (0, 0, 0, 0)
    self.output, self.delta = (None, None)

  def _asStride(self, input, size, stride):
    """Return a zero-copy 6-D view of `input` where the last two axes
    index the pixels inside each pooling window.
    """
    batch_stride, s0, s1 = input.strides[:3]
    batch, w, h = input.shape[:3]
    kx, ky = size
    st1, st2 = stride
    # output spatial dims follow the floor((dim - kernel)/stride) + 1 rule
    view_shape = (batch, 1 + (w - kx)//st1, 1 + (h - ky)//st2) + input.shape[3:] + (kx, ky)
    strides = (batch_stride, st1 * s0, st2 * s1) + input.strides[3:] + (s0, s1)
    subs = np.lib.stride_tricks.as_strided(input, view_shape, strides=strides)
    return subs

  def forward(self, input):
    """Forward pass: compute the max of every pooling window.

    `input` is expected with shape (batch, w, h, c).
    """
    kx, ky = self.size
    # bug-fix: _asStride requires the kernel size and stride arguments
    view = self._asStride(input, self.size, self.stride)
    # maximum over the two window axes of the 6-D view
    self.output = np.nanmax(view, axis=(4, 5))
    # store the per-window argmax positions for the backward pass
    new_shape = view.shape[:4] + (kx * ky, )
    self.indexes = np.nanargmax(view.reshape(new_shape), axis=4)
    self.indexes = np.unravel_index(self.indexes.ravel(), shape=(kx, ky))
    # bug-fix: the error buffer has the same shape as the output
    self.delta = np.zeros(shape=self.output.shape, dtype=float)
    return self
\end{lstlisting}
A \texttt{view} is a special \texttt{numpy} object which retains the same information of the original array arranged in a different way, but without occupying more memory. In this case, the re-arrangement is very similar to an im2col, with the only difference that we are not bound to any number of dimensions. The resulting tensor has indeed 6 dimensions.
Since no copy is produced in this operation we can obtain a faster execution.
In pooling layers the backward function is similar to what we saw for convolutional layers; this time we don't have to compute the weights updates though, only the error to back-propagate along the network.
For maxpool layers, only the maximum input pixel for every window is involved in the backward pass. Indeed, if we consider the simple case in which the forward function is:
\begin{equation}
m = max(a, b)
\end{equation}
and, as described in the dedicated chapter, we know that $\frac{\partial C}{\partial m}$ is the error passed back from the next layer: the objective is to compute $\frac{\partial C}{\partial a}$ and $\frac{\partial C}{\partial b}$.
If $a > b$ we have:
\begin{equation}
m = a \quad \Rightarrow \quad \frac{\partial C}{\partial m} = \frac{\partial C}{\partial a}
\end{equation}
$m$ does not depend on $b$, so $\frac{\partial C}{\partial b} = 0$.
So the error is passed only to those pixels whose value is the maximum in the considered window, while the others are zero.
In figure \ref{fig:maxpool:ex} an example of forward and backward pass for a maxpool kernel of size 30 and stride 20.
\begin{figure}[H]
\centering
\includegraphics[scale=0.53]{./images/maxpool_30_20.png}
\caption{\it Max pooling applied to a test image: (left) the original image, (center) max pooling with a $30 \times 30$ kernel and stride $20$, (right) max pooling errors image. Only few of the pixels are responsible for the error backpropagation.
The images have been obtained using NumPyNet}
\label{fig:maxpool:ex}
\end{figure}
The backward pass for the average pool layer is the same as for the CL, considering that in this case the ``weights'' are fixed.
% According to \cite{nopool}, it is possible to build models exclusively out of convolutional layers without the need for pooling and reach state-of-the art performances
\subsection*{Shortcut Connections}
An important advancement in network architecture has been brought by the introduction of Shortcut (or Residual) Connections \cite{residual}. It is well known that deep models suffer from {\it degradation problems} after reaching a maximum depth. Adding more layers, thus increasing the depth of the model, saturates the accuracy which eventually starts to rapidly decrease.
The main cause of this degradation is not overfitting, but numerical instability tied to gradient backpropagation: indeed, as the gradient is back-propagated through the network, repeated multiplications can make those gradients very small or, alternatively, very big.
This problem is well known in Deep Learning and takes the name of {\it vanishing}/{\it exploding} gradients and it makes it almost impossible to train very large models, since early layers may not learn anything even after hundreds of epochs.
A residual connection is a special shortcut which connects 2 different part of the network with a simple linear combination.
Instead of learning a function $F(x)$ we try to learn $H(x) = F(x) + x$, as shown in figure \ref{fig:shortcut}:
\begin{figure}[h]
\centering
\includegraphics[scale=0.4]{./images/shortcut.png}
\caption{\it Scheme of the shortcut layer as designed by the authors \cite{residual}. The output of the second layer become a linear combination of the input x and its own output.}
\label{fig:shortcut}
\end{figure}
During the back propagation the gradient of higher layers can easily pass to the lower layers through the shortcut, without being mediated by the repeated multiplications which may cause vanishing or exploding gradients.
\\
Both in NumPyNet and Byron, we chose to generalize the formula as:
\begin{equation}
H(x_1, x_2) = \alpha x_1 + \beta x_2
\end{equation}
Where $x_1$ is the output of the previous layer and $x_2$ is the output of the layer selected by the \texttt{index} parameter. Indeed, even the shortcut connection can be implemented as a stand-alone layer, defined by the following parameters:
\begin{itemize}
\setlength\itemsep{-0.2em}
\item {\bf index} is the index of the second input of this layer $x_2$ (the first one $x_1$ is the output of the previous layer).
\item {\bf alpha} the first coefficient of the linear combination, multiplied by $x_1$.
\item {\bf beta} the second coefficient of the linear combination, multiplied by $x_2$.
\end{itemize}
The backward function is simply:
\begin{equation}
\frac{\partial C}{\partial x_1} = \frac{\partial C}{\partial H}\frac{\partial H}{\partial x_1} = \delta \cdot \alpha
\end{equation}
for the first layer and:
\begin{equation}
\frac{\partial C}{\partial x_2} = \frac{\partial C}{\partial H}\frac{\partial H}{\partial x_2} = \delta \cdot \beta
\end{equation}
for the second layer. Again, $\delta$ is the error backpropagated from the next layer.
Residuals connections were first introduced for image classification problems, but they rapidly become part of numerous models for every kind of application tied to Image Analysis.
\subsection*{Pixel Shuffle}
Using pooling and convolutional layers with non-unitary strides is a simple way to downsample the input dimension.
For some applications though, we may be interested in upsampling the input, for example:
\begin{itemize}
\setlength\itemsep{-0.2em}
\item in image to image processing (input and output are images of the same size) it is common to perform a compression to an internal encoding (e.g. Deblurring, U-Net Segmentation).
\item project feature maps to a higher dimensional space, i.e. to obtain an image of higher resolution (e.g. Super-Resolution)
\end{itemize}
for those purposes the {\it transposed convolution} (also called {\it deconvolution}) was introduced.
The transposed convolution can be treated as a normal convolution with a sub-unitary stride, obtained by upsampling the input with empty rows and columns and then applying a convolution with unitary stride, as shown in figure \ref{fig:deconv}.
\begin{figure}[h]
\centering
\includegraphics[scale=0.35]{./images/deconv.png}
\caption{\it example of deconvolution: (left) a normal convolution with size 3 and stride 1, (right) after applying a "zeros upsampling" the convolution of size 3 and stride 1 become a deconvolution}
\label{fig:deconv}
\end{figure}
Although it works, the transposed convolution is not efficient in terms of computational and memory cost, therefore not suited for modern convolutional neural networks.
An alternative is the recently introduced {\it sub-pixel convolution} \cite{pixelshuffle} (also called Pixel Shuffle). The main advantages over the deconvolution operation is the absence of weights to train: indeed the operation performed by the Pixel Shuffle (PS) Layer is deterministic and it is very efficient if compared to the deconvolution, since it is only a re-arrengement of the pixels.
Given a scale factor $r$, the PS organizes an input $H \times W \times C \cdot r^2$ into an output tensor $r \cdot H \times r \cdot W \times C$, which generally is the dimension of the high resolution space.
So, strictly speaking, the PS does not perform any upsample, since the number of pixels stays the same.
In figure \ref{fig:pixelshuffle1} is shown an example with $C=1$:
\begin{figure}[h]
\centering
\includegraphics[scale=0.35]{./images/pixelshuffle.png}
\caption{\it Example of pixel shuffling proposed by the authors \cite{pixelshuffle}. In this example, $r^2$ features maps are re-arranged into a single-channeled high resolution output.}
\label{fig:pixelshuffle1}
\end{figure}
As suggested by the authors, the best practice to improve performances is to upscale from low resolution to high resolution only at the very end of the model.
In this way the CL can efficiently produce a high number of low resolution feature maps that the PS can organize into the final output.
In both NumPyNet and Byron, the pixel shuffle layer is defined only by the \texttt{scale} parameter, which leads the entire transformation.
In the first case, it is possible to implement forward and backward using the functions \texttt{split}, \texttt{reshape}, \texttt{concatenate} and \texttt{transpose} of the \texttt{numpy} library \cite{Numpy}.
This implementation has been tested against \texttt{tensorflow}'s \texttt{depth\_to\_space} and \texttt{space\_to\_depth}.
Despite being available in most deep learning library, a low level \texttt{C++} implementation for the PS algorithm is hard to find.
In Byron we propose a dynamic algorithm able to work for both \texttt{channel last} and \texttt{channel first} input.
The algorithm is essentially a re-indexing of the input array in six nested for-loops. The first solution taken into account during the development was the contraction of the loops into a single one using divisions to obtain the correct indexes: however the amount of required divisions weighs on the computational performances, given that divisions are the most expensive operation in terms of CPU clock-cycles.
The backward function of this layer does not involve any gradient computation: instead, it is the inverse of the re-arrengement performed in the forward function.
\subsection*{Batch Normalization}
When training a neural network, the standard approach is to separate the dataset in groups, called {\it batches} or {\it mini-batches}.
In this way the network can be trained with multiple input at a time and the updates for the weights are usually computed by averaging in the batch.
The number of examples in each batch is called \textit{batch size}: it can vary from 1 to the size of the dataset.
Using batch sizes different from one is beneficial in several ways.
First, the gradient of the loss over a mini-batch is a better estimate of the gradient over the train set, whose quality improves as the batch size increases, but using the entire train set can be very costly in terms of memory usage and often impossible to achieve.
Second, it can be much more efficient in modern architecture due to the parallelism instead of performing M sequential computations for single examples. \cite{batchnorm}
Batch normalization is the operation that normalizes the features of the input along the batch axis, which allows to overcome a phenomenon in Deep Network training called {\it internal covariate shift}: whenever the parameters of the model change, the input distributions of every layer change accordingly.
This behaviour produces a slow down in the training convergence because each layer has to adapt itself to a new distribution of data for each epoch. Moreover, the parameters must be carefully initialized.
By making the normalization a part of the model architecture, the layer acts also as a regularizer, which in turn allows better generalization perfomances.
Let $M$ be the number of examples in the group and $\epsilon$ a small variable added for numerical stability, then the batch normalization function is defined as:
\begin{align}
&\mu = \frac{1}{M} \sum_{i=1}^{M} x_i \\
&\sigma^2 = \frac{1}{M} \sum_{i=1}^{M} (x_i - \mu)^2 \\
&\hat x_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \\
&y_i = \gamma \hat x_i + \beta
\end{align}
where $\gamma$ and $\beta$ are the trainable weights of this layer. In the case of a tensor of images of size $M \times H \times W \times C$ all the quantities are multidimensional tensors as well and all the operations are performed element-wise.
The backward function can be computed following the chain rule for derivatives. As usual, define $\delta^l = \frac{\partial C}{\partial y}$ as the error coming from the next layer, the goal is to compute the updates for $\gamma$ and $\beta$ and the error for the previous layer.
The updates are straightforward:
\begin{align}
&\frac{\partial C}{\partial \gamma} = \frac{\partial C}{\partial y_i} \cdot \frac{\partial y_i}{\partial \gamma} = \sum_{i=1}^{M} \delta_i^l \cdot \hat x_i \\
&\frac{\partial C}{\partial \beta} = \frac{\partial C}{\partial y_i} \cdot \frac{\partial y_i}{\partial \beta} = \sum_{i=1}^M \delta_i^l
\end{align}
while the error requires more steps:
\begin{equation}
\frac{\partial C}{\partial x} := \delta^{l-1} = \delta^l \cdot \frac{\partial y}{\partial x}
\end{equation}
where:
\begin{align}
\frac{\partial y}{\partial x} = \frac{\partial y}{\partial \hat x}
(\frac{\partial \hat x}{\partial \mu}\frac{\partial \mu}{\partial x} +
\frac{\partial \hat x}{\partial \sigma^2}\frac{\partial \sigma^2}{\partial x} +
\frac{\partial \hat x}{\partial \sigma^2}\frac{\partial \sigma^2}{\partial \mu}\frac{\partial \mu}{\partial x})
\end{align}
By considering all the derivatives, we find:
\begin{equation}
\frac{\partial C}{\partial x_i} := \delta^{l-1}_i =
\frac{M\delta^l_i \cdot \gamma_i - \sum_{j=1}^{M} \delta^l_j \cdot \gamma_i - \hat x_i \cdot \sum_{j=1}^{M} \delta^l_j \cdot \hat x_j}
{M\sqrt{\sigma^2 + \epsilon}}
\end{equation}
Knowing the correct operations, an example of implementation is shown in the snippet \ref{code:py_batchnorm}:
\lstset{style=snippet}
\begin{lstlisting}[language=Python, caption=NumPyNet version of batchnorm function, label=code:py_batchnorm]
def forward(self, inpt):
  """Forward pass of the batch normalization layer: normalize `inpt`
  along the batch axis, then apply the learned scale and bias.
  """
  self._check_dims(shape=self.input_shape, arr=inpt, func='Forward')

  # keep a copy of the raw input: it is needed by the backward pass
  self.x = inpt.copy()

  # batch statistics; note that `var` stores the INVERSE standard deviation
  self.mean = self.x.mean(axis=0)
  self.var = 1. / np.sqrt(self.x.var(axis=0) + self.epsil)

  # normalized input followed by the affine transformation y = gamma*x_norm + beta
  self.x_norm = (self.x - self.mean) * self.var
  self.output = self.x_norm * self.scales + self.bias

  # fresh error buffer for back-propagation
  self.delta = np.zeros(shape=self.out_shape, dtype=float)

  return self
def backward(self, delta=None):
invN = 1. / np.prod(self.mean.shape)
# Those are the explicit computation of every derivative involved in BackPropagation
self.bias_update = self.delta.sum(axis=0)
self.scales_update = (self.delta * self.x_norm).sum(axis=0)
self.delta *= self.scales