% 00-NumDNN-Notation.tex -- Notation slides for "Numerical Methods for Deep Learning"
\documentclass[12pt,fleqn]{beamer}
\input{beamerStyle.tex}
\input{abbrv.tex}
\title[Notations]{Notation}
\subtitle{Numerical Methods for Deep Learning}
\date{}
\begin{document}
\makebeamertitle
\begin{frame}
\frametitle{Data}
\begin{itemize}
\item $n$ - number of examples
\item $n_f$ - dimension of feature vector
\item $n_c$ - dimension of prediction (e.g., number of classes)
\item $\bfy_1,\bfy_2,\ldots,\bfy_n \in\R^{n_f}$ - input features
\item $\bfY = \left[\bfy_1,\bfy_2,\ldots,\bfy_n \right] \in \R^{n_f \times n}$ - feature matrix
\item $\bfc_1,\bfc_2,\ldots,\bfc_n \in \R^{n_c}$ - output observations
\item $\bfC = \left[\bfc_1, \bfc_2,\ldots,\bfc_n\right] \in \R^{n_c \times n}$ - observation matrix
\item $\R, \R_+, \R_{++}$ - all, non-negative, and positive real numbers
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Neural Networks}
\begin{itemize}
\item $f(\bfy,\theta) = \bfc$ - model represented by neural net
\item $\theta \in \R^{n_p}$ - parameters of model
\item $\theta^{(1)}, \theta^{(2)}, \ldots$ - parts of weights. Division clear from context. Examples
\begin{enumerate}
\item $\theta^{(j)}$ are weights of $j$th layer.
\item $\theta^{(1)}$ are weights for convolution kernel, $\theta^{(2)}$ are weights for bias
\end{enumerate}
\item $N$ - number of layers
\item $\bfK$ - linear operator applied to features
\item $b$ - bias
\item $\sigma : \R\to\R$ - activation function
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Optimization and Loss}
\begin{itemize}
\item $E(\bfY,\bfC,\bfW)$ - loss function parameterized by weights $\bfW$
\item $\phi : \R^{k} \to \R$ - generic objective function
\item $\theta^*$ - minimizer of a function, i.e.,
$$
\theta^* = \argmin_{\theta} \phi(\theta)
$$
\item $\theta_1,\theta_2, \ldots$ - iterates
\item $\bfd, \bfD$ - search directions
\item $\alpha$ - step size
\item $\lambda$ - regularization parameter
\item $\nabla_\bfx F$ - gradient, if $ F : \R^k \to \R^l $, then $\nabla F(\bfx) \in \R^{k\times l}$
\item $\bfJ_\bfx F$ - Jacobian of $F$ with respect to $\bfx$, $\bfJ_{\bfx} F = (\nabla_\bfx F)^\top$
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Linear Algebra - 1}
\begin{itemize}
\item $\bfe_k \in\R^k$ - vector of all ones
\item $\bfI_k$ - $k\times k$ identity matrix
\item $\kappa(\bfA)$ - condition number of $\bfA$
\item $\sigma_1(\bfA)\geq\ldots\geq\sigma_k(\bfA)\geq 0$ - singular values of $\bfA$
\item $\lambda_1(\bfA),\ldots$ - eigenvalues of $\bfA$
\item ${\rm tr}(\bfA)$ - trace of square matrix, i.e., sum of diagonal elements
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Linear Algebra - 2}
\begin{itemize}
\item $\odot$ - Hadamard product
$$
\bfC_{ij} = \bfA_{ij} \cdot \bfB_{ij}, \quad \text{for} \quad \bfB,\bfA \in \R^{k\times l}
$$
MATLAB: \texttt{C = A.*B}
\item $\otimes$ - Kronecker product
$$
\bfA \otimes \bfB = \left(
\begin{array}{rrrr}
\bfA_{11} \bfB & \bfA_{12} \bfB & \dots & \bfA_{1l} \bfB \\
\vdots & \vdots & \ddots & \vdots\\
\bfA_{k1} \bfB & \bfA_{k2} \bfB & \dots & \bfA_{kl} \bfB \\
\end{array}
\right)
$$
MATLAB: \texttt{C = kron(A,B)}
\item ${\rm vec}(\bfA)$ - reshape matrix $\bfA$ into vector (column-wise).
$$
\text{Example:}\quad
{\rm vec}\left(
\left(
\begin{array}{rr}
\bfA_{11} & \bfA_{12} \\
\bfA_{21} & \bfA_{22}
\end{array}
\right)
\right) = \left(
\begin{array}{r}
\bfA_{11}\\
\bfA_{21} \\
\bfA_{12}\\
\bfA_{22}
\end{array}
\right)
$$
MATLAB: \texttt{a = A(:)}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Linear Algebra - 3}
\begin{itemize}
\item $\bfA^{\dag}$ - Moore-Penrose inverse of full-rank matrix $\bfA$, i.e.,
$$
\bfA^{\dag} = \begin{cases}
(\bfA^{\top} \bfA)^{-1}\bfA^{\top}, & \bfA \text{ has linearly independent columns}\\
\bfA^\top (\bfA \bfA^\top)^{-1}, & \bfA \text{ has linearly independent rows}\\
\end{cases}
$$
\item ${\rm mat}(\bfv,k,l)$ - reshape vector $\bfv\in\R^{kl}$ into matrix. $k,l$ omitted when dimension clear from context. Note
$$
{\rm mat}({\rm vec} (\bfA)) = \bfA.
$$
MATLAB: \texttt{V = reshape(v,k,l)}.
\item ${\rm diag}(\bfv)$ - diagonal matrix with elements of $\bfv \in \R^k$ on diagonal
MATLAB: \texttt{V = diag(v(:))}
\item ${\rm diag}(\bfA)$ - diagonal matrix obtained by vectorizing $\bfA$
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Acronyms}
\begin{itemize}
\item CG - Conjugate Gradient Method
\item VarPro - Variable Projection
\item SD - Steepest Descent
\item SGD - Stochastic Gradient Descent
\item SA - Stochastic Approximation
\item SAA - Sample Average Approximation
\item SPD - symmetric positive definite
\item SPSD - symmetric positive semi-definite
\item CV - Cross Validation
\end{itemize}
\end{frame}
\end{document}