Computational-Intelligence-Lab-ETH-FS19/WordEmbedding.tex at master · anklinv/Computational-Intelligence-Lab-ETH-FS19 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
\section*{Word Embeddings}
\textbf{Distributional Model:}\\
$p_\theta(w|w')$ = Pr[$w$ occurs in context of $w'$]\\
\textbf{Log-likelihood:}\\
$\mathcal{L}(\theta; \mathbf{w}) = \sum_{t=1}^T\sum_{\Delta \in \mathcal{I}}{\log p_\theta(w^{(t+\Delta)}|w^{(t)})}$\\
\textbf{Latent Vector Model:} $w \rightarrow (\mathbf{x}_w, b_w) \in \mathbb{R}^{D+1}$\\ $p_{\theta}(w|w') = \frac{\exp[\langle \mathbf{x}_w,\mathbf{x}_{w'}\rangle + b_w]}{\sum_{v\in V}{\exp[\langle \mathbf{x}_v,\mathbf{x}_{w'}\rangle + b_v ]}}$ (soft-max).\\
\textbf{Skip Gram Model:}\\
$\mathcal{L}(\theta ;\mathbf{w}) = \sum_t \sum_{\Delta\in \mathcal{I}} \Bigl[ b_{w^{(t+\Delta)}}+ \langle \mathbf{x}_{w^{(t+\Delta)}}, \mathbf{x}_{w^{(t)}}\rangle - \log \sum_{v\in \mathcal{V}} \exp [\langle \mathbf{x}_v, \mathbf{x}_{w^{(t)}} \rangle + b_v] \Bigr]$\\
\textbf{Modifications:}\\
$\log p_{\theta}(w|w') = \langle \mathbf{y}_{w} , \mathbf{x}_{w'} \rangle + b_w$, with word embedding $\mathbf{y}_w$ and context embedding $\mathbf{x}_{w'}$. Alternatives to MLE (computing the partition function is hard): negative sampling (turns the objective into logistic classification), PMI.
\subsection*{GloVe (Weighted Square Loss)}
\textbf{Co-occurrence Matrix:}\\
$\mathbf{N} = (n_{ij}) \in \mathbb{N}^{|V|\times|C|}$, $n_{ij} = \#\text{occ. of }w_i$ in context $w_j$\\
\textbf{Obj:} $\mathcal{H}(\theta;\mathbf{N}) = \sum_{i,j} f(n_{ij})(\log n_{ij} - \log \tilde p_\theta(w_i |w_j))^2$\\
Unnorm. dist: $\tilde p_\theta(w_i|w_j) =\exp[\langle \mathbf{x}_i, \mathbf{y}_j \rangle + b_i + c_j]$
with $f(n) = \min\{1, (\frac{n}{n_{max}})^\alpha\}$, $\alpha \in (0;1]$.\\
\textbf{normalized:} need to compute partition function, but probabilities sum to 1 and hence cannot be large everywhere. \\
\textbf{unnormalized: }can use two-sided loss function \\
Perform SGD to find local minimum\\
1. sample $(i,j) \text{ u.a.r, s.t. } n_{ij}>0$\\
2. $\mathbf{x}_i^{new} \leftarrow \mathbf{x}_i + 2\eta f(n_{ij})(\log n_{ij} - \langle \mathbf{x}_i, \mathbf{y}_j \rangle)\mathbf{y}_j$\\
3. $\mathbf{y}_j^{new} \leftarrow \mathbf{y}_j + 2\eta f(n_{ij})(\log n_{ij} - \langle \mathbf{x}_i, \mathbf{y}_j \rangle)\mathbf{x}_i$ \\
\textbf{Discussion:}
can model analogies and relatedness, but antonyms are usually not well captured.