cf_paper/cu_app_fbf.tex at master · wotaoyin/cf_paper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
% !TEX root = ./amsa_main.tex
\subsubsection{Network}\label{sec:network}
Consider that $m$ worker agents and one master agent form a star-shaped network, where the master agent at the center connects to each of the worker agents. The $m+1$ agents collaboratively solve the consensus problem:  $$\Min_{x} \sum_{i=1}^m f_i(x),$$ where $x\in \RR^d$ is the common variable and each proximable function $f_i$ is held privately by agent $i$. The problem can be reformulated as
\begin{align}
\Min_{x_1,\dots,x_m,y\in \RR^d} F(x) := \sum_{i=1}^mf_i(x_i),\quad \St~ x_i=y, ~\forall i \in [m],
\end{align}
{which has the  KKT condition}
\begin{equation}\label{pro:decentral}
0\in\underbrace{
\begin{bmatrix}
\partial F&0&0\\
0&0&0 \\
0&0&0
\end{bmatrix}}_{\mbox{operator}~\cA}\begin{bmatrix}
x\\
y\\
s
\end{bmatrix}+\underbrace{\begin{bmatrix}
0 & 0&I\\
0 & 0 & -e^\top \\
I & -e & 0
\end{bmatrix}}_{\mbox{operator}~\cC}\begin{bmatrix}
x\\
y\\
s
\end{bmatrix},
\end{equation}
where $s$ is the dual variable.

Applying the FBFS scheme \eqref{eqn:fbf} to~\eqref{pro:decentral} yields the following full update:
\begin{subequations}
\begin{align}
x_i^{k+1} &= \prox_{\gamma f_i}(x_i^k-\gamma s_i^k)+\gamma^2 x_i^{k}-\gamma^2 y^k-2\gamma s_i^k, \label{fbfs_a} \\
y^{k+1} &= (1+m\gamma^2)y^k +3\gamma \sum_{j=1}^m s_j^k -\gamma^2 \sum_{j=1}^m x_j^k,\\
s_i^{k+1} &= s_i^k-2\gamma x_i^k-\gamma \prox_{\gamma f_i}(x_i^k-\gamma s_i^k)+3\gamma y^k+ \gamma^2 \sum_{j=1}^m s_j^k, \label{fbfs_c}
\end{align}
\end{subequations}
where \eqref{fbfs_a} and \eqref{fbfs_c} are applied to all $i\in[m]$.
Hence, for each $i$, we group  $x_i$ and $s_i$ together and assign them on agent $i$.  {We let the master agent } maintain $\sum_j s_j$ and $\sum_j x_j$. Therefore, {in the FBFS coordinate update, updating any $(x_i,s_i)$ needs only $y$ and $\sum_j s_j$ from the master agent, and updating $y$ is done on the master agent. In synchronous parallel setting, at each iteration, each worker agent $i$ computes $s_i^{k+1}, x_i^{k+1}$, then the master agent collects the updates from all of the worker agents and then updates $y$ and $\sum_j s_j$. The above update can be relaxed to be asynchronous. In this case, the master and worker agents work concurrently, the master agent updates $y$ and $\sum_j s_j$ as soon as it receives the updated $s_i$ and $x_i$ from any of the worker agents. It also periodically broadcasts $y$ back to the worker agents.

%can consider $(x_i,s_i)$ as one coordinate and store each pair of such variables on one agent. Updating $(x_i,s_i)$ will need the latest value of $y$ from the master node only, and updating $y$ will need the latest values all  $(x_i,y_i)$ from all the agents. However, the information needed is just the summation of $x_i$ and $s_i$, which is easy to update.

\cut{
\subsection{Total Variation Image Processing}\label{sec:tvip}
The most widely studied (non-local) Total Variation (TV) model \cite{?} for image processing can been written as
\begin{align}
\Min_x h(Ax) + f(x)
\end{align}
where $f(Ax)$ is the (non-local) TV regularization term and $f(x)$ is the data fitting term that dependent on the image processing problems. If we rearange the image pixels into a vector, and $A$ can be expressed as a sparse matrix for (non-local) TV. For example, the matrix $A$ for the isotropic TV on a $m$ by $n$ image has the size $2mn\times mn$ and can be expressed as
\begin{align}
(Ax)_{(i-1)*m+j} = x_{i*m+j}-x_{(i-1)*m+j},
\end{align}
i.e., the difference between values at the $(i+1,j)$ pixel and the $(i,j)$ pixel.

In order to solve this optimization problem, we can derive the equivalent KKT condition as follows:
\begin{equation}
0\in\underbrace{
\begin{bmatrix}
\partial f(x)\\
\partial h^*(s)
\end{bmatrix}}_{\mbox{operator}~\cA}+\underbrace{\begin{bmatrix}
0&A\\
-A&0
\end{bmatrix}\begin{bmatrix}
x\\
s
\end{bmatrix}}_{\mbox{operator}~\cB}
\end{equation}
}

%We consider the optimization problem that the decentralized gradient descent algorithm solves
%\begin{align}
%\Min_{x_1,\cdots,x_m\in R^d} F(x) := \sum_{i=1}^mf_i(x_i) + {1\over 2\gamma}x^\top(I-W)x,
%\end{align}
%where $W$ is a sparse, symmetric, and doubly stochastic matrix describing the network between $m$ agents. The optimality condition can be expressed as
%\begin{align}
%0 \in \underbrace{\begin{bmatrix}\partial f_1(x_1) \\ \cdots\\\partial f_m(x_m)\end{bmatrix}}_{\mbox{operator}~\cA} + \underbrace{{1\over \gamma}(I-W)x}_{\mbox{operator}~\cB},
%\end{align}