\usepackage{empheq}
\usepackage{cancel}
\begin{document}
\title{Talk 1: Distributed Optimization and Statistical Learning via ADMM (I)}
\date{2021-4-29}
\author{WEN Hao}
\maketitle
{\bfseries Main Resource: Chapter 7 of \cite{boyd2011distributed}}
\section{Recall of basic ADMM}
A general ADMM optimization problem is formulated as
\begin{align*}
& \text{minimize} \quad f(x) + g(z) \\
& \text{subject to} \quad Ax + Bz = c
\end{align*}
The augmented Lagrangian of this problem is given by
$$\mathcal{L}_{\rho}(x,z,y) = f(x) + g(z) + \langle y, Ax+Bz-c \rangle + \dfrac{\rho}{2}\lVert Ax+Bz-c \rVert^2.$$
The iterations are given by
\begin{align*}
x^{k+1} & = \argmin_{x} \left\{ \mathcal{L}_{\rho}(x,z^k,y^k) \right\} \\
z^{k+1} & = \argmin_{z} \left\{ \mathcal{L}_{\rho}(x^{k+1},z,y^k) \right\} \\
y^{k+1} & = y^{k} + \rho (Ax^{k+1}+Bz^{k+1}-c)
\end{align*}
Convergence of ADMM: under the conditions
\begin{itemize}
\item $f,g$ closed, proper convex;
\item $\mathcal{L}_{0}(x,z,y)$ has a saddle point,
\end{itemize}
as $k\rightarrow\infty$ one has
\begin{itemize}
\item feasibility: $Ax^k + Bz^k - c \rightarrow 0$
\item objective: $f(x^k) + g(z^k) \rightarrow p^*$
\item dual: $y^k \rightarrow y^*$
\end{itemize}
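As a concrete illustration (a minimal sketch, not taken from \cite{boyd2011distributed}), the iterations above can be instantiated for the lasso problem: minimize $\frac{1}{2}\lVert Dx-b \rVert^2 + \lambda \lVert z \rVert_1$ subject to $x-z=0$ (i.e. $A=I$, $B=-I$, $c=0$), written with the scaled dual variable $u = y/\rho$ used repeatedly below; the data $D, b$, the fixed iteration count and all names are hypothetical.
\begin{verbatim}
import numpy as np

def soft_threshold(v, kappa):
    """Elementwise soft-thresholding operator S_kappa(v)."""
    return np.sign(v) * np.maximum(np.abs(v) - kappa, 0.0)

def admm_lasso(D, b, lam, rho=1.0, n_iter=100):
    """Scaled-form ADMM sketch for
    minimize (1/2)||Dx - b||^2 + lam*||z||_1  s.t.  x - z = 0."""
    n = D.shape[1]
    x = np.zeros(n); z = np.zeros(n); u = np.zeros(n)  # u = y / rho
    M = D.T @ D + rho * np.eye(n)   # formed once, reused in every x-update
    Dtb = D.T @ b
    for _ in range(n_iter):
        x = np.linalg.solve(M, Dtb + rho * (z - u))   # x-update
        z = soft_threshold(x + u, lam / rho)          # z-update: prox of lam*||.||_1
        u = u + x - z                                 # scaled dual update
    return x, z
\end{verbatim}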
\section{Consensus Problem}
Assume we have a global variable $x \in \mathbb{R}^n$ and a ``split'' (or distributed) objective function
$$f(x) = \sum\limits_{i=1}^N f_i(x)$$
e.g. $x$ can be the (global) model parameters (DNN weights, biases, etc.), and $f_i$ the loss function associated with the $i$-th block (``client'') of data. The optimization problem is
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x)
\end{align*}
Problem: $f$ is NOT block-separable.
Solution: introduce local variables $x_i \in \mathbb{R}^n$ together with a common global variable $z \in \mathbb{R}^n$, so that the optimization problem is reformulated as (equivalently)
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) \\
& \text{subject to} \quad x_i - z = 0, \quad i=1,\cdots,N
\end{align*}
which is called a {\bfseries consensus problem}. One has the augmented Lagrangian
$$\mathcal{L}_{\rho}(x_1,\cdots,x_N,z,y) = \sum\limits_{i=1}^N \left[ f_i(x_i) + \langle y_i, x_i-z \rangle + \dfrac{\rho}{2} \lVert x_i-z \rVert^2 \right],$$
and ADMM iterations
\begin{align*}
\phantom{\leadsto} & \left( x = (x_1^T,\cdots,x_N^T)^T, x^{k+1} = \argmin_x \left\{ \sum\limits_{i=1}^N \left[ f_i(x_i) + \langle y^k_i, x_i-z^k \rangle + \dfrac{\rho}{2} \lVert x_i-z^k \rVert^2 \right] \right\} \right) \\
\leadsto \quad & x_i^{k+1} = \argmin_{x_i} \left\{ f_i(x_i) + \langle y^k_i, x_i-z^k \rangle + \dfrac{\rho}{2} \lVert x_i-z^k \rVert^2 \right\} \\
& z^{k+1} = \argmin_{z} \left\{ \sum\limits_{i=1}^N \left[ f_i(x_i^{k+1}) + \langle y^k_i, x_i^{k+1}-z \rangle + \dfrac{\rho}{2} \lVert x_i^{k+1}-z \rVert^2 \right] \right\} \\
& \phantom{z^{k+1}} = \argmin_{z} \left\{ \dfrac{N\rho}{2} \lVert z \rVert^2 - \langle z, \sum\limits_{i=1}^N (y_i^k+\rho x_i^{k+1}) \rangle + \cdots \right\} \\
& \phantom{z^{k+1}} = \dfrac{1}{N} \sum\limits_{i=1}^N \left( \dfrac{y_i^{k}}{\rho} + x_i^{k+1} \right) \\
& y_i^{k+1} = y_i^k + \rho (x_i^{k+1} - z^{k+1})
\end{align*}
Simplify notation by letting
$$
\begin{cases}
\overline{x}^k := \dfrac{1}{N} \sum\limits_{i=1}^N x_i^k \\
\overline{y}^k := \dfrac{1}{N} \sum\limits_{i=1}^N y_i^k \\
\end{cases}
$$
then one has the following observations
$$\displaystyle z^{k+1} = \dfrac{1}{N\rho} \sum\limits_{i=1}^N y_i^k + \dfrac{1}{N} \sum\limits_{i=1}^N x_i^{k+1} = \dfrac{\overline{y}^k}{\rho} + \overline{x}^{k+1}$$
and
\begin{align*}
& y^{k+1}_i = y_i^k + \rho(x_i^{k+1}-z^{k+1}) = y_i^k + \rho(x_i^{k+1}-\dfrac{\overline{y}^k}{\rho} - \overline{x}^{k+1}) \\
\Rightarrow \quad & \overline{y}^{k+1} = \overline{y}^{k} + \rho (\overline{x}^{k+1} - \dfrac{\overline{y}^k}{\rho} - \overline{x}^{k+1}) = 0 \\
\Rightarrow \quad & z^{k+1} = \dfrac{\overline{y}^{k}}{\rho} + \overline{x}^{k+1} = \overline{x}^{k+1} \quad \text{for } k \geqslant 1 \text{ (since } \overline{y}^{k} = 0 \text{)} \\
\end{align*}
Then one can rewrite the iterations as
\begin{align*}
x^{k+1}_i & = \argmin\limits_{x_i} \left\{ f_i(x_i) + \langle y_i^k, x_i-\overline{x}^k \rangle + \dfrac{\rho}{2}\lVert x_i-\overline{x}^k \rVert^2 \right\} \\
(z^{k+1} & = \overline{x}^{k+1}) \\
y_i^{k+1} & = y_i^k + \rho(x^{k+1}_i-\overline{x}^{k+1})
\end{align*}
This can be further simplified by setting $u_i = \dfrac{y_i}{\rho}$:
\begin{align*}
x^{k+1}_i & = \argmin\limits_{x_i} \left\{ f_i(x_i) + \dfrac{\rho}{2}\lVert x_i-\overline{x}^k + u_i^k \rVert^2 \right\} = \operatorname{prox}_{f_i,\rho}(\overline{x}^k - u_i^k) \\
(z^{k+1} & = \overline{x}^{k+1}) \\
u_i^{k+1} & = u_i^k + (x^{k+1}_i-\overline{x}^{k+1})
\end{align*}
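A minimal Python sketch of these simplified consensus iterations, assuming each local proximal operator $\operatorname{prox}_{f_i,\rho}$ is available as a callable (all names below are hypothetical):
\begin{verbatim}
import numpy as np

def consensus_admm(local_prox, n, n_iter=100):
    """Consensus ADMM sketch.
    local_prox[i](v) should return argmin_x { f_i(x) + (rho/2)*||x - v||^2 },
    i.e. prox_{f_i,rho}(v); rho is baked into the callables."""
    N = len(local_prox)
    x = np.zeros((N, n))        # local variables x_i
    u = np.zeros((N, n))        # scaled dual variables u_i = y_i / rho
    x_bar = np.zeros(n)         # consensus variable (plays the role of z)
    for _ in range(n_iter):
        for i in range(N):      # x-updates (one per "client", parallelizable)
            x[i] = local_prox[i](x_bar - u[i])
        x_bar = x.mean(axis=0)  # z-update reduces to averaging
        u += x - x_bar          # scaled dual updates
    return x_bar
\end{verbatim}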
{\bfseries Statistical Interpretation} of the ADMM iterations for the consensus problem: at iteration $k+1$, assume $x_i$ has prior distribution
$$x_i \sim N(\overline{x}^k-u_i^k, \rho^{-1} I_n)$$
or equivalently
$$p(x_i) = \det(2\pi\rho^{-1} I)^{-1/2} \exp\left(-\dfrac{1}{2} \lVert x_i - \overline{x}^k + u_i^k \rVert_{\rho I}^2 \right)$$
\begin{remark}
As stated in \cite{boyd2011distributed}, the bias $-u_i^k$ in the mean of the above normal distribution can be interpreted as the ``price'' of ``client'' $i$ disagreeing with the consensus $\overline{x}^k$ in the previous (the $k$-th) iteration. As for why the ``price'' is $-u_i^k$: each scaled dual update adds to $u_i$ the deviation of the current $x_i$-update from the consensus, so $u_i^k$ accumulates the deviations of the $x_i$-updates.
\end{remark}
Let
$$f_i(x_i) = \operatorname{NLL}(x_i) = -\log\operatorname{LH}(x_i)$$
be the negative log likelihood function\footnote{for a good visualization of NLL, ref. \href{https://ljvmiranda921.github.io/notebook/2017/08/13/softmax-and-the-negative-log-likelihood/}{https://ljvmiranda921.github.io/notebook/2017/08/13/softmax-and-the-negative-log-likelihood/}} of $x_i$ (w.r.t. the data (or observations) at the $i$-th ``client''). Then the max a posteriori estimates (MAP) of the parameters $x_i$ are
\begin{align*}
\operatorname{MAP}(x_i) & = \argmax_{x_i} \left\{ p(x_i) \cdot \operatorname{LH}(x_i) \right\} \\
& = \argmax_{x_i} \left\{ \exp(-f_i(x_i)) \cdot \det(2\pi\rho^{-1} I)^{-1/2} \cdot \exp\left( -\dfrac{\rho}{2} \lVert x_i - \overline{x}^k + u_i^k \rVert^2 \right) \right\} \\
& = \argmin_{x_i} \left\{ f_i(x_i) + \dfrac{\rho}{2} \lVert x_i - \overline{x}^k + u_i^k \rVert^2 \right\} = x_i^{k+1}
\end{align*}
i.e. the $(k+1)$-th update of $x_i$ is exactly the MAP estimate of $x_i$ under the given prior.
\begin{remark}
Most federated learning optimization algorithms fall into the paradigm of this basic consensus problem (with inexact inner minimization loops), including FedAvg \cite{mcmahan2017fed_avg}, FedOpt (FedAdam, FedAdagrad, \ldots) \cite{reddi2020fed_opt}, etc.
\end{remark}
\section{Consensus with Regularization}
Consider the problem
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \framebox{$g(z)$} \tikz[overlay, remember picture]{\node[] (cc_item) {}}
\tikz[overlay, remember picture]{\node[above right = 0.3cm and 1cm of cc_item.east] (cc_text) {regularization on consensus}}
\tikz[overlay, remember picture]{\path[->] ([yshift = -0.05cm]cc_text.west) edge ([yshift = 0.2cm]cc_item.east);} \\
& \text{subject to} \quad x_i-z = 0, \quad i=1,\cdots,N
\end{align*}
with regularization term $g(z)$ in the objective function.
The ADMM iterations:
\begin{align*}
x^{k+1}_i & = \argmin\limits_{x_i} \left\{ f_i(x_i) + \langle y_i^k, x_i-z^k \rangle + \dfrac{\rho}{2}\lVert x_i-z^k \rVert^2 \right\} \\
z^{k+1} & = \argmin\limits_{z} \left\{ g(z) + \sum\limits_{i=1}^N \left( \langle y_i^k, x_i^{k+1}-z \rangle + \dfrac{\rho}{2}\lVert x_i^{k+1}-z \rVert^2 \right) \right\} \tikz[overlay, remember picture]{\node[] (update_z) {}} \\
y_i^{k+1} & = y_i^k + \rho (x^{k+1}_i-z^{k+1})
\end{align*}
\tikz[overlay, remember picture]{\node[below left = 1cm and -2cm of update_z.east] (update_z_text) {has no analytic expression in general}}
\tikz[overlay, remember picture]{\path[->] ([yshift = 0.4cm, xshift = -1cm]update_z_text.east) edge ([yshift = -0.2cm]update_z.east);}
One similarly has the following reductions by letting $u_i = \dfrac{y_i}{\rho}$:
\begin{align*}
\tikz[overlay, remember picture]{\node[] (simp_z) {}} z^{k+1} & = \argmin_z \left\{ g(z) + \sum\limits_{i=1}^N \left( \dfrac{\rho}{2}\lVert z \rVert^2 - \langle \rho x_i^{k+1}+y_i^k, z \rangle + \cdots \right) \right\} \\
& = \argmin_z \left\{ g(z) + N \left( \dfrac{\rho}{2}\lVert z \rVert^2 - \langle \rho \overline{x}^{k+1}+\overline{y}^k, z \rangle + \cdots \right) \right\} \\
& = \argmin_z \left\{ g(z) + \dfrac{N\rho}{2} \lVert z - \overline{x}^{k+1} - \dfrac{\overline{y}^k}{\rho} \rVert^2 \right\} \\
& = \argmin_z \left\{ g(z) + \dfrac{N\rho}{2} \lVert z - \overline{x}^{k+1} - \overline{u}^k \rVert^2 \right\} = \operatorname{prox}_{g,N\rho}(\overline{x}^{k+1} + \overline{u}^k) \\
\tikz[overlay, remember picture]{\node[] (simp_x) {}} x_i^{k+1} & = \argmin_{x_i} \left\{ f_i(x_i) + \dfrac{\rho}{2} \lVert x_i - z^k + u_i^k \rVert^2 \right\} = \operatorname{prox}_{f_i,\rho}(z^k - u_i^k) \\
u_i^{k+1} & = u_i^k + (x_i^{k+1} - z^{k+1})
\end{align*}
\tikz[overlay, remember picture]{\path[->] ([yshift = 0.1cm]simp_x.west) edge[bend left] ([yshift = 0.6cm, xshift = 0.3cm]simp_z.north)}
\begin{eg}\
\begin{itemize}
\item[(1)] $g(z) = \lambda \lVert z \rVert_1, \lambda > 0,$ then
\begin{align*}
z^{k+1} & = \argmin_z \left\{ \lambda \lVert z \rVert_1 + \dfrac{N\rho}{2} \lVert z - \overline{x}^{k+1} - \overline{u}^k \rVert^2 \right\} \\
& = \operatorname{S}_{\lambda/(N\rho)} (\overline{x}^{k+1} + \overline{u}^{k}) \quad \leftarrow \quad \textbf{soft thresholding}
\end{align*}
\item[(2)] $g(z) = I_{\mathbb{R}^n_+}(z)$ the indicator function of $\mathbb{R}^n_+$, then
\begin{align*}
z^{k+1} & = \argmin_z \left\{ I_{\mathbb{R}^n_+}(z) + \dfrac{N\rho}{2} \lVert z - \overline{x}^{k+1} - \overline{u}^k \rVert^2 \right\} \\
& = (\overline{x}^{k+1} + \overline{u}^{k})_+
\end{align*}
\end{itemize}
\end{eg}
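For instance, example (1) gives the following sketch of the $z$-update (a hypothetical snippet; \texttt{soft\_threshold} implements $\operatorname{S}_{\kappa}$ elementwise):
\begin{verbatim}
import numpy as np

def soft_threshold(v, kappa):
    """Elementwise soft-thresholding S_kappa."""
    return np.sign(v) * np.maximum(np.abs(v) - kappa, 0.0)

def z_update_l1(x_new, u, lam, rho):
    """z-update for g(z) = lam*||z||_1 in regularized consensus:
    z^{k+1} = S_{lam/(N*rho)}(x_bar^{k+1} + u_bar^k).
    x_new, u: arrays of shape (N, n) holding x_i^{k+1} and u_i^k."""
    N = x_new.shape[0]
    return soft_threshold(x_new.mean(axis=0) + u.mean(axis=0), lam / (N * rho))
\end{verbatim}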
% \begin{remark}
% A recent paper \cite{hanzely2020federated} (although rejected by CVPR2021) considered new federated learning problems formulated as consensus with regularization.
% \end{remark}
\section{General Form Consensus}
Now consider an even more general setting:
\begin{align*}
& x_i \in \mathbb{R}^{n_i}, z \in \mathbb{R}^n, \\
& \text{$x_i$ consists of a selection of components of $z$,} \\
& \text{i.e. there is an index mapping } \mathcal{G} \text{ s.t. } (x_i)_j = z_{\mathcal{G}(i,j)}, \ \forall i \in [1,N], \forall j \in [1,n_i]
\end{align*}
This general setting is of interest in cases where $n_i \ll n$, e.g. a large global model with small local models (each local model involves only the small part of the global parameters related to its local data, roughly corresponding to a vertical split of the data).
Let $\widetilde{z}_i \in \mathbb{R}^{n_i}$ be s.t. $(\widetilde{z}_i)_j = z_{\mathcal{G}(i,j)}$, then the general form consensus problem is formulated as
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) \\
& \text{subject to} \quad x_i - \widetilde{z}_i = 0
\end{align*}
The augmented Lagrangian:
$$\mathcal{L}_{\rho} = \sum\limits_{i=1}^N \left( f_i(x_i) + \langle y_i, x_i-\widetilde{z}_i \rangle + \dfrac{\rho}{2} \lVert x_i - \widetilde{z}_i \rVert^2 \right)$$
ADMM iterations:
\begin{align*}
& x_i^{k+1} = \argmin_{x_i} \left\{ f_i(x_i) + \langle y_i^k, x_i \rangle + \dfrac{\rho}{2}\lVert x_i-\widetilde{z}_i^k \rVert^2 \right\} \\
& z^{k+1} = \argmin_{z} \left\{ \sum\limits_{i=1}^N \left( -\langle y_i^k, \widetilde{z}_i \rangle + \dfrac{\rho}{2}\lVert x_i^{k+1}-\widetilde{z}_i \rVert^2 \right) \right\} \\
& y_i^{k+1} = y_i^k + \rho (x_i^{k+1} - \widetilde{z}_i^{k+1})
\end{align*}
Rewrite
\begin{align*}
z^{k+1} & = \argmin_{z} \left\{ \sum\limits_{i=1}^N \left( \dfrac{\rho}{2} \lVert \widetilde{z}_i \rVert^2 - \rho \langle x_i^{k+1} + \dfrac{1}{\rho} y_i^k, \widetilde{z}_i \rangle + \cdots \right) \right\} \\
& = \argmin_{z} \left\{ \sum\limits_{i=1}^N \dfrac{\rho}{2} \lVert \widetilde{z}_i - x_i^{k+1} -\dfrac{1}{\rho}y_i^k \rVert^2 \right\} \\
& = \argmin_{z} \left\{ \sum\limits_{i=1}^N \sum\limits_{j=1}^{n_i} \left( (\widetilde{z}_i)_j - (x_i^{k+1})_j - \dfrac{1}{\rho} (y_i^k)_j \right)^2 \right\} \\
& \qquad ( \text{since } \sum\limits_{i=1}^N \sum\limits_{j=1}^{n_i} = \sum\limits_{g=1}^n \sum\limits_{\mathcal{G}(i,j)=g} ) \\
& = \argmin_{z} \left\{ \sum\limits_{g=1}^n \left[ \sum\limits_{\mathcal{G}(i,j)=g} \left( z_g - (x_i^{k+1})_j - \dfrac{1}{\rho} (y_i^k)_j \right)^2 \right] \right\} \\
& \qquad ( \text{write } k_g = \# \{ (i,j) | \mathcal{G}(i,j) = g \} ) \\
& = \argmin_z \left\{ \sum\limits_{g=1}^n \left[ k_g\cdot z_g^2 - 2 z_g \sum\limits_{\mathcal{G}(i,j)=g} \left( (x_i^{k+1})_j + \dfrac{1}{\rho} (y_i^k)_j \right) + \cdots \right] \right\} \\
\Rightarrow & \quad z_g^{k+1} = \dfrac{1}{k_g} \sum\limits_{\mathcal{G}(i,j)=g} \left( (x_i^{k+1})_j + \dfrac{1}{\rho} (y_i^k)_j \right) \leftarrow \text{local average}
\end{align*}
For the dual $y$-update, locally one has
\begin{align*}
\sum\limits_{\mathcal{G}(i,j)=g} (y_i^{k+1})_j & = \sum\limits_{\mathcal{G}(i,j)=g} (y_i^k)_j + \rho \left( \sum\limits_{\mathcal{G}(i,j)=g} (x_i^{k+1})_j - \sum\limits_{\mathcal{G}(i,j)=g} (\widetilde{z}_i^{k+1})_j \right) \\
& = \sum\limits_{\mathcal{G}(i,j)=g} (y_i^k)_j + \rho \left( \sum\limits_{\mathcal{G}(i,j)=g} (x_i^{k+1})_j - k_g\cdot z_g^{k+1} \right) \\
& = \sum\limits_{\mathcal{G}(i,j)=g} (y_i^k)_j + \rho \left( \sum\limits_{\mathcal{G}(i,j)=g} (x_i^{k+1})_j - \sum\limits_{\mathcal{G}(i,j)=g} \left( (x_i^{k+1})_j + \dfrac{1}{\rho} (y_i^k)_j \right) \right) \\
& = 0 \\
\Rightarrow & \quad z_g^{k+1} = \dfrac{1}{k_g} \sum\limits_{\mathcal{G}(i,j)=g} (x_i^{k+1})_j \quad \text{for } k \geqslant 1 \text{ (the local dual sums vanish after the first iteration)}
\end{align*}
Hence the iterations simplify to
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ f_i(x_i) + \langle y_i^k,x_i \rangle + \dfrac{\rho}{2} \lVert x_i-\widetilde{z}_i^k \rVert^2 \right\} \\
& = \argmin_{x_i} \left\{ f_i(x_i) + \dfrac{\rho}{2} \lVert x_i - \widetilde{z}_i^k + u_i^k \rVert^2 \right\} = \operatorname{prox}_{f_i,\rho} (\widetilde{z}_i^k - u_i^k) \\
z_g^{k+1} & = \dfrac{1}{k_g} \sum\limits_{\mathcal{G}(i,j)=g} (x_i^{k+1})_j \\
u_i^{k+1} & = u_i^k + (x_i^{k+1} - \widetilde{z}^{k+1}_i)
\end{align*}
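A sketch of the $z$-update by local averaging over the index map $\mathcal{G}$ (the nested list \texttt{G}, with \texttt{G[i][j]} $= \mathcal{G}(i,j)$, and the function name are hypothetical):
\begin{verbatim}
import numpy as np

def z_update_local_average(x_new, G, n):
    """General form consensus z-update:
    z_g = average of (x_i^{k+1})_j over all (i, j) with G(i, j) = g.
    x_new: list of local vectors x_i^{k+1} (possibly of different lengths n_i)."""
    z = np.zeros(n)
    counts = np.zeros(n)              # counts[g] = k_g
    for i, xi in enumerate(x_new):
        for j, val in enumerate(xi):
            g = G[i][j]
            z[g] += val
            counts[g] += 1
    return z / np.maximum(counts, 1)  # guard against components no x_i maps to
\end{verbatim}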
\section{General Form Consensus with Regularization}
General form consensus + consensus with regularization:
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \framebox{$g(z)$} \\
& \text{subject to} \quad x_i - \widetilde{z}_i = 0
\end{align*}
ADMM iterations:
\begin{align*}
x_i^{k+1} & = \argmin_{x_i} \left\{ f_i(x_i) + \langle y_i^k, x_i-\widetilde{z}_i^k \rangle + \dfrac{\rho}{2}\lVert x_i-\widetilde{z}_i^k \rVert^2 \right\} \\
& = \operatorname{prox}_{f_i,\rho} (\widetilde{z}_i^{k} - u_i^{k}) \\
z^{k+1} & = \argmin_z \left\{ g(z) + \sum\limits_{i=1}^N \left( -\langle y_i^k, \widetilde{z}_i \rangle + \dfrac{\rho}{2}\lVert x_i^{k+1}-\widetilde{z}_i \rVert^2 \right) \right\} \\
& = \operatorname{prox}_{g,k_g\rho} (v_{\mathcal{G}}) \quad \text{(understood coordinatewise, with parameter $k_g\rho$ for coordinate $g$)} \\
& \text{where } v_{\mathcal{G}} = (v_1,\cdots,v_n)^T, \text{ s.t. } v_g = \dfrac{1}{k_g} \sum\limits_{\mathcal{G}(i,j)=g} \left( (x_i^{k+1})_j + (u_i^k)_j \right) \\
u^{k+1}_i & = u_i^k + (x_i^{k+1} - \widetilde{z}^{k+1}_i)
\end{align*}
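As an illustration (an assumption on $g$, not a formula spelled out in \cite{boyd2011distributed} in this form): for $g(z) = \lambda\lVert z \rVert_1$ the $z$-update separates coordinatewise into $z_g^{k+1} = \operatorname{S}_{\lambda/(k_g\rho)}(v_g)$. A hypothetical sketch:
\begin{verbatim}
import numpy as np

def z_update_general_l1(x_new, u, G, n, lam, rho):
    """z-update for general form consensus with g(z) = lam*||z||_1.
    Each coordinate g solves min_z { lam*|z| + (k_g*rho/2)*(z - v_g)^2 },
    i.e. z_g = S_{lam/(k_g*rho)}(v_g), with v_g the local average of
    (x_i^{k+1})_j + (u_i^k)_j over all (i, j) with G(i, j) = g."""
    v = np.zeros(n)
    counts = np.zeros(n)
    for i in range(len(x_new)):
        for j in range(len(x_new[i])):
            g = G[i][j]
            v[g] += x_new[i][j] + u[i][j]
            counts[g] += 1
    k_g = np.maximum(counts, 1)
    v = v / k_g
    thresh = lam / (k_g * rho)
    return np.sign(v) * np.maximum(np.abs(v) - thresh, 0.0)
\end{verbatim}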
\section{Sharing Problem}
A sharing problem is an optimization problem formulated as
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + g(\sum\limits_{i=1}^N x_i) \\
& \text{where $f_i$: local cost} \\
& \text{\phantom{where }$g$: shared cost}
\end{align*}
\begin{remark}
A sharing problem is dual to a consensus problem.
\end{remark}
Indeed, rewrite a sharing problem in the ADMM form
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + g(\sum\limits_{i=1}^N z_i) \\
& \text{subject to} \quad x_i - z_i = 0
\end{align*}
Its dual function is
\begin{align*}
\Gamma(v_1,\cdots,v_N) & = \inf\limits_{x,z} \left\{ \sum\limits_{i=1}^N f_i(x_i) + g(\sum\limits_{i=1}^N z_i) + \sum\limits_{i=1}^N \langle v_i, x_i-z_i \rangle \right\} \\
& = \inf\limits_{x} \left\{ \sum\limits_{i=1}^N \left( f_i(x_i) + \langle v_i, x_i \rangle \right) \right\} + \inf\limits_{z} \left\{ g(\sum\limits_{i=1}^N z_i) - \sum\limits_{i=1}^N \langle v_i, z_i \rangle \right\} \\
& = -\sup\limits_{x} \left\{ \sum\limits_{i=1}^N \left( -f_i(x_i) + \langle -v_i, x_i \rangle \right) \right\} + \inf\limits_{z} \left\{ g(\sum\limits_{i=1}^N z_i) - \sum\limits_{i=1}^N \langle v_i, z_i \rangle \right\} \\
& = -\sum\limits_{i=1}^N f_i^*(-v_i) + \framebox{$\inf\limits_{z} \left\{ g(\sum\limits_{i=1}^N z_i) - \sum\limits_{i=1}^N \langle v_i, z_i \rangle \right\}$} \leftarrow \textcircled{$\star$}
\end{align*}
For \textcircled{$\star$}, suppose $v_s\neq v_t$ for some $s,t$, and suppose \textcircled{$\star$}$\,>-\infty$, (nearly) attained at some $\{z_i^*\}_{i=1}^N$. Let $\{\widetilde{z}_i\}_{i=1}^N$ be s.t. $\widetilde{z}_i = z_i^*$ for $i\neq s,t$, $\widetilde{z}_s = z_s^* + w, \widetilde{z}_t = z_t^* - w, w\neq 0$; then one has
\begin{align*}
& g(\sum\limits_{i=1}^N \widetilde{z}_i) - \sum\limits_{i=1}^N \langle v_i, \widetilde{z}_i \rangle \\
= & g(\sum\limits_{i=1}^N z_i^*) - \sum\limits_{i=1}^N \langle v_i, z_i^* \rangle + \langle w, -v_s+v_t \rangle
\end{align*}
One can always choose $w$ so that $\langle w, -v_s+v_t \rangle < 0$; rescaling $w$ then drives the objective below any bound, contradicting \textcircled{$\star$}$\,>-\infty$. Hence
\begin{align*}
\textcircled{$\star$} & = \begin{cases}
\inf\limits_{z} \left\{ g(\sum\limits_{i=1}^N z_i) - \langle v_1, \sum\limits_{i=1}^N z_i \rangle \right\}, & v_1 = \cdots = v_N \\
-\infty, & \text{otherwise}
\end{cases} \\
& = \begin{cases}
-g^*(v_1), & v_1 = \cdots = v_N \\
-\infty, & \text{otherwise}
\end{cases}
\end{align*}
i.e. the dual function is
$$
\Gamma(v_1,\cdots,v_N) = \begin{cases}
-g^*(v_1) - \sum\limits_{i=1}^N f_i^*(-v_i), & v_1 = \cdots = v_N \\
-\infty, & \text{otherwise}
\end{cases}
$$
and the dual problem is
\begin{align*}
& \text{minimize} \quad g^*(v) + \sum\limits_{i=1}^N f_i^*(-v_i) \\
& \text{subject to} \quad v_i = v
\end{align*}
a consensus problem with regularization.
One can show that the dual of this consensus problem is the original sharing problem.
ADMM iterations for sharing problem:
\begin{align*}
& x_i^{k+1} = \argmin_{x_i} \left\{ f_i(x_i) + \dfrac{\rho}{2} \lVert x_i - z_i^k + u_i^k \rVert^2 \right\} \\
& \framebox{$z^{k+1} = \argmin\limits_z \left\{ g(\sum\limits_{i=1}^N z_i) + \dbox{$\dfrac{\rho}{2} \sum\limits_{i=1}^N \lVert z_i - x_i^{k+1} - u_i^k \rVert^2$} \right\}$} \tikz[overlay, remember picture]{\node[] (sharing_z) {}} \quad z = \begin{pmatrix} z_1 \\ \vdots \\ z_N \end{pmatrix} \\
& u_i^{k+1} = u_i^k + (x_i^{k+1}-z_i^{k+1})
\end{align*}
\tikz[overlay, remember picture]{\node[below right = 0.8cm and -2cm of sharing_z] (sharing_z_text) {\# variables can be reduced from $Nn$ to $n$}}
\tikz[overlay, remember picture]{\path[->] ([xshift=-1cm]sharing_z_text.north) edge (sharing_z.south)}
Write $a_i = u_i^k + x_i^{k+1}, \overline{z} = \dfrac{1}{N} \sum\limits_{i=1}^N z_i$, then the $(k+1)$-th $z$-update is formulated as (equivalent to)
\begin{align*}
& \text{minimize} \quad g(N\overline{z}) + \dfrac{\rho}{2} \sum\limits_{i=1}^N \lVert z_i - a_i \rVert^2 \\
& \text{subject to} \quad N\overline{z} - \sum\limits_{i=1}^N z_i = 0
\end{align*}
Since
$$\dfrac{\rho}{2}\sum\limits_{i=1}^N \lVert z_i - a_i \rVert^2 \geqslant \dfrac{\rho}{2} \dfrac{\lVert \sum\limits_{i=1}^N(z_i - a_i) \rVert^2}{N} = \dfrac{N\rho}{2} \lVert \overline{z} - \overline{a} \rVert^2$$
where equality holds if and only if all the $z_i - a_i$ are equal, i.e. $z_i = a_i + \overline{z} - \overline{a}$; hence at the optimum
$$z_i^{k+1} = u_i^{k} + x_i^{k+1} + \overline{z}^{k+1} - \overline{u}^{k} - \overline{x}^{k+1}$$
Hence the constrained optimization problem of $z$-update is equivalent to the following unconstrained problem
$$\text{minimize} \quad \quad g(N\overline{z}) + \dfrac{N\rho}{2} \lVert \overline{z} - \overline{a} \rVert^2$$
Another consequence is that all the scaled dual variables become equal:
$$u_1^{k+1} = \cdots = u_N^{k+1} = \overline{u}^k + \overline{x}^{k+1} - \overline{z}^{k+1} =: u^{k+1}$$
and further (since $u_i^{k} = \overline{u}^{k}$ for $k \geqslant 1$)
$$z_i^{k+1} = \framebox{$\cancel{u_i^{k}}$} + x_i^{k+1} + \overline{z}^{k+1} - \framebox{$\cancel{\overline{u}^{k}}$} - \overline{x}^{k+1} = x_i^{k+1} + \overline{z}^{k+1} - \overline{x}^{k+1}$$
The ADMM iterations for the whole equivalent optimization problem:
\begin{align*}
& x_i^{k+1} = \argmin_{x_i} \left\{ f_i(x_i) + \dfrac{\rho}{2} \lVert x_i - x_i^k - \overline{z}^k + \overline{x}^k + u^k \rVert^2 \right\} = \operatorname{prox}_{f_i,\rho}(x_i^k + \overline{z}^k - \overline{x}^k - u^k) \\
& \overline{z}^{k+1} = \argmin\limits_{\overline{z}} \left\{ g(N\overline{z}) + \dfrac{N\rho}{2} \lVert \overline{z} - \overline{x}^{k+1} - u^k \rVert^2 \right\} = \operatorname{prox}_{\widetilde{g},N\rho}(\overline{x}^{k+1} + u^k) \\
& u^{k+1} = u^k + (\overline{x}^{k+1}-\overline{z}^{k+1})
\end{align*}
where $\widetilde{g}(\overline{z}) = g(N\overline{z})$.
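A minimal Python sketch of these simplified sharing iterations, assuming callables for $\operatorname{prox}_{f_i,\rho}$ and $\operatorname{prox}_{\widetilde{g},N\rho}$ (all names below are hypothetical):
\begin{verbatim}
import numpy as np

def sharing_admm(local_prox, shared_prox, n, n_iter=100):
    """Sharing-problem ADMM sketch.
    local_prox[i](v) = argmin_x { f_i(x) + (rho/2)*||x - v||^2 }
    shared_prox(v)   = argmin_w { g(N*w) + (N*rho/2)*||w - v||^2 }  (prox of g~)."""
    N = len(local_prox)
    x = np.zeros((N, n))
    z_bar = np.zeros(n)        # average of the z_i
    u = np.zeros(n)            # single shared scaled dual variable
    for _ in range(n_iter):
        x_bar_old = x.mean(axis=0)
        for i in range(N):     # x-updates: prox_{f_i,rho}(x_i^k + zbar^k - xbar^k - u^k)
            x[i] = local_prox[i](x[i] + z_bar - x_bar_old - u)
        x_bar = x.mean(axis=0)
        z_bar = shared_prox(x_bar + u)   # zbar-update
        u = u + x_bar - z_bar            # dual update
    return x
\end{verbatim}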
\section*{Problems NOT discussed (and difficult)}
\begin{itemize}
\item convergence (rate) analysis of the optimization problems, e.g. \cite{li2019convergence}
\item ``infeasible'' problems, e.g. fully decentralized settings with no ``central collector'', e.g. \cite{elgabli2020gadmm,issaid2020cq-ggadmm,francca2020distributed}, or ``weak'' consensus \cite{hanzely2020federated}
\item new ADMM developments, e.g. \cite{bai2021sas-admm}
\item etc.
\end{itemize}
For example, in \cite{hanzely2020federated}, the authors considered a ``weak'' consensus problem
$$\text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i - \overline{x} \rVert^2$$
which can be reformulated as the constrained optimization problem
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i - z \rVert^2 \\
& \text{subject to} \quad Nz - \sum\limits_{i=1}^N x_i = 0
\end{align*}
or
\begin{align*}
& \text{minimize} \quad \sum\limits_{i=1}^N f_i(x_i) + \dfrac{\lambda}{2} \sum\limits_{i=1}^N \lVert x_i \rVert^2 -\dfrac{\lambda N}{2} \lVert z \rVert^2 \\
& \text{subject to} \quad Nz - \sum\limits_{i=1}^N x_i = 0
\end{align*}
which is a nonconvex sharing problem considered in \cite{hong2016convergence} (Eq. (3.2)). Under certain assumptions, this latter problem is a DC (difference-of-convex) programming problem. Note the difference from a standard consensus problem with a proximal term, e.g. as in \cite{sahu2018fedprox}.
\bibliographystyle{ieeetr}
\bibliography{references}
\end{document}