Verified_Numerical_Methods/Perceptron.thy at main · isabelle-utp/Verified_Numerical_Methods · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
section \<open> Perceptron Convergence \<close>

theory Perceptron
  imports
    "ITree_Numeric_VCG.ITree_Numeric_VCG"
begin

alphabet 'n state =
   theta                   :: "real vec['n]"
   bias                    :: real
   updates                 :: nat
   x_i                     :: "real vec['n]"
   y_i                     :: real
   i                       :: nat
   changed                 :: bool

instantiation state_ext :: (finite, default) default
begin

definition default_state_ext :: "('a,'b) state_scheme"
  where
  "default_state_ext =
     \<lparr> theta\<^sub>v   = 0,
       bias\<^sub>v    = 0,
       updates\<^sub>v = 0,
       x_i\<^sub>v     = 0,
       y_i\<^sub>v     = 0,
       i\<^sub>v       = 0,
       changed\<^sub>v = False,
       \<dots>         = default \<rparr>"

instance ..

end

program pass_through "data :: ((real vec['n]) \<times> real) list" over "('n :: finite) state"
 = "changed := False;
    i := 0;
    while i < length data
    do
      x_i := fst (data ! i);
      y_i := snd (data ! i);
      if y_i * (x_i \<bullet> theta + bias) \<le> 0 then
        theta := theta + y_i *\<^sub>R x_i;
        bias := bias + y_i;
        updates := updates + 1;
        changed := True
      fi;
      i := i + 1
    od"

program perceptron "data :: ((real vec['n]) \<times> real) list" over "('n :: finite) state"
 = "theta := 0;
    bias := 0;
    changed := True;
    updates := 0;
    while changed
    do pass_through data od"

execute "perceptron
  [(Vector[-1::real, -1], -1::real),
    (Vector[-1,        1], -1),
    (Vector[ 1,       -1], -1),
    (Vector[ 1,        1],  1)]"

program pass_through_aux "(
    data :: ((real vec['n]) \<times> real) list,
    \<theta> :: real vec['n],
    b :: real,
    \<gamma> :: real,
    R :: real
  )" over "('n :: finite) state"
 = "changed := False;
    i := 0;
    while i < length data
    invariant (
      i \<le> length data
      \<and> updates * \<gamma> \<le> \<theta> \<bullet> theta + b * bias
      \<and> \<parallel>theta\<parallel>\<^sup>2 + bias\<^sup>2 \<le> updates * R\<^sup>2
      \<and> (changed \<longrightarrow> old[updates] < updates)
      \<and> (\<not> changed \<longrightarrow> updates = old[updates])
      \<and> (\<not> changed \<longrightarrow> (\<forall>j < i. (snd (data ! j)) * ((fst (data ! j)) \<bullet> theta + bias) > 0))
    )
    variant (length data - i)
    do
      x_i := fst (data ! i);
      y_i := snd (data ! i);
      if y_i * (x_i \<bullet> theta + bias) \<le> 0 then
        theta := theta + y_i *\<^sub>R x_i;
        bias := bias + y_i;
        updates := updates + 1;
        changed := True
      fi;
      i := i + 1
    od"

program perceptron_aux "(
    data :: ((real vec['n]) \<times> real) list,
    \<theta> :: real vec['n],
    b :: real,
    \<gamma> :: real,
    R :: real
  )" over "('n :: finite) state"
  = "theta := 0;
    bias := 0;
    changed := True;
    updates := 0;
    while changed
    invariant (
      updates * \<gamma> \<le> \<theta> \<bullet> theta + b * bias
      \<and> \<parallel>theta\<parallel>\<^sup>2 + bias\<^sup>2 \<le> updates * R\<^sup>2
      \<and> updates \<le> nat \<lceil>(R / \<gamma>)\<^sup>2\<rceil>
      \<and> (\<not> changed \<longrightarrow> (\<forall>j < length data. (snd (data ! j)) * ((fst (data ! j)) \<bullet> theta + bias) > 0))
    )
    variant (nat \<lceil>(R / \<gamma>)\<^sup>2\<rceil> - updates + (if changed then 1 else 0))
    do pass_through_aux (data, \<theta>, b, \<gamma>, R) od"

lemma perceptron_aux_is_perceptron:
  "perceptron_aux(data, R, \<gamma>, \<theta>, b) = perceptron(data)"
  by (simp add: perceptron_aux_def perceptron_def
                pass_through_def pass_through_aux_def while_inv_var_def)

theorem perceptron_aux_convergence:
  fixes data :: "((real vec['n]) \<times> real) list"
    and R \<gamma> b :: real
    and \<theta> :: "real vec['n]"
  assumes bounded:  "\<And>t. t < length data \<Longrightarrow>\<parallel>fst (data ! t)\<parallel>\<^sup>2 + 1 \<le> R\<^sup>2"
    and labels:     "\<And>t. t < length data \<Longrightarrow> snd (data ! t) \<in> {-1, 1}"
    and sep_norm:   "\<parallel>\<theta>\<parallel>\<^sup>2 + b\<^sup>2 = 1"
    and sep_margin: "\<forall>t < length data. snd (data ! t) *\<^sub>R ((fst (data ! t)) \<bullet> \<theta> + b) \<ge> \<gamma>"
    and \<gamma>pos:       "\<gamma> > 0"
  shows "H[True] perceptron_aux (data, \<theta>, b, \<gamma>, R)
    [updates \<le> (R/\<gamma>)^2 \<and>
    (\<forall>i < length data. sgn (fst (data ! i) \<bullet> theta + bias) = snd (data ! i))]"
proof(vcg)
  fix theta bias updates t
  assume hyp: "snd (data ! t) * (fst (data ! t) \<bullet> theta + bias) \<le> 0"
    and inv: "(real updates) * \<gamma> \<le> \<theta> \<bullet> theta + b * bias"
    and "t < length data"
  let ?theta_upd = "snd (data ! t) *\<^sub>R fst (data ! t)"
    and ?bias_upd = "snd (data ! t)"
  let ?theta' = "theta + ?theta_upd"
    and ?bias' = "bias + ?bias_upd"
  have "\<theta> \<bullet> ?theta' + b * ?bias' = \<theta> \<bullet> theta + \<theta> \<bullet> ?theta_upd + b * bias + b * ?bias_upd"
    using cross3_simps(35)
    by (smt (verit, ccfv_SIG) inner_real_def insert_iff real_inner_1_right singletonD)
  also have "... = \<theta> \<bullet> theta + b * bias + \<theta> \<bullet> ?theta_upd + b * ?bias_upd"
    by (clarsimp simp: field_simps)
  also have "... = \<theta> \<bullet> theta + b * bias + snd (data ! t) *\<^sub>R ((fst (data ! t)) \<bullet> \<theta> + b)"
    using labels[OF \<open>t < length data\<close>] by (simp add: cross3_simps(24,7) inner_commute)
  also have "... \<ge> \<theta> \<bullet> theta + b * bias + \<gamma>"
    using sep_margin[rule_format, OF \<open>t < length data\<close>] by linarith
  also have "\<theta> \<bullet> theta + b * bias + \<gamma> \<ge> (real_of_int updates) * \<gamma> + \<gamma>"
    using inv by linarith
  finally show "(1 + real updates) * \<gamma> \<le> \<theta> \<bullet> ?theta' + b * ?bias'"
    by (simp add: cross3_simps(11,24))
  then show "(1 + real updates) * \<gamma> \<le> \<theta> \<bullet> ?theta' + b * ?bias'".
  then show "(1 + real updates) * \<gamma> \<le> \<theta> \<bullet> ?theta' + b * ?bias'".
next
  fix theta bias updates t
  assume check: "snd (data ! t) * (fst (data ! t) \<bullet> theta + bias) \<le> 0"
    and inv: "\<parallel>theta\<parallel>\<^sup>2 + bias\<^sup>2 \<le> (real updates) * R\<^sup>2"
    and "t < length data"
  let ?x_i = "fst (data ! t)"
    and ?y_i = "snd (data ! t)"
  let ?theta' = "theta + ?y_i *\<^sub>R ?x_i"
    and ?bias' = "bias + ?y_i"
  have "\<parallel>?theta'\<parallel>\<^sup>2 + ?bias'\<^sup>2 =
        \<parallel>theta\<parallel>\<^sup>2 + \<parallel>?y_i *\<^sub>R ?x_i\<parallel>\<^sup>2 + 2 * (theta \<bullet> (?y_i *\<^sub>R ?x_i)) + bias\<^sup>2 + ?y_i\<^sup>2 + 2 * bias * ?y_i"
    using dot_norm[of theta "?y_i *\<^sub>R ?x_i"] power2_sum[of bias ?y_i] by (clarsimp simp: field_simps)
  also have "... = \<parallel>theta\<parallel>\<^sup>2 + \<parallel>?y_i *\<^sub>R ?x_i\<parallel>\<^sup>2 + 2 * (?y_i *\<^sub>R (?x_i \<bullet> theta))
    + bias\<^sup>2 + ?y_i\<^sup>2 + 2 * bias * ?y_i"
    using inner_commute labels[OF \<open>t < length data\<close>]
    by (smt (verit, ccfv_SIG) inner_scaleR_right real_scaleR_def)
  also have "... \<le> \<parallel>theta\<parallel>\<^sup>2 + R\<^sup>2 + bias\<^sup>2"
    using check labels[OF \<open>t < length data\<close>] bounded[OF \<open>t < length data\<close>] by fastforce
  also have "... \<le> (real updates) * R\<^sup>2 + R\<^sup>2"
    using inv by linarith
  finally show "\<parallel>?theta'\<parallel>\<^sup>2 + ?bias'\<^sup>2 \<le> (1 + real updates) * R\<^sup>2"
    by argo
  then show "\<parallel>?theta'\<parallel>\<^sup>2 + ?bias'\<^sup>2 \<le> (1 + real updates) * R\<^sup>2".
  then show "\<parallel>?theta'\<parallel>\<^sup>2 + ?bias'\<^sup>2 \<le> (1 + real updates) * R\<^sup>2".
next
  fix theta bias updates t changed i
  assume "\<not> snd (data ! t) * (fst (data ! t) \<bullet> theta + bias) \<le> 0"
   and "\<forall>i<t. 0 < snd (data ! i) * (fst (data ! i) \<bullet> theta + bias)"
   and "i < Suc t"
  thus "0 < snd (data ! i) * (fst (data ! i) \<bullet> theta + bias)"
    using less_Suc_eq by fastforce
next
  fix theta bias i
  assume "\<forall>i<length data. 0 < snd (data ! i) * (fst (data ! i) \<bullet> theta + bias)"
   and "i < length data"
  thus "sgn (fst (data ! i) \<bullet> theta + bias) = snd (data ! i)"
    using labels by fastforce
next
  fix theta bias updates
  assume hyp1: "real updates * \<gamma> \<le> \<theta> \<bullet> theta + b * bias"
     and hyp2: "\<parallel>theta\<parallel>\<^sup>2 + bias\<^sup>2 \<le> real updates * R\<^sup>2"
  hence "0 \<le> updates"
    by blast
  have cs: "\<theta> \<bullet> theta + b * bias \<le> norm (theta, bias) * norm (\<theta>, b)"
    using norm_cauchy_schwarz[of  "(\<theta>, b)" "(theta, bias)"] by (simp add: cross3_simps(11))
  also have "... = sqrt (\<parallel>theta\<parallel>\<^sup>2 + bias\<^sup>2)"
    using sep_norm by (simp add: norm_Pair)
  finally have "\<theta> \<bullet> theta + b * bias \<le> sqrt (\<parallel>theta\<parallel>\<^sup>2 + bias\<^sup>2)".
  hence key: "(real updates * \<gamma>)\<^sup>2 \<le> real updates * R\<^sup>2"
    using hyp2 by (smt (verit) Num.of_nat_simps(1) \<gamma>pos of_nat_0_less_iff
        bot_nat_0.not_eq_extremum hyp1 real_less_lsqrt zero_compare_simps(4))
  have obs1: "updates = 0 \<Longrightarrow> real updates \<le> (R / \<gamma>)\<^sup>2"
    by fastforce
  have key_div: "updates \<noteq> 0 \<Longrightarrow> real updates * \<gamma>\<^sup>2 > 0"
    using \<gamma>pos \<open>0 \<le> updates\<close> by (clarsimp simp: field_simps)
  hence "updates \<noteq> 0 \<Longrightarrow> (real_of_int updates * \<gamma>)\<^sup>2 / (real updates * \<gamma>\<^sup>2)
    \<le> (real updates * R\<^sup>2) / (real updates * \<gamma>\<^sup>2)"
    using key by (metis divide_right_mono less_eq_real_def of_int_of_nat_eq)
  hence obs2: "updates \<noteq> 0 \<Longrightarrow> real updates \<le> (R / \<gamma>)\<^sup>2"
    using \<gamma>pos by (simp add: power2_eq_square power_divide)
  show "real updates \<le> (R / \<gamma>)\<^sup>2"
    using obs1 obs2 by blast
  thus "updates \<le> nat \<lceil>(R / \<gamma>)\<^sup>2\<rceil>"
    by linarith

  (* final goal *)
  fix old_updates :: "\<nat>"
  assume  "old_updates \<le> nat \<lceil>(R / \<gamma>)\<^sup>2\<rceil>"
    and   "old_updates < updates"
  thus "nat \<lceil>(R / \<gamma>)\<^sup>2\<rceil> - updates < nat \<lceil>(R / \<gamma>)\<^sup>2\<rceil> - old_updates"
    using \<open>updates \<le> nat \<lceil>(R / \<gamma>)\<^sup>2\<rceil>\<close> by (metis diff_less_mono2 linorder_neqE_nat not_le)
qed

theorem perceptron_convergence_theorem:
  fixes data :: "((real vec['n]) \<times> real) list"
    and R \<gamma> b :: real
    and \<theta> :: "real vec['n]"
  assumes bounded:    "\<And>t. t < length data \<Longrightarrow> \<parallel>fst (data ! t)\<parallel>\<^sup>2 + 1 \<le> R\<^sup>2"
  assumes labels:     "\<And>t. t < length data \<Longrightarrow> snd (data ! t) \<in> {-1, 1}"
  assumes sep_norm:   "\<parallel>\<theta>\<parallel>\<^sup>2 + b\<^sup>2 = 1"
  assumes sep_margin: "\<forall>t < length data. snd (data ! t) * ((fst (data ! t)) \<bullet> \<theta> + b) \<ge> \<gamma>"
  assumes \<gamma>pos:       "\<gamma> > 0"
  shows "H[True] perceptron (data)
    [real updates \<le> (R/\<gamma>)^2 \<and>
    (\<forall>i < length data. sgn (fst (data ! i) \<bullet> theta + bias) = snd (data ! i))]"
proof -
  have "H[True] perceptron_aux (data, \<theta>, b, \<gamma>, R)
[real updates \<le> (R/\<gamma>)^2 \<and> (\<forall>i < length data. sgn (fst (data ! i) \<bullet> theta + bias) = snd (data ! i))]"
    using \<gamma>pos assms(1,4) labels
      perceptron_aux_convergence[of data R \<theta> b \<gamma>]
      real_scaleR_def[of "snd (data ! _)" "fst (data ! _) \<bullet> \<theta> + b"]
      sep_norm by presburger
  thus ?thesis
    by (simp add: perceptron_aux_is_perceptron)
qed

end