mirror of
https://github.com/Andreaierardi/Master-DataScience-Notes.git
synced 2025-01-27 11:47:36 +01:00
186 lines
6.7 KiB
TeX
186 lines
6.7 KiB
TeX
\documentclass[../main.tex]{subfiles}
|
|
\begin{document}
|
|
|
|
\chapter{Lecture 21 - 25-05-2020}
|
|
|
|
\section{Pegasos in Kernel space}
|
|
|
|
Objective function was $$F_\lambda (w) = \frac{1}{m} \sum_{t=1}^{m} h_t(w) + \frac{\lambda}{2} \|w\|^2 \quad w \in \barra{R}^d$$
|
|
$$
|
|
w_{T+1} = \frac{1}{\lambda \, T} \sum_{t=1}^T y_{st} \, x_{st} \, I \{ h_{st} (w_t) > 0 \} \qquad s_1, \dots, s_T \quad (realised \ draws \ in \ training)
|
|
$$
|
|
$$
|
|
K \qquad H_k = \left\{ \sum_i \alpha_i k(x_i, \cdot) \ : \ \alpha_i \in \barra{R}, \ x_i \in X \right\} \qquad g \in H_k
|
|
$$
|
|
$$
|
|
F_\lambda(g) = \frac{1}{m} \sum_{t=1}^m h_t(g) + \frac{\lambda}{2}
|
|
\| g\|^2 \qquad h_t(g) = \left[ 1- y_t \, g(x_t) \, \right]_+ $$
|
|
$$
|
|
g_{T+1} = \frac{1}{\lambda \, T} \sum_{t=1}^T \red{ y_{st} \, k (x_{st}, \cdot) \, I \{ h_{st}(g_t) > 0 \} }
|
|
$$
|
|
where \red{red} part is $v_{st}$
|
|
\section{Stability}
|
|
A way to bound the risk of a predictor. \\ It controls the variance error and leaves to the user the job of minimising the bias.
|
|
\\ Variance error is due to the fact that the predictor an algorithm generates from the training set depends strongly on the training set itself.
|
|
If we perturb the training set our predictor will change a lot.
|
|
\\\\
|
|
Minimisation of training error $ \Rightarrow $ predictor changes if training set is perturbed $\Rightarrow $ risk of overfitting
|
|
\\
|
|
Stability is the opposite: a stable algorithm avoids overfitting because its predictor changes little when we perturb the training set.
|
|
\\
|
|
\begin{itemize}
|
|
\item $S$ Training set $(x_1,y_1), \dots, (x_m, y_m)$
|
|
\item loss function $\ell$
|
|
\item distribution $D$
|
|
\end{itemize}
|
|
$h: X \rightarrow Y$ $\ell_D(h)$ risk of $h$\\
|
|
$z_t = (x_t,y_t) \quad \ell(h, z_t) = \ell(h(x_t),y_t)$
|
|
$$
|
|
\hat{\ell}_s(h) = \frac{1}{m} \sum_{t=1}^m \ell(h,z_t)
|
|
$$
|
|
Perturbation $z_t' = (x_t', y_t') $ also drawn from $D$
|
|
\\
|
|
$
|
|
S^{(t)} \textbf{is $S$ where $z_t$ is replaced by $z'_t$ }
|
|
$
|
|
$h_s= A(S)$
|
|
\\
|
|
A learning algorithm is $\varepsilon$-stable $\qquad$ $(\varepsilon > 0) \qquad h_s^{(t)} = A(S^{(t)})$
|
|
$$
|
|
\ell(h_s^{(t)}, z_t) - \ell(h_s, z_t)
|
|
$$
|
|
we expect this subtraction result to be positive.
|
|
$$
|
|
\barra{E} \left[ \,\ell(h_s^{(t)}, z_t) - \ell(h_s, z_t) \, \right] \leq \varepsilon \qquad \forall t = 1, \dots, m
|
|
$$
|
|
where the expectation $\barra{E}[ \, \cdot \, ]$ is taken with respect to the random draw of $S$ and $z_t'$
|
|
\\
|
|
$z_t$ and $z'_t$ come from $D$ both
|
|
\\
|
|
$$
|
|
\barra{E} \left[ \ell(h_s,z'_t) - \ell(h_s^{(t)}, z_t' ) \right] \leq \varepsilon
|
|
$$
|
|
\bred{Theorem}
|
|
\\
|
|
If $A$ is $\varepsilon$-stable, then $$
|
|
\barra{E} \left[ \ell_D(h_s) - \hat{\ell}_s(h_s) \right] \leq \varepsilon
|
|
$$
|
|
\\
|
|
Proof: \ $S \qquad z_t = (x_t, y_t) \qquad s' \qquad z'_t = (x_t',y_t') \qquad D$
|
|
$$
|
|
\barra{E} \left[ \hat{\ell}_s (h_s) \right] = \barra{E} \left[ \frac{1}{m} \sum_{t=1}^m \ell(h_s,z_t) \right] = \frac{1}{m} \sum_{t=1}^m \barra{E} \left[ \ell(h_s, z_t ) \right] \ = \ \frac{1}{m} \sum_{t=1}^m \barra{E} \left[ \ell(h_s^{(t)}, z_t') \right]
|
|
$$
|
|
$$
|
|
\ell_D(h_s) = \barra{E} \left[ \ell(h_s, z'_t) \mid S \right] = \frac{1}{m} \sum_{t=1}^m \barra{E} \left[ \ell(h_s, z'_t) \mid S \right]
|
|
$$
|
|
Average with respect to random draw of $S$\\
|
|
$
|
|
\barra{E} \left[ \ell_D (h_s) \right] = \frac{1}{m} \sum_{t=1}^m \barra{E} \left[ \ell(h_s, z'_t)\right]
|
|
$
|
|
$$
|
|
\barra{E} \left[ \ell_D(h_s) - \hat{\ell}_s (h_s) \right] = \frac{1}{m} \sum_{t=1}^m \barra{E} \left[ \ell(h_s, z'_t) - \ell(h_s^{(t)}, z'_t) \right] \leq \varepsilon
|
|
$$
|
|
A stable algorithm does not overfit (but it can still underfit!).
|
|
\\So if an $ERM$ algorithm is $\varepsilon$-stable, it would be pretty good.
|
|
\\\\
|
|
\bred{Theorem}
|
|
\\
|
|
If \textbf{$A$} is $\varepsilon$-stable and it approximates $ERM$ in a class $H$:
|
|
$$
|
|
\hat{\ell}_s(h_s) \leq \min_{h \in H} \hat{\ell}_s (h) + \gamma \qquad \forall s, \ h_s = A(S)
|
|
$$
|
|
for some $\gamma > 0$, then:
|
|
$$
|
|
\barra{E} \left[ \ell_D (h_s) \right] \leq \min_{h \in H} \ell_D(h) + \varepsilon + \gamma
|
|
$$
|
|
\bred{Proof}
|
|
\\
|
|
$$
|
|
\barra{E} \left[ \ell_D(h_s) \right] = \barra{E} \left[ \ell_D (h_s) - \hat{\ell}_s(h_s) \right] + \barra{E} \left[ \hat{\ell}_s(h_s) - \hat{\ell}_s (h^*) \right] + \barra{E} \left[ \hat{\ell}_s (h^*) \right]
|
|
$$
|
|
$$
|
|
h^* = \arg\min_{h \in H} \ \ell_D (h)
|
|
$$
|
|
$$
|
|
\barra{E} \left[ \hat{\ell}_s(h^*) \right] = \ell_D(h^*) \ \longrightarrow \ \barra{E} \left[ \frac{1}{m} \sum_t \ell(h^*, z_t) \right] = \frac{1}{m} \sum_t \red{\barra{E} \left[ \ell(h^*, z_t ) \right] }
|
|
$$
|
|
where \bred{red} is $ \ell_D(h^*)$
|
|
\\\\
|
|
$
|
|
\ell(\cdot , z)
|
|
$ is a convex function $\ell(w,z)$ \\ $\exists L > 0 \qquad | \ell(w,z)- \ell(w',z) | \leq L \| w-w'\|$
|
|
\\
|
|
$z = (x,y)$
|
|
\\
|
|
In the case of SVM, $\ell(w,z) = \left[ 1 - y \, w^T \, x \right]_+$
|
|
$ \exists L > 0 \quad \forall z \ \forall w, w'$
|
|
$$
|
|
| \ell(w,z) - \ell(w',z) | \leq L \| w-w'\|
|
|
$$
|
|
where $\ell$ is \textbf{Lipschitz}
|
|
\\\\
|
|
\bred{Theorem}
|
|
\\
|
|
Let $\ell$ be convex, Lipschitz and differentiable.
|
|
\\
|
|
Consider $A$ \qquad $ A(S) = w_s$ where
|
|
\\
|
|
$$
|
|
w_s = \arg\min_{w \in \barra{R}^d} \left( \hat{\ell}_s (w) + \frac{\lambda}{2}\| w\|^2 \right)
|
|
$$
|
|
If $\ell$ is hinge loss, then $A$ is $SVM$.\\
|
|
then $A$ is $\frac{(2 \, L)^2}{\lambda \, m}$-stable \qquad $\forall \lambda > 0 $
|
|
\\\\
|
|
\bred{Proof}
|
|
\\
|
|
Fix $\lambda > 0$ \qquad $F_s (w) = \hat{\ell}_s(w) + \frac{\lambda}{2} \| w \|^2$
|
|
$$
|
|
w_s = \arg\min_{w \in \barra{R}^d} F_s (w) \qquad w_s^{(t)} = \arg\min_{w \in \barra{R}^d } F_s^{(t)}(w)
|
|
$$
|
|
$$
|
|
\ell(w_s,z'_t) - \ell(w_s^{(t)},z'_t) \leq \varepsilon \qquad \forall s, z'_t \ \forall t
|
|
$$
|
|
Use the Lipschitz property:
|
|
$$
|
|
| \ell(w_s, z'_t) - \ell(w_s^{(t)}, z'_t) | \leq L \| w_s-w_s^{(t)} \|
|
|
$$
|
|
$w = w_s, \ w' = w_s^{(t)}$
|
|
$$
|
|
F_s(w') - F_s(w) = \hat{\ell}_s(w') - \hat{\ell}_s(w) + \frac{\lambda}{2} \| w' \|^2 - \frac{\lambda}{2} \|w \|^2 \ =
|
|
$$
|
|
$$
|
|
\ = \
|
|
\hat{\ell}_s^{(t)}(w') - \hat{\ell}_s^{(t)}(w) + \frac{1}{m} \left( \ell(w', z_t) - \ell(w,z_t) \right) - \frac{1}{m} \left( \ell(w', z'_t ) - \ell(w, z'_t) \right) + \frac{\lambda}{2} ( \| w'\|^2 - \| w \|^2 ) \ =
|
|
$$
|
|
$$
|
|
= \ \red{F_s^{(t)} (w') - F_s^{(t)}(w)} + \frac{1}{m} (\ell(w', z_t) - \ell (w,z_t) ) - \frac{1}{m}( \ell(w',z'_t)-\ell(w,z'_t)) \ \leq
|
|
$$
|
|
where \bred{red} is $\leq 0$
|
|
$$
|
|
\ \leq \ \frac{1}{m} | \ell(w',z_t) - \ell(w, z_t) | + \frac{1}{m} | \ell(w',z_t') - \ell(w, z'_t) | \leq \
|
|
$$
|
|
Since $\ell$ is Lipschitz, each of the two absolute values is at most $L \, \| w - w' \|$, hence:
|
|
$$
|
|
F_s(w') - F_s(w) \leq \frac{2 \, L}{m} \| w- w'\|
|
|
$$
|
|
$F_s$ is $\lambda$-SC \qquad $F_s(w') \geq F_s(w) + \nabla F_s(w)^T (w'-w)+\frac{\lambda}{2}\|w- w'\|^2$
|
|
Since $w$ is the minimiser of $F_s$, the gradient vanishes: $ \nabla F_s(w) = 0$
|
|
Therefore:
|
|
$$
|
|
F_s(w') - F_s(w) \geq \frac{\lambda}{2}
|
|
\| w- w'\|^2 $$
|
|
$$
|
|
\frac{\lambda}{2} \| w- w'\|^2 \leq \frac{2 \, L}{m} \|w-w'\| \Rightarrow \| w-w' \| \leq \frac{4 \, L}{\lambda \, m}
|
|
$$
|
|
$$
|
|
\ell(w_s, z'_t) - \ell(w_s^{(t)}, z'_t) \leq \frac{4 \, L^2}{\lambda \, m}
|
|
$$
|
|
We now know the stability of the SVM.
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=0.8\linewidth]{../img/lez21-img1.JPG}
|
|
\caption{}
|
|
%\label{fig:}
|
|
\end{figure}
|
|
\end{document}
|