mirror of
https://github.com/Andreaierardi/Master-DataScience-Notes.git
synced 2025-01-27 11:47:36 +01:00
166 lines
5.2 KiB
TeX
166 lines
5.2 KiB
TeX
\documentclass[../main.tex]{subfiles}
|
|
\begin{document}
|
|
|
|
\chapter{Lecture 23 - 08-06-2020}
|
|
\bred{Bagging}
|
|
$$
|
|
h_1,...,h_T \qquad \hat{\ell}(f) \leq e^{-2 \, T \gamma^2} \qquad \gamma_t > \gamma>0
|
|
$$
|
|
Under the assumption that $ \{ h_t(x_z) \neq y_z \} $ \qquad $ \gamma_t = \frac{1}{2} - \hat{\ell}_s(h_t)$ are independent
|
|
$$
|
|
f = sgn ( \sum_{i=1}^T h_i) \qquad Bagging
|
|
$$
|
|
\section{Boosting}
|
|
$$
|
|
f = sgn ( \sum_{i=1}^T w_i h_i) \qquad Boosting
|
|
$$
|
|
The hard thing here is how to compute the weights.
|
|
\\
|
|
$
|
|
h_1, ..., h_T : \quad X \rightarrow \{ -1,+1 \}
|
|
$
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=0.3\linewidth]{../img/lez23-img1.JPG}
|
|
\caption{}
|
|
%\label{fig:}
|
|
\end{figure}\\
|
|
\\
|
|
$$
|
|
\hat{\ell}(f) = \frac{1}{m} \sum^m_{t=1} I \{ y_t g(x_t) \leq 0 \}
|
|
\leq
|
|
\frac{1}{m} \sum_{t=1}^m e^{-g(x_t) y_t
|
|
} = $$
|
|
$$
|
|
g = \sum_{i = 1}^T w_i h_i \textit{and we substitute g} \qquad and \qquad f = sgn(g)
|
|
$$
|
|
$$
|
|
= \ \frac{1}{m} \sum^m_{t=1} e^{-y_t \, \sum_{i=1}^T w_i h_i(x_t)} \qquad L_i(t) = h_i (x_t) y_t \in \{-1,+1 \} \quad i = 1, ..., T
|
|
$$
|
|
$
|
|
L_i(z)
|
|
$ where $Z$ uniform over $\{1,...,m\}$
|
|
$$
|
|
\hat{\ell}(f) \leq \frac{1}{m} \sum_{t=1}^m e^{- \sum_{i=1}^T w_i L_i (t)} = \barra{E} \left[ e^{-\sum_{i=1}^T w_i L_i (Z)}\right]
|
|
$$
|
|
$$
|
|
\barra{E} \left[ \prod_{i=1}^T e^{- w_i L_i}\right] =^? \prod^T_{i=1} \barra{E} \left[ e^{-w_i L_i} \right]
|
|
$$
|
|
Ok if the $L_i$ are independent
|
|
\\$E\left[XY \right] = \barra{E}[X] \, \barra{E} [Y]$
|
|
\\
|
|
$X,Y$ are independent
|
|
\\
|
|
$\barra{E}_i $ is the expectation with respect to a probability $P_i$, and $P_i$ is a distribution over $\{1,...,m\}$
|
|
\\
|
|
$$
|
|
\hat{\ell}(f)\ \leq \ \prod_{i=1}^T \barra{E}_i \left[ e^{- w_i L_i} \right]
|
|
\ = \ \prod_{i=1}^T \left( e^{-w_i} P_i (L_i = 1) + e^{w_i} P_i\left(L_i=-1 \right) \right) \ = \ \prod_{i=1}^T \left( e^{-w_i}(1 - \varepsilon_i) + e^{w_i} \varepsilon_i \right)$$
|
|
$ L_i(z) \qquad z \sim P_i$
|
|
$$
|
|
\varepsilon_i = P_i(L_i = -1) = \sum_{t=1}^m I \{y_t h_i(x_t) \leq 0 \} P_i(t) \qquad \textbf{weighted training error of $h_i$}
|
|
$$
|
|
$$
|
|
F(w) = e^{-w} (1- \varepsilon) + e^w \varepsilon \qquad F'(w) = 0 \Leftrightarrow w = \frac{1}{2} \ln \frac{1-\varepsilon}{\varepsilon} \qquad 0< \varepsilon< 1
|
|
$$
|
|
$$
|
|
P_i(t) > 0 \quad \forall i,t \qquad \varepsilon_i = \frac{1}{2} \Rightarrow w_i = 0
|
|
$$
|
|
$$
|
|
\varepsilon_i >\frac{1}{2} \Rightarrow w_i < 0 \qquad \qquad
|
|
\varepsilon_i < \frac{1}{2} \Rightarrow w_i > 0
|
|
$$
|
|
|
|
$$
|
|
\hat{\ell}(f) \ \leq \ \prod_{i=1}^T \sqrt[]{4 \, \varepsilon_i (1-\varepsilon_i)}
|
|
$$
|
|
$$
|
|
\gamma_i = \frac{1}{2} - \varepsilon_i \qquad \textit{edge over random guessing $ 0 < \varepsilon_i <1$}
|
|
$$
|
|
$$
|
|
1 + x \leq e^x \quad \forall x \in \barra{R} \quad \hat{\ell}(f) \leq \prod_{i=1}^T \sqrt[]{4 \, \varepsilon_i(1 - \varepsilon_i)} \ = \
|
|
\prod_{i=1}^T \sqrt[]{1- 4 \gamma_i^2} =
|
|
$$
|
|
$$\ = \ \prod_{i = 1}^T \sqrt[]{4 (\frac{1}{2} - \gamma_i)(\frac{1}{2}+ \gamma_i)} \ \leq \prod_{i=1}^T e^{- 2 \gamma_i^2} = e^{- 2 \sum_{i=1}^T \gamma_i^2} \leq e^{-2 T \gamma^2}
|
|
$$
|
|
$
|
|
\textit{if} \quad |\gamma_i| > \gamma > 0 \quad i = 1, ..., T$
|
|
$$
|
|
\hat{\ell}_s(f) = 0 \ \Leftarrow \ e^{-2 T \gamma^2}< \frac{1}{m} \ \Leftrightarrow \ T > \frac{ \ln m}{2 \gamma^2}
|
|
$$
|
|
$$
|
|
E \left[ \prod_i e^{-w_i L_i }\right] \ = \ \prod_i E \left[ e^{-w_i L_i } \right]
|
|
$$
|
|
$P_1,...,P_T \quad P_1(t) = \frac{1}{m}$ \quad $ t = 1,...,m$
|
|
$$
|
|
P_{i+1}(t) = \frac{P_i(t) e^{-w_i L_i(t)}}{E_i \left[ e^{-w_i L_i} \right] } \qquad \sum_t P_{i+1}(t) = \frac{1}{E_i [e^{-w_i L_i}]}
|
|
\sum_t P_i(t) e^{...}
|
|
$$
|
|
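% TODO: part of the lecture is missing here in the original notes
Since $\sum_t P_i(t) e^{-w_i L_i(t)} = E_i [e^{-w_i L_i}]$, each $P_{i+1}$ sums to $1$. Rearranging the definition of $P_{i+1}$ we get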
|
|
$$
|
|
e^{-w_i L_i(t)} = E_i [e^{-w_i L_i}] \frac{P_{i+1}(t)}{P_i(t)}
|
|
$$
|
|
$$
|
|
E[\prod_{i=1}^T e^{-w_i L_i}] = \frac{1}{m} \sum_{t=1}^m \prod_{i=1}^T e^{-w_i L_i(t)} = \frac{1}{m} \sum_t \left( \prod_i E_i [e^{- w_i L_i}] \frac{P_{i+1}(t)}{P_i(t)}\right) \ =
|
|
$$
|
|
$$
|
|
\frac{1}{m} \sum_t \left( \prod_i E_i [e^{-w_i L_i}] \right) \frac{P_{T+1}(t)}{P_1(t)} = \left( \prod_i E_i[e^{-w_i L_i}] \right) \red{\frac{1}{m} \sum_t \frac{P_{T+1}(t)}{1/m}}
|
|
$$
|
|
where the \bred{red} term cancels out since it is $= 1$
|
|
\newpage
|
|
\section{Adaboost}
|
|
It is a meta learning algorithm.
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=0.5\linewidth]{../img/lez23-img2.JPG}
|
|
\caption{}
|
|
%\label{fig:}
|
|
\end{figure}\\
|
|
\\
|
|
Initialize $P_1(t) = \frac{1}{m} \quad t=1,...,m$\\
|
|
For $i = 1,...,T$\\
|
|
1) Feed $A$ with $S$ weighted by $P_i$ and get $h_i$\\
|
|
2) $w_i = \frac{1}{2} \ln \frac{1-\varepsilon_i}{\varepsilon_i}$
|
|
\\
|
|
3) Compute $P_{i+1}$
|
|
\\
|
|
Output $\sum_i w_i h_i$
|
|
\\\\
|
|
What should $A$ do? \\
|
|
1) $A$ should pay attention to $P_i$ \\
|
|
2) More precisely $A$ should output $h_i$ s.t. $|\gamma_i|$ is as big as possible \\ where $ \gamma_i = \frac{1}{2} - \varepsilon_i$
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=0.5\linewidth]{../img/lez23-img3.JPG}
|
|
\caption{}
|
|
%\label{fig:}
|
|
\end{figure}\\
|
|
$$
|
|
P_{i+1}(t) = \frac{P_i(t) e^{-w_i L_i(t)} }{E_i[ e^{-w_i L_i} ]}
|
|
$$
|
|
$$
|
|
L_i(t) = 1 \ \Leftrightarrow h_i(x_t) = y_t \qquad w_i > 0
|
|
$$
|
|
\\
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=0.3\linewidth]{../img/lez23-img4.JPG}
|
|
\caption{}
|
|
%\label{fig:}
|
|
\end{figure}\\
|
|
Typically $h_i$ (classifiers) are simple
|
|
\\
|
|
\bred{Decision stumps}:
|
|
$$
|
|
h(x) = \pm \ sgn (x_i- \tau)
|
|
$$
|
|
$i$ is the feature index, $\tau \in \barra{R}$\\
|
|
At the end boosting is gonna look like this
|
|
\\
|
|
\begin{figure}[h]
|
|
\centering
|
|
\includegraphics[width=0.5\linewidth]{../img/lez23-img5.JPG}
|
|
\caption{}
|
|
%\label{fig:}
|
|
\end{figure}\\
|
|
\end{document} |