Commit f609a86

Update ddasp_exercise_slides.tex
- ex 06 vs. 06 mods - PCA typo -> improved equations
1 parent 5a5b477 commit f609a86

1 file changed

slides/ddasp_exercise_slides.tex (+125 −53)
@@ -2289,9 +2289,52 @@ \subsection{Exercise 04}
 
 
 
+
+
 \subsection{Exercise 05}
 
-\begin{frame}{Ex05: Condition Number / Regularization}
+\begin{frame}{Ex05: Least Squares / Left Inverse / Projection into Subspaces / Linear Regression}
+%
+Toy Example 1
+
+$
+\bm{X}=
+\begin{bmatrix}
+2 & 0 \\ 0 & 1 \\ 0 & 0 \\ 0 & 0
+\end{bmatrix}\quad
+\bm{y} =
+\begin{bmatrix}
+4 \\ 1 \\ -1 \\ -2
+\end{bmatrix}
+$
+%
+\qquad
+%
+$\min_{\text{wrt }\bm{\theta}} \lVert\bm{e}\rVert_2^2 = \min_{\text{wrt }\bm{\theta}} \lVert\bm{y} - \bm{X} \bm{\theta}\rVert_2^2
+\leftrightarrow
+\operatorname*{argmin}_{\bm{\theta}} \lVert\bm{y} - \bm{X} \bm{\theta}\rVert_2^2
+$
+
+\addvspace{100mm}
+
+\end{frame}
+
+
+\begin{frame}{Linear Regression}
+Toy Example 2
+
+only on Clever Touch
+\end{frame}
+
+
+
+
+
+
+
+
+\subsection{Exercise 06}
+\begin{frame}{Ex06: Condition Number / Regularization}
 Objectives
 \begin{itemize}
 \item concept of the condition number in terms of singular values
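A quick numerical check of the new Toy Example 1 (a NumPy sketch, not part of the commit; all variable names are mine): applying the left inverse gives $\hat{\bm{\theta}} = [2, 1]^\mathrm{T}$, and the residual is exactly the part of $\bm{y}$ outside the column space of $\bm{X}$.

```python
import numpy as np

# Toy Example 1: overdetermined system with full column rank,
# solved via the left inverse (ordinary least squares)
X = np.array([[2., 0.],
              [0., 1.],
              [0., 0.],
              [0., 0.]])
y = np.array([4., 1., -1., -2.])

# theta_hat = (X^T X)^{-1} X^T y, the left inverse applied to y
theta_hat = np.linalg.solve(X.T @ X, X.T @ y)
print(theta_hat)    # [2. 1.]

# residual e = y - X theta_hat: X theta_hat is the orthogonal
# projection of y onto the column space, so X^T e = 0
e = y - X @ theta_hat
print(e)            # [ 0.  0. -1. -2.]
print(X.T @ e)      # [0. 0.]
```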
@@ -2387,7 +2430,7 @@ \subsection{Exercise 05}
 
 
 
-\begin{frame}[t]{Ex05: Regularization of the LS-Problem}
+\begin{frame}[t]{Regularization of the LS-Problem}
 $\cdot$ full column rank inverse problem $\rightarrow$ solve with left inverse $\bm{X}^{\dagger_l} \bm{y} = \bm{\theta}$
 $$
 \bm{X}^{\dagger_l}_{N \times M}
@@ -2462,8 +2505,8 @@ \subsection{Exercise 05}
 \begin{frame}[t]{L-Curve to Find Optimum Regularization Parameter $\lambda$}
 $$
 \hat{\bm{\theta}}(\textcolor{C0}{\lambda}) \quad=\quad
-\left[\bm{V} (\bm{\Sigma}^\mathrm{T} \bm{\Sigma} + \textcolor{C0}{\lambda}\bm{I})^{-1} \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{H}\right] \bm{y} \quad=\quad
-\left[(\bm{X}^\mathrm{H}\bm{X} + \textcolor{C0}{\lambda}\bm{I})^{-1} \bm{X}^\mathrm{H}\right] \bm{y}
+\left[\bm{V} (\bm{\Sigma}^\mathrm{T} \bm{\Sigma} + \textcolor{C0}{\lambda}\bm{I})^{-1} \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{T}\right] \bm{y} \quad=\quad
+\left[(\bm{X}^\mathrm{T}\bm{X} + \textcolor{C0}{\lambda}\bm{I})^{-1} \bm{X}^\mathrm{T}\right] \bm{y}
 $$
 \begin{center}
 \begin{tikzpicture}[scale=1]
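For reference, the L-curve itself can be traced numerically from the expression above (a sketch on made-up data; the $\lambda$ grid and all names are mine): sweep $\lambda$, compute $\hat{\bm{\theta}}(\lambda)$, and record residual norm against solution norm; the corner of the resulting log-log curve marks a good $\lambda$.

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((50, 10))
y = X @ rng.standard_normal(10) + 0.1 * rng.standard_normal(50)

# economic SVD X = U diag(s) Vt, so
# theta_hat(lam) = V diag(s / (s^2 + lam)) U^T y
U, s, Vt = np.linalg.svd(X, full_matrices=False)
for lam in np.logspace(-6, 2, 9):
    theta_hat = Vt.T @ ((s / (s**2 + lam)) * (U.T @ y))
    print(f"lam={lam:8.1e}  "
          f"residual={np.linalg.norm(y - X @ theta_hat):.4f}  "
          f"norm={np.linalg.norm(theta_hat):.4f}")
```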
@@ -2519,73 +2562,102 @@ \subsection{Exercise 05}
 \lVert\bm{y} - \bm{X} \bm{\theta}\rVert_2^2 + \textcolor{C0}{\lambda} \lVert \bm{\theta} \rVert_2^2
 $$
 
-$\cdot$ and the plain Least Squares Error Problem (special case for $\textcolor{C0}{\lambda}=0$)
+$\cdot$ and the ordinary Least Squares Error Problem (i.e. the special case $\textcolor{C0}{\lambda}=0$)
 $$
 \min_{\text{wrt }\bm{\theta}} J(\bm{\theta}) \quad\text{with cost function}\quad
 J(\bm{\theta}) = \lVert\bm{y} - \bm{X} \bm{\theta}\rVert_2^2
 $$
 
-have the closed form solution using the (regularized) left inverse of $\bm{X} = \bm{U}\bm{\Sigma}\bm{V}^H$:
+have the closed form solution using the (regularized) left inverse of $\bm{X} = \bm{U}\bm{\Sigma}\bm{V}^\mathrm{T}$:
 $$
 \hat{\bm{\theta}} \quad=\quad
-\left[\bm{V} (\bm{\Sigma}^\mathrm{T} \bm{\Sigma} + \textcolor{C0}{\lambda \bm{I})}^{-1} \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{H}\right] \bm{y} \quad=\quad
-\left[(\bm{X}^\mathrm{H}\bm{X} + \textcolor{C0}{\lambda \bm{I}})^{-1} \bm{X}^\mathrm{H}\right] \bm{y}
+\left[\bm{V} (\bm{\Sigma}^\mathrm{T} \bm{\Sigma} + \textcolor{C0}{\lambda} \bm{I})^{-1} \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{T}\right] \bm{y} \quad=\quad
+\left[(\bm{X}^\mathrm{T}\bm{X} + \textcolor{C0}{\lambda} \bm{I})^{-1} \bm{X}^\mathrm{T}\right] \bm{y}
 $$
 
 Such a model has $N$ \underline{model parameters} $\bm{\theta} = [\theta_1, \theta_2 \dots \theta_N]^T$ and one \underline{hyper parameter} $\textcolor{C0}{\lambda}$ to be learned from the data $\bm{y}_{M \times 1}$ and the feature matrix $\bm{X}_{M \times N}$ with full column rank $R=N$.
 
 \end{frame}
 
+\begin{frame}[t]{Normal Equations of Tikhonov Regularization in SVD Domain}
 
+The ridge regression optimization problem
+$$
+\min_{\text{wrt }\bm{\theta}} J(\bm{\theta}) \quad\text{with cost function}\quad
+J(\bm{\theta}) =
+\lVert\bm{y} - \bm{X} \bm{\theta}\rVert_2^2 + \textcolor{C0}{\lambda} \lVert \bm{\theta} \rVert_2^2
+$$
+yields the normal equations
+$$(\bm{X}^\mathrm{T}\bm{X} + \lambda \bm{I}) \hat{\bm{\theta}} = \bm{X}^\mathrm{T} \bm{y}$$
+and can be solved in the SVD domain
+$$((\bm{U} \bm{\Sigma} \bm{V}^\mathrm{T})^\mathrm{T} (\bm{U} \bm{\Sigma} \bm{V}^\mathrm{T}) + \lambda \bm{I}) \hat{\bm{\theta}}=
+(\bm{U} \bm{\Sigma} \bm{V}^\mathrm{T})^\mathrm{T} \bm{y}$$
 
-\subsection{Exercise 06}
+%$$(\bm{V} \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{T} \bm{U} \bm{\Sigma} \bm{V}^\mathrm{T} + \lambda \bm{I}) \hat{\bm{\theta}}=
+%\bm{V} \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{T} \bm{y}$$
 
-\begin{frame}{Ex06: Audio Toy Example for Linear Regression and SVD}
-Objectives
-\begin{itemize}
-\item audio multitrack data (stems) arranged as data matrix
-\item the SVD of this matrix allows to listen to the U space, i.e. to the orthogonal audio signals (which is some source separation approach)
-\item try to find the mixing gains of a mix that is corrupted by noise
-\end{itemize}
-\end{frame}
+$$(\bm{V} \bm{\Sigma}^\mathrm{T} \bm{\Sigma} \bm{V}^\mathrm{T} + \lambda \bm{V}\bm{I}\bm{V}^\mathrm{T}) \hat{\bm{\theta}}=
+\bm{V} \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{T} \bm{y}$$
 
-\begin{frame}[t]{Ex06: Audio Toy Example for Linear Regression and SVD}
-\begin{center}
-$
-\def\L{0.5}
-\def\F{1}
-\def\N{3}
-\def\rank{0.999999}
-\drawmatrix[fill=none, height=\N, width=0]{y}_\mathtt{N \times 1} =
-\drawmatrix[fill=none, height=\N, width=\F]{X}_\mathtt{N \times F}
-\drawmatrix[fill=none, height=\F, width=0]\theta_\mathtt{F \times 1}+
-\drawmatrix[fill=none, height=\N, width=0]{\nu}_\mathtt{N \times 1}
-=
-\drawmatrix[bbox style={fill=C4}, bbox height=\N, bbox width=\N, fill=C0, height=\N, width=\rank\F]U_\mathtt{N \times N}
-\drawmatrix[bbox style={fill=gray!50}, bbox height=\N, bbox width=\F, fill=white, height=\rank\F, width=\rank\F]\Sigma_\mathtt{N \times F}
-\drawmatrix[bbox style={fill=C1}, bbox height=\F, bbox width=\F, fill=C2, height=\F, width=\rank\F]{V}_\mathtt{F \times F}^H
-\drawmatrix[fill=none, height=\F, width=0]\theta_\mathtt{F \times 1}+
-\drawmatrix[fill=none, height=\N, width=0]{\nu}_\mathtt{N \times 1}
-$
-\end{center}
-\end{frame}
+$$[\bm{V} (\bm{\Sigma}^\mathrm{T} \bm{\Sigma} + \lambda \bm{I})\bm{V}^\mathrm{T}] \hat{\bm{\theta}}=
+\bm{V} \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{T} \bm{y}$$
+using the left inverse
+$$\hat{\bm{\theta}}=
+[\bm{V} (\bm{\Sigma}^\mathrm{T} \bm{\Sigma} + \lambda \bm{I})\bm{V}^\mathrm{T}]^{-1} \bm{V} \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{T} \bm{y}$$
 
+$$\hat{\bm{\theta}}=
+\bm{V} (\bm{\Sigma}^\mathrm{T} \bm{\Sigma} + \lambda \bm{I})^{-1} \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{T} \bm{y}$$
 
+\end{frame}
 
-\begin{frame}{Ex06: Audio Toy Example for Linear Regression and SVD}
-Consider the following linear combinations
-$$\bm{X} \bm{\theta} + \bm{\nu} = \bm{y}$$
-where $\bm{\theta}=[\theta_1, \theta_2, \theta_3, ..., \theta_{F}]^\mathrm{T}$ are typical variables for the model parameter vector.
+% \subsection{Exercise 06}
 %
-\begin{itemize}
-\item $\bm{X}_{N \times F}$ matrix with $N$ audio samples for each column, $f$-th column represents the $f$-th audiotrack / feature
-\item $\bm{\theta}_{F \times 1}$ column vector of scalar values that represent a dedicated gain for each audiotrack
-\item $\bm{\nu}_{N \times 1}$ column vector that represents an $N$-sample long noise signal added to the mixdown $\bm{X} \bm{\theta}$
-\item $\bm{y}_{N \times 1}$ audio signal with $N$ samples as a result of the linear model's linear combination plus noise
-\end{itemize}
+% \begin{frame}{Ex06: Audio Toy Example for Linear Regression and SVD}
+% Objectives
+% \begin{itemize}
+% \item audio multitrack data (stems) arranged as data matrix
+% \item the SVD of this matrix allows to listen to the U space, i.e. to the orthogonal audio signals (which is some source separation approach)
+% \item try to find the mixing gains of a mix that is corrupted by noise
+% \end{itemize}
+% \end{frame}
 %
-Let us assume that a) we know $\bm{X}$ (i.e. the individual audio tracks) and $\bm{y}$ (i.e. the noise-corrupted final mixdown), b) that we do not know the noise $\bm{\nu}$ and c) that we want to estimate the 'real world' mixing gains $\bm{\theta}$
-\end{frame}
+% \begin{frame}[t]{Ex06: Audio Toy Example for Linear Regression and SVD}
+% \begin{center}
+% $
+% \def\L{0.5}
+% \def\F{1}
+% \def\N{3}
+% \def\rank{0.999999}
+% \drawmatrix[fill=none, height=\N, width=0]{y}_\mathtt{N \times 1} =
+% \drawmatrix[fill=none, height=\N, width=\F]{X}_\mathtt{N \times F}
+% \drawmatrix[fill=none, height=\F, width=0]\theta_\mathtt{F \times 1}+
+% \drawmatrix[fill=none, height=\N, width=0]{\nu}_\mathtt{N \times 1}
+% =
+% \drawmatrix[bbox style={fill=C4}, bbox height=\N, bbox width=\N, fill=C0, height=\N, width=\rank\F]U_\mathtt{N \times N}
+% \drawmatrix[bbox style={fill=gray!50}, bbox height=\N, bbox width=\F, fill=white, height=\rank\F, width=\rank\F]\Sigma_\mathtt{N \times F}
+% \drawmatrix[bbox style={fill=C1}, bbox height=\F, bbox width=\F, fill=C2, height=\F, width=\rank\F]{V}_\mathtt{F \times F}^H
+% \drawmatrix[fill=none, height=\F, width=0]\theta_\mathtt{F \times 1}+
+% \drawmatrix[fill=none, height=\N, width=0]{\nu}_\mathtt{N \times 1}
+% $
+% \end{center}
+% \end{frame}
+%
+%
+%
+% \begin{frame}{Ex06: Audio Toy Example for Linear Regression and SVD}
+% Consider the following linear combinations
+% $$\bm{X} \bm{\theta} + \bm{\nu} = \bm{y}$$
+% where $\bm{\theta}=[\theta_1, \theta_2, \theta_3, ..., \theta_{F}]^\mathrm{T}$ are typical variables for the model parameter vector.
+% %
+% \begin{itemize}
+% \item $\bm{X}_{N \times F}$ matrix with $N$ audio samples for each column, $f$-th column represents the $f$-th audiotrack / feature
+% \item $\bm{\theta}_{F \times 1}$ column vector of scalar values that represent a dedicated gain for each audiotrack
+% \item $\bm{\nu}_{N \times 1}$ column vector that represents an $N$-sample long noise signal added to the mixdown $\bm{X} \bm{\theta}$
+% \item $\bm{y}_{N \times 1}$ audio signal with $N$ samples as a result of the linear model's linear combination plus noise
+% \end{itemize}
+% %
+% Let us assume that a) we know $\bm{X}$ (i.e. the individual audio tracks) and $\bm{y}$ (i.e. the noise-corrupted final mixdown), b) that we do not know the noise $\bm{\nu}$ and c) that we want to estimate the 'real world' mixing gains $\bm{\theta}$
+% \end{frame}
 
 
 
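The derivation added in this hunk is easy to verify numerically (a sketch with random data; dimensions and all names are mine): the normal-equations solution and the SVD-domain solution must coincide.

```python
import numpy as np

rng = np.random.default_rng(1)
M, N, lam = 8, 3, 0.5
X = rng.standard_normal((M, N))   # full column rank R = N
y = rng.standard_normal(M)

# normal equations: (X^T X + lam I) theta = X^T y
theta_ne = np.linalg.solve(X.T @ X + lam * np.eye(N), X.T @ y)

# SVD domain: theta = V (Sigma^T Sigma + lam I)^{-1} Sigma^T U^T y;
# with the economic SVD this collapses to diag(s / (s^2 + lam))
U, s, Vt = np.linalg.svd(X, full_matrices=False)
theta_svd = Vt.T @ np.diag(s / (s**2 + lam)) @ (U.T @ y)

print(np.allclose(theta_ne, theta_svd))   # True
```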
@@ -2680,7 +2752,7 @@ \subsection{Exercise 08}
 
 Mapping
 
-$\bm{X}_c= \bm{U} \bm{\Sigma} \bm{V}^\mathrm{T} = \bm{F}_c \bm{L}^\mathrm{T}$
+$\bm{X}_c= \bm{U} \bm{\Sigma} \cdot \bm{V}^\mathrm{T} = \bm{F}_c \cdot \bm{L}^\mathrm{T}$
 
 $\bm{F}_c = \bm{X}_c \bm{L} = \bm{X}_c \bm{V} = \bm{U} \bm{\Sigma}$
 
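The column-space PCA mapping can be checked with a few lines of NumPy (a sketch on random, column-mean-free data; names are mine): projecting $\bm{X}_c$ onto the loadings $\bm{L}=\bm{V}$ reproduces $\bm{U}\bm{\Sigma}$.

```python
import numpy as np

rng = np.random.default_rng(2)
Xc = rng.standard_normal((6, 4))
Xc -= Xc.mean(axis=0)              # PCA assumes mean-free columns here

U, s, Vt = np.linalg.svd(Xc, full_matrices=False)
F_c = Xc @ Vt.T                    # scores F_c = X_c V
print(np.allclose(F_c, U * s))     # True: F_c = U Sigma
```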
@@ -2701,9 +2773,9 @@ \subsection{Exercise 08}
 
 Mapping
 
-$\bm{X}_r = \bm{V} \bm{\Sigma} \bm{U}^\mathrm{T} = \bm{L} \bm{F}_r$
+$\bm{X}_r = \bm{V} \cdot \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{T} = \bm{L} \cdot \bm{F}_r$
 
-$\bm{F}_r = \bm{L}^\mathrm{T} \bm{X}_r = \bm{V}^\mathrm{T} \bm{X}_r = \bm{\Sigma} \bm{U}^\mathrm{T}$
+$\bm{F}_r = \bm{L}^\mathrm{T} \bm{X}_r = \bm{V}^\mathrm{T} \bm{X}_r = \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{T} = (\bm{U} \bm{\Sigma})^\mathrm{T}$
 
 PC scores $\bm{F}_r =
 \begin{bmatrix}
@@ -2713,7 +2785,7 @@ \subsection{Exercise 08}
 - \bm{f}_F -
 \end{bmatrix}
 =
-\bm{\Sigma} \bm{U}^\mathrm{T}$
+(\bm{U} \bm{\Sigma})^\mathrm{T}$
 
 PC loadings $\bm{L} = \bm{V} $
 
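Note that since $\bm{F}_r = \bm{\Sigma}^\mathrm{T} \bm{U}^\mathrm{T}$, the factored form is $(\bm{U} \bm{\Sigma})^\mathrm{T}$, not $(\bm{\Sigma} \bm{U})^\mathrm{T}$ as originally committed; the hunks above carry that correction. A companion check for the row-space variant (a sketch; `Xr` is taken as the transpose of the column-arranged toy data from the previous sketch, names are mine):

```python
import numpy as np

rng = np.random.default_rng(2)
Xc = rng.standard_normal((6, 4))
Xc -= Xc.mean(axis=0)
Xr = Xc.T                           # X_r = V Sigma^T U^T

U, s, Vt = np.linalg.svd(Xc, full_matrices=False)
F_r = Vt @ Xr                       # F_r = L^T X_r with L = V
print(np.allclose(F_r, (U * s).T))  # True: F_r = Sigma^T U^T = (U Sigma)^T
```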