\documentclass{article}
\usepackage{array}
\usepackage{graphicx}
\textheight=1.1\textheight

\begin{document}
\title{CSC2515 --- Assignment \#1 Answers}
\author{Behdad Esfahbod\footnote{Not registered in the course yet.  Already
talked to Sam and will
do late-registration after administration problem is resolved.}
\\993505827}
\maketitle

\section{Training/Testing Error Curves (1.5\%)}
The crossing pair of lines are exactly the ones as in graphs below.  In other
words, training set error crosses Bayes error, but test set error does not.
\begin{itemize}
\item Fixed training set size: $$\mbox{\includegraphics{a111.mps}}$$
\item Fixed model complexity: $$\mbox{\includegraphics{a112.mps}}$$
\end{itemize}


\break
\section{Learning Random Boolean Functions (2.5\%)}
\begin{itemize}
\item\textbf{$\mathbf a$:}
\def\N{{\cal N}}
Let $\N$ be the multiset of training cases.
$$
a = E(\sum_{x\in\N}A_x)
  = \sum_{x\in\N}E(A_x)
$$
where $A_x$ is a random variable that is zero if $x\not\in\N$ and 1
otherwise.
$$
A_x = 1-p(x\not\in\N)
    = 1-\prod_{x_0\in\N}p(x\not=x_0)
    = 1-(\frac{2^k-1}{2^k})^N
$$
Putting together:
$$
a = \sum_{x\in\N}E(A_x)
  = 2^k\left(1-(\frac{2^k-1}{2^k})^N\right)
  = \frac{2^k-(2^k-1)^N}{2^{k(N-1)}}
$$


\item\textbf{$\mathbf b$:}
\def\M{{\cal M}}
Let $\M$ be the multiset of test cases.
$$
b = E(\sum_{x\in\M}B_x)
  = \sum_{x\in\M}E(B_x)
$$
where $B_x$ is a random variable that is zero if $x\not\in\N$ and 1
otherwise.
$$
B_x = p(x\in\N)
    = \frac{a}{2^k}
$$
Putting together:
$$
b = \sum_{x\in\M}E(B_x)
  = \frac{Ma}{2^k}
$$


\item\textbf{Lowest error rate:}
$$
   \frac{0*b+\frac12*(M-b)}{M}
 = \frac{M-b}{2M}
$$

\textbf{Algorithm:} If test case has been in training cases, return the
output to one such training case; return 0 otherwise.

\textbf{Argument:} For seen cases, error is zero.  For the rest, they are
independently and uniformly distributed among classes, so no difference
what to decide.


\item\textbf{Generalization:}
Yes.  Because the error rate
$$
\frac{M-b}{2M} = 1-\frac{a}{2^k}
$$
is independent of the test set.
\end{itemize}


\break
\section{Class-Conditional Gaussians (3\%)}
\begin{itemize}
\def\x{\mathbf{x}}
\item
\begin{eqnarray*}
p(y=k|\x) 
& = & \frac{p(\x|y=k)p(y=k)}{p(\x)}
  =   \frac{p(\x|y=k)\alpha_k}{\sum_{k=1}^{K}p(\x|y=k)\alpha_k}
  \\
& = & \frac
{\alpha_k\exp\left\{-\sum_{i=1}^D\frac1{2\sigma_i^2}(x_i-\mu_{ki})^2\right\}}%
{\sum_{k=1}^K\alpha_k\exp\left\{-\sum_{i=1}^D\frac1{2\sigma_i^2}(x_i-\mu_{ki})^2\right\}\alpha_k}%
\end{eqnarray*}


\item
\begin{eqnarray*}
\ell(\theta;D)
& = & \log p(y^1,x^1,y^2,x^2,\dots,y^M,x^M|\theta) \\
& = & \log \prod_{j=1}^M p(y^j,x^j|\theta)
  =   \sum_{j=1}^M \log p(y^j,x^j|\theta)
  =   \sum_{j=1}^M \left(\log p(x^j|y^j,\theta)+\log p(y^j)\right) \\
& = & \sum_{j=1}^M\left[
      -\frac12\sum_{i=1}^D\log(2\pi\sigma_i^2)
      -\sum_{i=1}^D\frac1{2\sigma_i^2}(x^j_i-\mu_{y_ji})^2
      +\log\alpha_{y^j}
      \right]
\end{eqnarray*}


\item
\begin{eqnarray*}
\frac{\partial\ell(\cdot)}{\partial\mu_{ki}}
& = & \frac1{\sigma_i^2}\sum_{j:\thinspace y^j=k}(\mu_k-x^j_i)
\\
\frac{\partial\ell(\cdot)}{\partial\log\sigma_i^2}
& = & \sum_{j=1}^M
      \frac{\partial}{\partial\log\sigma_i^2}
        \left[-\frac12\log\sigma_i^2
	  -\frac12e^{-\log\sigma_i^2}(x^j_i-\mu_{y_ji})^2
        \right]
\\
& = & -\frac M2
      +\frac1{2\sigma_i^2}
      \sum_{j=1}^M (x^j_i-\mu_{y_ji})^2
\\
\frac{\partial\ell(\cdot)}{\partial\alpha_k}
& = & \sum_{j:\thinspace y^j=k}\frac1{\alpha_k}
\end{eqnarray*}


\item
\begin{eqnarray*}
\sum_{j:\thinspace y^j=k}(\mu_k-x^j_i)=0
& \Rightarrow &
\mu_k = \frac{\sum_{j:\thinspace y^j=k}x_i^j}{\sum_{j:\thinspace y^j=k}1}
\\
-\frac M2 +\frac1{2\sigma_i^2} \sum_{j=1}^M (x^j_i-\mu_{y_ji})^2 = 0
& \Rightarrow &
\sigma_i^2 = \frac1M\sum_{j=1}^M(x_i^j-\mu_{y^ji})^2
\end{eqnarray*}

Using Lagrange Multipliers for class priors to sum them up to one:
\begin{eqnarray*}
L(\mathbf{\alpha},\lambda)=\ell(\cdot)+\lambda c(\mathbf{\alpha})\\
c(\mathbf{\alpha}) = 1-\sum_{k=1}^K\alpha_k
\end{eqnarray*}

Means:
\begin{eqnarray*}
\frac{\partial L(\cdot)}{\partial \mathbf{\alpha}}=0
& \Rightarrow &
\lambda + \sum_{j:\thinspace y^j=k}\frac1{\alpha_k} = 0
\\
\frac{\partial L(\cdot)}{\partial \lambda}=0
& \Rightarrow &
1-\sum_{k=1}^K\alpha_k = 0
\end{eqnarray*}

Solving for $\lambda$ first and $\sigma_k$ next gives:
\begin{eqnarray*}
\lambda & = & M\\
\alpha_k & = & \frac{\sum_{j:\thinspace y^j=k}1}{M}
\end{eqnarray*}
and observe that fortunately $\alpha_k$'s are all positive and less than one.
\end{itemize}


\break
\section{Handwritten Digit Classification (11\%)}
The images are in row-major, top to bottom, left to right.

\subsection{$K$-NN Classifier}
I broke ties by getting the first item, i.e.~the one with smallest
index.  This way $k=1$ turned out to be the best $k$ as can be seen
in the graph below:
$$\mbox{\includegraphics[width=\textwidth]{knn_best_k.pdf}}$$

\break
\subsection{Conditional Gaussian Classifier Training}
For the means, I took black for zero, and white for maximum value
over all means, using linear grayscale in between.  For log of
covariances, mapped black to the minimum over all values of the diagonals
and white to the maximum of them, again using linear grayscale
in between.  The image of covariances is a lot less \emph{blurred}
if I draw covariance diagonals themselves instead of the log of them and
map colors linearly.
$$\mbox{\includegraphics[width=\textwidth]{gaussian_plot.pdf}}$$


\subsection{Na\"ive Bayes Classifier Training}
Black represents the minimum over all values of the logs
and white represents the maximum of them, using linear grayscale
in between.
$$\mbox{\includegraphics[width=\textwidth]{naive_plot.pdf}}$$


\break
\subsection{Performance Evaluation}
\begin{table*}[h]
\begin{center}
\begin{tabular}{r|c|c}
& Gaussian-conditional & Na\"ive Bayes \\
\hline
training set & -63.7881 & -12.7559 \\
test set     & -62.3942 & -12.7017 \\
\end{tabular}
\end{center}
\caption{Average conditional log likelihood}
\end{table*}

\def\o{\phantom0}
\begin{table*}[h]
\begin{center}
\begin{tabular}{r|c|c|c}
& $K$-NN & Gaussian-conditional & Na\"ive Bayes \\
\hline
1 &\o1 (0.25\%) &\o0 (0.00\%) & 162 (23.14\%) \\
2 & 11 (2.75\%) &\o5 (1.25\%) & 174 (24.86\%) \\
3 & 21 (5.25\%) & 13 (3.25\%) & 141 (20.14\%) \\
4 & 14 (3.50\%) & 11 (2.75\%) & 147 (21.00\%) \\
5 & 19 (4.75\%) & 17 (4.25\%) & 178 (25.43\%) \\
6 &\o7 (1.75\%) &\o9 (2.25\%) &\o93 (13.29\%) \\
7 & 13 (3.25\%) & 14 (3.50\%) & 157 (22.43\%) \\
8 & 26 (6.50\%) & 21 (5.25\%) & 225 (32.14\%) \\
9 & 11 (2.75\%) & 15 (3.75\%) & 227 (32.43\%) \\
0 &\o2 (0.50\%) &\o4 (1.00\%) &\o77 (11.00\%) \\
\hline
total & 125 (3.125\%)  & 109 (2.725\%) & 1581 (22.59\%) \\
\end{tabular}
\end{center}
\caption{Error cases on test sets}
\end{table*}

\begin{table*}[h]
\begin{center}
\begin{tabular}{r|c|c|c}
& $K$-NN & Gaussian-conditional & Na\"ive Bayes \\
\hline
1 &\o1 (0.14\%) &\o3 (0.43\%) &\o87 (21.75\%) \\
2 & 25 (3.57\%) & 12 (1.71\%) & 106 (26.50\%) \\
3 & 26 (3.71\%) & 11 (1.57\%) &\o91 (22.75\%) \\
4 & 31 (4.43\%) & 19 (2.71\%) &\o85 (21.25\%) \\
5 & 22 (3.14\%) & 13 (1.86\%) & 110 (27.50\%) \\
6 &\o9 (1.29\%) &\o5 (0.71\%) &\o60 (15.00\%) \\
7 & 23 (3.29\%) & 12 (1.71\%) &\o89 (22.25\%) \\
8 & 57 (8.14\%) & 27 (3.86\%) & 122 (30.50\%) \\
9 & 34 (4.86\%) & 22 (3.14\%) & 134 (33.50\%) \\
0 &\o9 (1.29\%) &\o6 (0.86\%) &\o59 (14.75\%) \\
\hline
total & 237 (3.39\%)  & 130 (1.86\%) & 943 (23.58\%)\\
\end{tabular}
\end{center}
\caption{Error cases on training sets}
\end{table*}

%\subsection{Tips}

\end{document}