Models and Estimation II¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings; warnings.simplefilter('ignore', FutureWarning) # Seaborn triggers warnings in scipy
%matplotlib inline

In [2]:
# Configure nice plotting defaults - (this must be done in a cell separate
# from %matplotlib call)
plt.style.use('seaborn')
sns.set_context('talk', font_scale=1.4)
plt.rcParams['figure.figsize'] = (10, 7)


We're also going to use plotly for interactive plots.

In [3]:
import plotly.offline as py
py.init_notebook_mode(connected=True) # True if online -> smaller notebooks without plotly.js

import plotly.graph_objs as go
import plotly.figure_factory as ff



Review of Last Time¶

In [4]:
data = [5, 7, 8, 9]

In [5]:
def abs_loss(est, y_obs):
return np.abs(y_obs - est)

In [6]:
def squared_loss(est, y_obs):
return (est - y_obs)**2

In [7]:
def avg_absolute_loss(est, data):
return np.mean(np.array([abs_loss(est, y_obs) for y_obs in data]), axis=0)

In [8]:
def avg_squared_loss(est, data):
return np.mean(np.array([squared_loss(est, y_obs) for y_obs in data]), axis=0)

In [9]:
avg_absolute_loss(8, data)

Out[9]:
1.25
In [10]:
avg_squared_loss(8, data)

Out[10]:
2.75
In [11]:
thetas = np.linspace(0, 10, 200)
loss = avg_absolute_loss(thetas, data)
plt.plot(thetas, loss, label="L1 loss")
plt.vlines(data, -3, -2, colors="r", linewidth=0.8, label="Observations")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend()
plt.savefig("l9_abs_loss.png", dpi=300)

In [12]:
thetas = np.linspace(0, 10, 200)
loss = avg_squared_loss(thetas, data)
plt.plot(thetas, loss, label = "L2 loss")
plt.vlines(data, -5, -2, colors="r", linewidth=0.8, label="Observations")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend()
plt.savefig("l9_squared_loss.png", dpi=300)


For the L2 loss, a single outlier can cause huge changes in the optimal theta.

In [13]:
data2 = [5, 7, 8, 9, 100]

In [14]:
np.mean(data2)

Out[14]:
25.8
In [15]:
thetas = np.linspace(0, 100, 200)
loss = avg_squared_loss(thetas, data2)
plt.plot(thetas, loss, label="L2 Loss")
plt.vlines(data2, 0, 500, colors="r", linewidth=2, label="Observations")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend()
plt.savefig("l9_squared_loss_outlier.png", dpi=300)

In [16]:
thetas = np.linspace(0, 100, 200)
loss = avg_absolute_loss(thetas, data2)
plt.plot(thetas, loss, label="L1 Loss")
plt.vlines(data2, 0, 5, colors="r", linewidth=2, label="Observations")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend()
plt.savefig("l9_absolute_loss_outlier.png", dpi=300)

In [17]:
def huber_loss(est, y_obs, alpha = 1):
d = abs_loss(est, y_obs)
return np.where(d < alpha,
squared_loss(est, y_obs) / 2.0,
alpha * (d - alpha / 2.0))

In [18]:
def avg_huber_loss(est, data, alpha = 1):
return np.mean(np.array([huber_loss(est, y_obs, alpha) for y_obs in data]), axis=0)

In [19]:
thetas = np.linspace(0, 10, 200)
loss = avg_huber_loss(thetas, data, 1)
plt.plot(thetas, loss, label="Huber Loss")
plt.vlines(data, -5, -2, colors="r", linewidth=0.8, label="Observations")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend()
#print("Minimizing Theta", thetas[np.argmin(loss)])
plt.savefig("l9_alpha1_huber_loss.png", dpi=300)

In [20]:
thetas = np.linspace(0, 10, 200)
loss = avg_huber_loss(thetas, data, 0.1)
plt.plot(thetas, loss, label="Huber Loss")
plt.vlines(data, -0.5, 0, colors="r", linewidth=0.4, label="Observations")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend()
#print("Minimizing Theta", thetas[np.argmin(loss)])
plt.savefig("l9_alphapt1_huber_loss.png", dpi=300)


Minimizing the Average Huber Loss Analytically¶

Recall that the Huber loss has the form:

$$\Large f(\theta) =\frac{1}{n}\sum_{i=1}^n \begin{cases} \frac{1}{2}\left( y_i - \theta \right)^2 & \left| y_i - \theta \right| < \alpha \\ \alpha \left(\left| y_i - \theta \right| - \frac{\alpha}{2} \right) & \text{otherwise} \end{cases}$$

Taking the derivative we get:

$$\Large \frac{\partial}{\partial \theta} f(\theta) =\frac{1}{n}\sum_{i=1}^n \begin{cases} -\left(y_i - \theta \right) & \left| \theta - y_i \right| < \alpha \\ -\alpha \, \textbf{sign}(y_i - \theta) & \text{otherwise} \end{cases}$$

Unfortunately this is difficult to solve analytically. However we can get a plot of what the derivative looks like

Derivative at a single point¶

In [21]:
alpha =1.0
f = lambda theta: huber_loss(theta, 3, alpha = alpha)

def huber_loss_derivative_single(est, y_obs, alpha=1):
d = abs_loss(est, y_obs)
return np.where(d < alpha,
est - y_obs,
alpha * np.sign(est-y_obs))

df = lambda theta: huber_loss_derivative_single(theta, 3.0, alpha=alpha)
thetas = np.linspace(1, 5, 1000)
plt.plot(thetas, f(thetas), 'k', label="f")
plt.plot(thetas, df(thetas), '.', label=r"$\partial f / \partial \theta$")
# plt.plot(thetas, ddf(thetas), '--', label=r"$\partial^2 f / \partial \theta^2$")
plt.plot([3],[0], 'ro', label="Minimizer")
plt.xlabel(r'Choice for $\theta$')
plt.legend();


Derivative on 4 test points.¶

For this example, we'll use datasets [5, 10, 30, 40], and [5, 10, 20, 30, 40]. Notice that for the default $\alpha=1$ alpha value the curves look similar to the absolute loss but with smooth corners.

In [22]:
def huber_loss_derivative(est, data, alpha = 1):
return np.mean(huber_loss_derivative_single(est, data, alpha))

thetas = np.linspace(2.5, 12.5, 200)
alpha = 1
plt.plot(thetas, avg_huber_loss(thetas, data, alpha), '-k', label="Huber Loss")
plt.plot(thetas, [huber_loss_derivative(u, data, alpha) for u in thetas], '.', label='derivative')
plt.vlines(data, -3, -2, colors="r", label="Observations")
plt.xlabel(r'Choice for $\theta$')
print("Minimizing Theta", thetas[np.argmin(avg_huber_loss(thetas, data, alpha))])
plt.legend(loc=9)
plt.savefig('l9_huber_4_pt_large_alpha.png', dpi=300)

Minimizing Theta 7.474874371859297

In [23]:
thetas = np.linspace(2.5, 12.5, 200)
alpha = 0.2
plt.plot(thetas, avg_huber_loss(thetas, data, alpha), '-k', label="Huber Loss")
plt.plot(thetas, [huber_loss_derivative(u, data, alpha) for u in thetas], '.', label='derivative')
plt.vlines(data, -0.5, -0.25, colors="r", label="Observations")
plt.xlabel(r'Choice for $\theta$')
plt.legend(loc=9)
#plt.savefig('l9_huber_4_pt_small_alpha.png', dpi=300)

Out[23]:
<matplotlib.legend.Legend at 0x11b85d160>

Maybe add another example with 5 test points to show that solution is unique for arbitrarily small alpha.

Numerical Minimization¶

Brute Force¶

We can also attempt to minimize our loss functions numerically instead of graphically. A very slow and terrible way would be manual guess-and-check.

In [24]:
data = np.array([5, 7, 8, 9])

In [25]:
avg_squared_loss(4, data)

Out[25]:
12.75
In [26]:
avg_squared_loss(7, data)

Out[26]:
2.25

A somewhat better approach is to use brute force to try out a bunch of thetas and return the one that yields the lowest loss.

In [27]:
def simple_minimize(loss_fn, observations, thetas):
losses = [loss_fn(theta, observations) for theta in thetas]
return thetas[np.argmin(losses)]

In [28]:
simple_minimize(avg_squared_loss, data, np.linspace(0, 10, 20))

Out[28]:
7.368421052631579

Visually, what we're doing is computing all the starred values below and then returning the $\theta$ that goes with the minimum value.

In [29]:
thetas = np.linspace(0, 10, 200)
sparse_thetas = np.linspace(0, 10, 20)

loss = avg_squared_loss(thetas, data)
sparse_loss = avg_squared_loss(sparse_thetas, data)

plt.plot(thetas, loss, label = "L2 loss")
plt.plot(sparse_thetas, sparse_loss, 'r*', label = "L2 loss")
plt.vlines(data, -5, -2, colors="r", linewidth=0.8, label="Observations")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend()
plt.savefig("l9_brute_force.png", dpi=300)


From our calculus based approach above, we know the actual answer is simply the mean of the data.

In [30]:
np.mean(data)

Out[30]:
7.25

Note that simple_minimize off by a bit, and this is simply because the exact value 7.25 wasn't one of the guesses we tried.

We can repeat this same process to minimize the huber loss for our data set.

In [31]:
avg_huber_alpha_one = lambda theta, data: avg_huber_loss(theta, data, 1)
simple_minimize(avg_huber_alpha_one, data, np.linspace(0, 10, 10000))

Out[31]:
7.4997499749975

Or visually:

In [32]:
thetas = np.linspace(0, 10, 200)
sparse_thetas = np.linspace(0, 10, 20)

loss = avg_huber_alpha_one(thetas, data)
sparse_loss = avg_huber_alpha_one(sparse_thetas, data)

plt.plot(thetas, loss, label = "L2 loss")
plt.plot(sparse_thetas, sparse_loss, 'r*', label = "L2 loss")
plt.vlines(data, -2, -1, colors="r", linewidth=0.8, label="Observations")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend()
plt.savefig("l9_brute_force.png", dpi=300)


This basic approach is incredibly inefficient, and suffers from two major flaws:

1. If the minimum is outside our range of guesses, the answer will be completely wrong.
2. Even if our range of gueseses is correct, if the guesses are too coarse, our answer will be inaccurate.
In [33]:
simple_minimize(avg_huber_alpha_one, data, np.linspace(0, 3, 10000))

Out[33]:
3.0
In [34]:
simple_minimize(avg_huber_alpha_one, data, np.linspace(0, 1000, 100))

Out[34]:
10.1010101010101

Instead of choosing all of our guesses ahead of time, we can instead start from a single guess and try to iteratively improve on our loss.

They key insight is this: Given that our loss functions are convex, we know that the derivative of the loss is always negative to the left of the solution, and always positive to the right of the solution.

Thus, the derivative tells us which way to go.

In [35]:
thetas = np.linspace(2.5, 12.5, 200)
alpha = 1
plt.plot(thetas, avg_huber_loss(thetas, data, alpha), '-k', label="Huber Loss")
plt.plot(thetas, [huber_loss_derivative(u, data, alpha) for u in thetas], '.', label='derivative')
plt.vlines(data, -3, -2, colors="r", label="Observations")
plt.xlabel(r'Choice for $\theta$')
print("Minimizing Theta", thetas[np.argmin(avg_huber_loss(thetas, data, alpha))])
plt.legend(loc=9)

Minimizing Theta 7.474874371859297

Out[35]:
<matplotlib.legend.Legend at 0x11c122f60>
In [36]:
huber_loss_derivative(0, data, 1)

Out[36]:
-1.0

Since the derivative above is negative, we should go to the right.

In [37]:
huber_loss_derivative(4, data, 1)

Out[37]:
-1.0
In [38]:
huber_loss_derivative(6, data, 1)

Out[38]:
-0.5
In [39]:
huber_loss_derivative(9, data, 1)

Out[39]:
0.75
In [40]:
huber_loss_derivative(8, data, 1)

Out[40]:
0.25
In [41]:
huber_loss_derivative(7.5, data, 1)

Out[41]:
0.0

Gradient descent is a more intelligent process than the one we did above. In particular, gradient descent creates its next guess based not only on the sign of the slope, but also its magnitude. Let's try the most naive possible approach and simply add the slope to our current guess, i.e.

$$\theta^{(t+1)} = \theta^{(t)} - \frac{\partial}{\partial \theta} L(\theta^{(t)}, \textbf{y})$$

Where $\theta^{(t)}$ is the current estimate and $\theta^{(t+1)}$ is the next estimate.

In [42]:
huber_loss_derivative(5, data, 1)

Out[42]:
-0.75
In [43]:
huber_loss_derivative(5.75, data, 1)

Out[43]:
-0.5625
In [44]:
huber_loss_derivative(6.3125, data, 1)

Out[44]:
-0.421875
In [45]:
huber_loss_derivative(6.734375, data, 1)

Out[45]:
-0.31640625
In [46]:
huber_loss_derivative(7.05078125, data, 1)

Out[46]:
-0.224609375
In [47]:
huber_loss_derivative(7.275390625, data, 1)

Out[47]:
-0.1123046875
In [48]:
huber_loss_derivative(7.3876953125, data, 1)

Out[48]:
-0.05615234375
In [49]:
huber_loss_derivative(7.44384765625, data, 1)

Out[49]:
-0.028076171875
In [50]:
huber_loss_derivative(7.471923828125, data, 1)

Out[50]:
-0.0140380859375
In [51]:
def gradient_descent_v1(df, initial_guess, n):
guesses = [initial_guess]
guess = initial_guess
while len(guesses) < n:
guess = guess - df(guess)
guesses.append(guess)
return np.array(guesses)

In [52]:
hld_example = lambda theta: huber_loss_derivative(theta, data, 1)

In [53]:
gradient_descent_v1(hld_example, 5, 50)

Out[53]:
array([5.        , 5.75      , 6.3125    , 6.734375  , 7.05078125,
7.27539062, 7.38769531, 7.44384766, 7.47192383, 7.48596191,
7.49298096, 7.49649048, 7.49824524, 7.49912262, 7.49956131,
7.49978065, 7.49989033, 7.49994516, 7.49997258, 7.49998629,
7.49999315, 7.49999657, 7.49999829, 7.49999914, 7.49999957,
7.49999979, 7.49999989, 7.49999995, 7.49999997, 7.49999999,
7.49999999, 7.5       , 7.5       , 7.5       , 7.5       ,
7.5       , 7.5       , 7.5       , 7.5       , 7.5       ,
7.5       , 7.5       , 7.5       , 7.5       , 7.5       ,
7.5       , 7.5       , 7.5       , 7.5       , 7.5       ])
In [54]:
def squared_loss_derivative_single(est, y_obs):
return 2*(est - y_obs)

def squared_loss_derivative(est, data):
return np.mean(squared_loss_derivative_single(est, data))

In [55]:
thetas = np.linspace(0, 15, 200)
plt.plot(thetas, avg_squared_loss(thetas, data), '-k', label="Squared Loss")
plt.plot(thetas, [squared_loss_derivative(u, data) for u in thetas], '.', label='derivative')
plt.vlines(data, -15, -8, colors="r", label="Observations")
plt.xlabel(r'Choice for $\theta$')
plt.legend(loc=9)
plt.savefig("l9_squared_loss_derivative.png", dpi=300)

In [56]:
squared_loss_derivative(0, data)

Out[56]:
-14.5
In [57]:
squared_loss_derivative(14.5, data)

Out[57]:
14.5
In [58]:
sld_example = lambda theta: squared_loss_derivative(theta, data)

Out[58]:
array([ 0. , 14.5,  0. , 14.5,  0. , 14.5,  0. , 14.5,  0. , 14.5])
In [59]:
def gradient_descent_v2(df, initial_guess, n, alpha):
guesses = [initial_guess]
guess = initial_guess
while len(guesses) < n:
#print(alpha * df(guess))
guess = guess - alpha * df(guess)
guesses.append(guess)
return np.array(guesses)

In [60]:
sld_example = lambda theta: squared_loss_derivative(theta, data)

Out[60]:
array([0.        , 5.8       , 6.96      , 7.192     , 7.2384    ,
7.24768   , 7.249536  , 7.2499072 , 7.24998144, 7.24999629])

Note that we have not discussed how to decide when gradient descent is done. We'll leave this for you to learn a later time.

Function optimization is such a common problem that tools have been written to do all this work for us. For example, scipy provides us with the relatively user friendly scipy.optimize.minimize function.

All we have to do is provide the function, the deriviative of the function, and a starting guess, and it will use the gradient to find the solution. Note, the actual algorithm used by minimize is something more sophisticated than gradient descent called BFGS that we will not discuss in our course.

In [61]:
def squared_loss_derivative(ests, data):
return np.array([np.mean(squared_loss_derivative_single(est, data)) for est in ests])

from scipy.optimize import minimize
data = np.array([5, 7, 8, 9])
f = lambda theta: avg_squared_loss(theta, data)
df = lambda theta: squared_loss_derivative(theta, data)
minimize(f, 0.0, jac=df)

Out[61]:
      fun: 2.1875
hess_inv: array([[0.5]])
jac: array([0.])
message: 'Optimization terminated successfully.'
nfev: 4
nit: 3
njev: 4
status: 0
success: True
x: array([7.25])

Strictly speaking, minimize doesn't need the derivative, though you're likely to get better and faster results if you provide it.

In [62]:
minimize(f, 0.0)

Out[62]:
      fun: 2.187500000000008
hess_inv: array([[0.49999999]])
jac: array([-1.78813934e-07])
message: 'Optimization terminated successfully.'
nfev: 12
nit: 3
njev: 4
status: 0
success: True
x: array([7.24999991])

Multi Dimensional Models¶

In [63]:
#tips datset
data['pcttip'] = data['tip']/data['total_bill'] * 100

Out[63]:
total_bill tip sex smoker day time size pcttip
0 16.99 1.01 Female No Sun Dinner 2 5.944673
1 10.34 1.66 Male No Sun Dinner 3 16.054159
2 21.01 3.50 Male No Sun Dinner 3 16.658734
3 23.68 3.31 Male No Sun Dinner 2 13.978041
4 24.59 3.61 Female No Sun Dinner 4 14.680765

Improving the Model¶

So far we have taken a very simple model that the percentage tip is constant:

$$\Large \text{percentage tip} = \theta$$

We then defined several loss functions and used those to estimate the value of $\theta$. How can we improve this model? Recall that we do have additional information:

In [64]:
data.head()

Out[64]:
total_bill tip sex smoker day time size pcttip
0 16.99 1.01 Female No Sun Dinner 2 5.944673
1 10.34 1.66 Male No Sun Dinner 3 16.054159
2 21.01 3.50 Male No Sun Dinner 3 16.658734
3 23.68 3.31 Male No Sun Dinner 2 13.978041
4 24.59 3.61 Female No Sun Dinner 4 14.680765

One idea is to model the tip as having a linear dependence on the total bill size.

$\texttt{percentage tip} = \theta_1 + \theta_2 * \texttt{total bill}$

This idea is supported by the plot below.

In [65]:
sns.lmplot(x = "total_bill", y = "pcttip", data = data, aspect=1.6)
plt.savefig("ptip_total_bill.pdf")

In [66]:
# We can implement this model as below
def f(theta, total_bill):
return theta[0] + theta[1] * total_bill

In [67]:
def squared_loss(est, y_obs):
return (est - y_obs)**2

def abs_loss(est, y_obs):
return np.abs(y_obs - est)

def huber_loss(est, y_obs, alpha = 1):
d = abs_loss(est, y_obs)
return np.where(d < alpha,
squared_loss(est, y_obs) / 2.0,
alpha * (d - alpha / 2.0))

#We can define our l1, l2, and huber loss functions on tips as below.
def l1_tips(theta):
return np.mean(abs_loss(f(theta, data['total_bill']), data['pcttip']).values)

def l2_tips(theta):
return np.mean(squared_loss(f(theta, data['total_bill']), data['pcttip']).values)

def huber_tips(theta):
return np.mean(huber_loss(f(theta, data['total_bill']), data['pcttip']))
#minimize(l1, np.array([0.0,0.0]))
#minimize(l2, np.array([0.0,0.0]))
#minimize(huber, np.array([0.0,0.0]))

In [68]:
# Make range of values for thetas
uvalues = np.linspace(16,22,70)
vvalues = np.linspace(-.5,0,70)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))

# res = {}

#replace first line with this to see all 3 loss functions
#for loss_name, loss in [("L1", l1_tips), ("L2", l2_tips), ("Huber", huber_tips)]:

for loss_name, loss in [("Huber", huber_tips)]:
loss_values = np.array([loss(t) for t in thetas.T])

loss_surface = go.Surface(name=loss_name,
x=u, y=v, z=np.reshape(loss_values, u.shape),
contours=dict(z=dict(show=True, color="gray", project=dict(z=True)))
)

scene=go.Scene(
xaxis=go.XAxis(title='w0'),
yaxis=go.YAxis(title='w1'),
aspectratio=dict(x=2.,y=2., z=1.),
#         camera=dict(eye=dict(x=-2, y=-2, z=2))
)

layout = go.Layout(
autosize=True,
width=800,
height=600,
scene = scene
)
ind = np.argmin(loss_values)
optimal_point = go.Scatter3d(name = "Optimal Point for " + loss_name,
x = [thetas.T[ind,0]], y = [thetas.T[ind,1]],
z = [loss_values[ind]],
marker=dict(size=10, color="red"))
py.iplot(go.Figure(data=[loss_surface, optimal_point], layout = layout))