ex5-bias vs variance

Andrew Ng Machine Learning, Exercise 5: bias vs. variance

Data for the exercise

The file ex5data1.mat stores data on the outflow of water from a dam, split into three parts:

  • Training set: X, y
  • Cross-validation set: Xval, yval
  • Test set: Xtest, ytest

Required imports:

import numpy as np
import scipy.io as sio
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Data preprocessing

Plot a scatter plot of the training set and add a column of ones (the intercept term) to each feature matrix.

def load_data():
    d = sio.loadmat('./data/ex5data1.mat')
    return map(np.ravel, [d['X'], d['y'], d['Xval'], d['yval'], d['Xtest'], d['ytest']])


X, y, Xval, yval, Xtest, ytest = load_data()
df = pd.DataFrame({'water_level': X, 'flow': y})
print(df.shape)
sns.lmplot(x='water_level', y='flow', data=df, fit_reg=False)
plt.show()

# add the intercept column of ones to each feature matrix
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]

Figure: scatter plot of the training data (water_level vs. flow).

Regularization

The (unregularized) cost function of linear regression is:

$J\left( \theta \right)=\frac{1}{2m}\sum\limits_{i=1}^{m}{{{({h_\theta}({{x}^{(i)}})-{{y}^{(i)}})}^{2}}}$

Gradient descent update:
${\theta_{j}}:={\theta_{j}}-\alpha \frac{\partial }{\partial {\theta_{j}}}J\left(\theta \right)$

The cost function of regularized linear regression is:
$J\left( \theta \right)=\frac{1}{2m}\left[ \sum\limits_{i=1}^{m}{{{({h_\theta}({{x}^{(i)}})-{{y}^{(i)}})}^{2}}}+\lambda \sum\limits_{j=1}^{n}{\theta _{j}^{2}} \right]$

To minimize this cost function with gradient descent, note that $\theta_0$ is not regularized, so the update rule splits into two cases:

Repeat until convergence {

${\theta_0}:={\theta_0}-\alpha \frac{1}{m}\sum\limits_{i=1}^{m}{({h_\theta}({{x}^{(i)}})-{{y}^{(i)}})x_{0}^{(i)}}$

${\theta_j}:={\theta_j}-\alpha \left[ \frac{1}{m}\sum\limits_{i=1}^{m}{({h_\theta}({{x}^{(i)}})-{{y}^{(i)}})x_{j}^{(i)}}+\frac{\lambda }{m}{\theta_j} \right]$ for $j=1,2,\ldots ,n$

}
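The code below minimizes this cost with scipy's optimizer rather than hand-rolled gradient descent, but for reference, a minimal sketch of one vectorized batch update as written above could look like this (the function name and the learning rate alpha are illustrative, not part of the exercise):

def gradient_descent_step(theta, X, y, alpha=0.001, l=0.0):
    """One regularized batch gradient-descent update.
    X is (m, n+1) with a leading column of ones; theta is (n+1,).
    The intercept theta[0] is excluded from the regularization term."""
    m = X.shape[0]
    error = X @ theta - y            # (m,)
    grad = X.T @ error / m           # unregularized gradient, (n+1,)
    reg = (l / m) * theta            # regularization term
    reg[0] = 0                       # do not regularize the intercept
    return theta - alpha * (grad + reg)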

# cost function
def cost(theta, X, y):
    m = X.shape[0]
    inner = X @ theta - y  # (m,)
    # inner.T @ inner is the sum of squared errors
    return inner.T @ inner / (2 * m)


theta = np.ones(X.shape[1])
print(cost(theta, X, y))
# 303.9515255535976


# gradient
def gradient(theta, X, y):
    m = X.shape[0]
    return X.T @ (X @ theta - y) / m  # (n+1, m) @ (m,) -> (n+1,)


print(gradient(theta, X, y))
# [-15.30301567 598.16741084]


# regularized cost and gradient
def regularized_cost(theta, X, y, l=1):
    return cost(theta, X, y) + (l / (2 * X.shape[0])) * np.power(theta[1:], 2).sum()


def regularized_gradient(theta, X, y, l=1):
    m = X.shape[0]
    regularized_term = theta.copy()
    regularized_term[0] = 0  # do not regularize the intercept term
    regularized_term = (l / m) * regularized_term
    return gradient(theta, X, y) + regularized_term


print(regularized_gradient(theta, X, y, l=1))
# [-15.30301567 598.25074417]
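Not part of the exercise, but a quick way to sanity-check the analytic gradient is to compare it against a numerical finite-difference estimate; a minimal sketch (the helper name numerical_gradient is mine):

def numerical_gradient(cost_fn, theta, X, y, l=1, eps=1e-4):
    """Central-difference estimate of the gradient of cost_fn at theta."""
    grad = np.zeros_like(theta, dtype=float)
    for j in range(len(theta)):
        step = np.zeros_like(theta, dtype=float)
        step[j] = eps
        grad[j] = (cost_fn(theta + step, X, y, l) - cost_fn(theta - step, X, y, l)) / (2 * eps)
    return grad


# should be very close to regularized_gradient(theta, X, y, l=1)
print(numerical_gradient(regularized_cost, theta, X, y, l=1))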

Fitting the model

Train with the regularization term set to $\lambda=0$.

def linear_regression_np(theta, X, y, l=1):
    # minimize the regularized cost with scipy's truncated-Newton optimizer
    res = opt.fmin_tnc(func=regularized_cost, x0=theta, fprime=regularized_gradient, args=(X, y, l))
    return res


final_theta = linear_regression_np(theta, X, y, l=0)[0]
b = final_theta[0]  # intercept
m = final_theta[1]  # slope

plt.scatter(X[:, 1], y, label="Training data")
plt.plot(X[:, 1], X[:, 1] * m + b, label='Prediction')
plt.legend(loc=2)
plt.show()

Figure: fitted regression line over the training data.
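As a cross-check (not part of the exercise), regularized linear regression also has a closed-form solution via the normal equation, which should agree with the optimizer's result; a minimal sketch, with the intercept excluded from the penalty:

def normal_equation(X, y, l=0):
    """Closed-form solution theta = (X^T X + l * L)^(-1) X^T y,
    where L is the identity matrix with the intercept entry zeroed."""
    n = X.shape[1]
    L = np.eye(n)
    L[0, 0] = 0  # do not penalize the intercept
    return np.linalg.solve(X.T @ X + l * L, X.T @ y)


print(normal_equation(X, y, l=0))  # should be close to final_theta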

Learning curves

def plot_learning_curve(X, y, Xval, yval, l=0):
    # record the training cost and the cross-validation cost for each training-set size
    training_cost, cv_cost = [], []
    m = X.shape[0]
    for i in range(1, m + 1):
        theta = np.ones(X.shape[1])
        res = linear_regression_np(theta, X[:i, :], y[:i], l=l)

        # evaluate both costs without the regularization term
        tc = regularized_cost(res[0], X[:i, :], y[:i], l=0)
        cv = regularized_cost(res[0], Xval, yval, l=0)

        training_cost.append(tc)
        cv_cost.append(cv)

    plt.plot(np.arange(1, m + 1), training_cost, label='training cost')
    plt.plot(np.arange(1, m + 1), cv_cost, label='cv cost')
    plt.legend(loc=1)


plot_learning_curve(X, y, Xval, yval, l=0)
plt.show()

Figure: learning curve of the linear model (λ = 0).

The learning curve shows both costs leveling off at a high value: the linear model underfits (high bias). The model is clearly not good enough, so we try polynomial features instead.

Polynomial features

Expand the feature up to the 8th power, then normalize the feature values.

def poly_features(x, power, as_ndarray=False):
    data = {'f{}'.format(i): np.power(x, i) for i in range(1, power + 1)}
    df = pd.DataFrame(data)
    return df.values if as_ndarray else df


# normalize features: subtract the mean, divide by the standard deviation
def normalize_feature(df):
    """Applies function along input axis(default 0) of DataFrame."""
    return df.apply(lambda column: (column - column.mean()) / column.std())


def prepare_poly_data(*args, power):
    """
    args: keep feeding in X, Xval, or Xtest
    will return in the same order
    """
    def prepare(x):
        # expand feature
        df = poly_features(x, power=power)

        # normalization
        ndarr = normalize_feature(df).values

        # add intercept term
        return np.insert(ndarr, 0, np.ones(ndarr.shape[0]), axis=1)

    return [prepare(x) for x in args]
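A quick usage example to make the expansion concrete (the toy input array is mine, not from the dataset):

# expand [1, 2, 3] up to the 3rd power; expected output (formatting may differ):
print(poly_features(np.array([1, 2, 3]), power=3))
#    f1  f2  f3
# 0   1   1   1
# 1   2   4   8
# 2   3   9  27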

Try different values of λ and observe the learning curves.

X, y, Xval, yval, Xtest, ytest = load_data()
X_poly, Xval_poly, Xtest_poly = prepare_poly_data(X, Xval, Xtest, power=8)

plot_learning_curve(X_poly, y, Xval_poly, yval, l=0)
plt.show()
plot_learning_curve(X_poly, y, Xval_poly, yval, l=1)
plt.show()
plot_learning_curve(X_poly, y, Xval_poly, yval, l=100)
plt.show()

With λ = 0, i.e. no regularization, the training cost stays close to zero, which is too good to be true: the model is overfitting.

Figure: learning curve with λ = 0 (overfitting).

With λ = 1, the training cost rises a little and is no longer zero; the overfitting is somewhat reduced.

Figure: learning curve with λ = 1.

With λ = 100, there is too much regularization and the model underfits.

Figure: learning curve with λ = 100 (underfitting).

Finding the optimal λ

28
# search for the best-fitting lambda
l_candidate = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
training_cost, cv_cost = [], []
for l in l_candidate:
    theta = np.ones(X_poly.shape[1])
    theta = linear_regression_np(theta, X_poly, y, l)[0]

    tc = cost(theta, X_poly, y)
    cv = cost(theta, Xval_poly, yval)
    training_cost.append(tc)
    cv_cost.append(cv)

plt.plot(l_candidate, training_cost, label='training')
plt.plot(l_candidate, cv_cost, label='cross validation')
plt.legend(loc=2)
plt.xlabel('lambda')
plt.ylabel('cost')
plt.show()


# best lambda according to the cross-validation cost
print(l_candidate[np.argmin(cv_cost)])

# use test data to compute the cost
for l in l_candidate:
    theta = np.ones(X_poly.shape[1])
    theta = linear_regression_np(theta, X_poly, y, l)[0]
    print('test cost(l={}) = {}'.format(l, cost(theta, Xtest_poly, ytest)))

Figure: training and cross-validation cost as a function of λ.

test cost(l=0) = 9.799399498688892
test cost(l=0.001) = 11.054987989655938
test cost(l=0.003) = 11.249198861537238
test cost(l=0.01) = 10.879605199670008
test cost(l=0.03) = 10.022734920552129
test cost(l=0.1) = 8.632060998872074
test cost(l=0.3) = 7.336602384055533
test cost(l=1) = 7.46630349664086
test cost(l=3) = 11.643928200535115
test cost(l=10) = 27.715080216719304

After tuning, λ = 0.3 is the best choice: it gives the smallest test cost.
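To see what that model looks like, one could refit on the training set with λ = 0.3 and plot the fitted curve over the data. This is only a sketch: the plotting grid and variable names are mine, and the grid is normalized on its own statistics rather than the training statistics, which is an approximation.

# refit with the chosen lambda and plot the polynomial fit
best_l = 0.3
theta = linear_regression_np(np.ones(X_poly.shape[1]), X_poly, y, best_l)[0]

grid = np.linspace(X.min() - 5, X.max() + 5, 200)
grid_poly = prepare_poly_data(grid, power=8)[0]  # expand + normalize + intercept

plt.scatter(X, y, label='Training data')
plt.plot(grid, grid_poly @ theta, label='Prediction (lambda=0.3)')
plt.legend(loc=2)
plt.show()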
