Theory tree
1. Logistic regression model: the logistic distribution, the binomial logistic regression model, and the multinomial logistic regression model.
2. Model parameter estimation: via maximization of the log-likelihood function.
3. The maximum entropy principle behind the maximum entropy model: the definition of information entropy.
4. Definition of the maximum entropy model: the information entropy weighted by the empirical sample distribution (formulas sketched after this list).
5. Learning the maximum entropy model: a constrained optimization problem, converted into its dual problem.
   The Lagrange multiplier method: solve for P first, then for w. The dual objective is essentially equivalent to the log-likelihood.
6. Optimization algorithms for the maximum entropy model: improved iterative scaling (IIS) or quasi-Newton methods.
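To make items 4-6 concrete, here is a sketch of the model and the objective it minimizes, in the book's standard notation (f_i are the feature functions, \tilde{P} the empirical distributions):

$$P_w(y \mid x) = \frac{\exp\left(\sum_i w_i f_i(x,y)\right)}{Z_w(x)}, \qquad Z_w(x) = \sum_y \exp\left(\sum_i w_i f_i(x,y)\right)$$

$$f(w) = \sum_x \tilde{P}(x) \log Z_w(x) - \sum_{x,y} \tilde{P}(x,y) \sum_i w_i f_i(x,y)$$

Minimizing f(w) is equivalent to maximizing the log-likelihood of P_w, which is why item 5 says the dual is essentially the likelihood.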
Exercises
6.2 Write out the gradient descent algorithm for learning the logistic regression model.
Answer:
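Before the code, the two quantities it implements: with the bias absorbed into w by appending a constant 1 to each input x, the log-likelihood and its gradient are

$$L(w) = \sum_{i=1}^{N} \left[ y_i \, (w \cdot x_i) - \log\left(1 + e^{w \cdot x_i}\right) \right]$$

$$\nabla L(w) = \sum_{i=1}^{N} x_i \left( y_i - \sigma(w \cdot x_i) \right), \qquad \sigma(z) = \frac{1}{1 + e^{-z}}$$

like_func and grad below compute exactly these; the update steps along the gradient with a step size found by a 1-D line search (fminbound) instead of a fixed learning rate.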
Imports:
import numpy as np
from scipy.optimize import fminbound  # 1-D line search for the step size
Implementing the book's formulas in an sklearn-like style:
class MyLogisticRegression:
    def __init__(self, max_iter=10000, distance=3, epsilon=1e-6):
        self.max_iter = max_iter  # maximum number of iterations
        self.epsilon = epsilon    # convergence threshold
        self.w = None             # learned weights (bias is the last entry)
        self.distance = distance  # search interval for the 1-D line search
        self._X = None
        self._y = None

    @staticmethod
    def preprocessing(X):
        """Append a constant column of 1s so the bias is absorbed into w."""
        row = X.shape[0]
        ones = np.ones(row).reshape(row, 1)
        return np.hstack((X, ones))

    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def grad(self, w):
        """Gradient of the log-likelihood: sum_i x_i * (y_i - sigmoid(w.x_i))."""
        z = np.dot(self._X, w.T)
        grad = self._X * (self._y - self.sigmoid(z))
        return grad.sum(axis=0)

    def like_func(self, w):
        """Log-likelihood L(w) = sum_i [y_i * (w.x_i) - log(1 + exp(w.x_i))]."""
        z = np.dot(self._X, w.T)
        f = self._y * z - np.log(1 + np.exp(z))
        return np.sum(f)

    def fit(self, data_x, data_y):
        self._X = self.preprocessing(data_x)
        self._y = data_y.T
        # np.float was removed from NumPy; use the builtin float instead
        w = np.array([[0] * self._X.shape[1]], dtype=float)
        k = 0
        fw = self.like_func(w)
        for _ in range(self.max_iter):
            grad = self.grad(w)
            # stop if the gradient is already small enough
            if (np.linalg.norm(grad, axis=0, keepdims=True) < self.epsilon).all():
                break

            # 1-D line search: pick the step that maximizes the likelihood
            def f(x):
                z = w - x * grad
                return -self.like_func(z)

            _lambda = fminbound(f, -self.distance, self.distance)
            w1 = w - _lambda * grad
            fw1 = self.like_func(w1)
            # stop if neither the objective nor the weights change noticeably
            if abs(fw1 - fw) < self.epsilon or \
                    (np.linalg.norm(w1 - w, axis=0, keepdims=True) < self.epsilon).all():
                w = w1
                break
            k += 1
            w, fw = w1, fw1
        self.w = w
        self.grad_ = grad
        self.n_iter_ = k
        self.coef_ = self.w[0][:-1]
        self.intercept_ = self.w[0][-1]

    def predict(self, x):
        p = self.sigmoid(np.dot(self.preprocessing(x), self.w.T))
        # threshold the probabilities at 0.5
        return np.where(p >= 0.5, 1, 0)

    def score(self, X, y):
        y_c = self.predict(X)
        error_rate = np.sum(np.abs(y_c - y.T)) / y_c.shape[0]
        return 1 - error_rate
Trying it out:
X_train = np.array([[3, 3, 3], [4, 3, 2], [2, 1, 2], [1, 1, 1], [-1, 0, 1], [2, -2, 1]])
y_train = np.array([[1, 1, 1, 0, 0, 0]])
clf = MyLogisticRegression(epsilon=1e-6)
clf.fit(X_train, y_train)
print("迭代次数:{}次".format(clf.n_iter_))
print("梯度:{}".format(clf.grad_))
print("权重:{}".format(clf.w[0]))
print("模型准确率:%0.2f%%" % (clf.score(X_train, y_train) * 100))
6.3 Write out the DFP algorithm for learning the maximum entropy model.
Answer:
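The only non-standard ingredient relative to exercise 6.2 is the DFP update itself. With step \delta_k = w_{k+1} - w_k and gradient difference y_k = g_{k+1} - g_k, the inverse-Hessian approximation G_k is updated as

$$G_{k+1} = G_k + \frac{\delta_k \delta_k^{T}}{\delta_k^{T} y_k} - \frac{G_k \, y_k \, y_k^{T} \, G_k}{y_k^{T} G_k \, y_k}$$

and the search direction at each step is p_k = -G_k g_k. The Pk and Qk terms in the code below are the two correction terms of this formula.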
Imports:
import copy
from collections import defaultdict
import numpy as np
from scipy.optimize import fminbound
Following the book's formulas:
class MaxEntDFP:
    def __init__(self, epsilon, max_iter=1000, distance=0.01):
        self.distance = distance  # search interval for the 1-D line search
        self.epsilon = epsilon    # convergence threshold
        self.max_iter = max_iter  # maximum number of iterations
        self.w = None             # feature weights
        self._dataset_X = None
        self._dataset_y = None
        self._y = set()           # set of labels seen in training
        self._xyID = {}           # (feature value, label) -> feature id
        self._IDxy = {}           # feature id -> (feature value, label)
        self._pxy_dic = defaultdict(int)  # empirical distribution P~(x, y)
        self._N = 0               # number of training samples
        self._n = 0               # number of (x, y) feature pairs
        self.n_iter_ = 0

    def init_params(self, X, y):
        """Collect the feature pairs and the empirical distribution P~(x, y)."""
        self._dataset_X = copy.deepcopy(X)
        self._dataset_y = copy.deepcopy(y)
        self._N = X.shape[0]
        for i in range(self._N):
            xi, yi = X[i], y[i]
            self._y.add(yi)
            for _x in xi:
                self._pxy_dic[(_x, yi)] += 1
        self._n = len(self._pxy_dic)
        self.w = np.zeros(self._n)
        for i, xy in enumerate(self._pxy_dic):
            self._pxy_dic[xy] /= self._N
            self._xyID[xy] = i
            self._IDxy[i] = xy

    def calc_zw(self, X, w):
        """Normalization factor Z_w(x) = sum_y exp(sum_i w_i f_i(x, y))."""
        zw = 0.0
        for y in self._y:
            zw += self.calc_ewf(X, y, w)
        return zw

    def calc_ewf(self, X, y, w):
        """exp(sum_i w_i f_i(x, y))."""
        return np.exp(self.calc_wf(X, y, w))

    def calc_wf(self, X, y, w):
        """sum_i w_i f_i(x, y): add w_i for every feature pair that fires."""
        sum_wf = 0.0
        for x in X:
            if (x, y) in self._pxy_dic:
                sum_wf += w[self._xyID[(x, y)]]
        return sum_wf

    def calc_pw_yx(self, X, y, w):
        """Model conditional probability P_w(y|x)."""
        return self.calc_ewf(X, y, w) / self.calc_zw(X, w)

    def calc_f(self, w):
        """Objective f(w) = sum_x P~(x) log Z_w(x) - sum_(x,y) P~(x,y) sum_i w_i f_i."""
        fw = 0.0
        for i in range(self._n):
            x, y = self._IDxy[i]
            for dataset_X in self._dataset_X:
                if x not in dataset_X:
                    continue
                # Z_w is computed over the sample's full feature vector
                fw += np.log(self.calc_zw(dataset_X, w)) - \
                      self._pxy_dic[(x, y)] * self.calc_wf(dataset_X, y, w)
        return fw

    def fit(self, X, y):
        self.init_params(X, y)

        def calc_dfw(i, w):
            """i-th gradient component: E_P(f_i) - E_P~(f_i)."""

            def calc_ewp(i, w):
                # model expectation of f_i under P~(x) * P_w(y|x)
                ep = 0.0
                x, y = self._IDxy[i]
                for dataset_X in self._dataset_X:
                    if x not in dataset_X:
                        continue
                    ep += self.calc_pw_yx(dataset_X, y, w) / self._N
                return ep

            def calc_ep(i):
                # empirical expectation of f_i
                (x, y) = self._IDxy[i]
                return self._pxy_dic[(x, y)]

            return calc_ewp(i, w) - calc_ep(i)

        def calc_gw(w):
            """Gradient vector g(w), as a column vector."""
            return np.array([[calc_dfw(i, w) for i in range(self._n)]]).T

        # G starts as the identity matrix (initial inverse-Hessian estimate)
        Gk = np.eye(len(self.w), dtype=float)
        w = self.w
        gk = calc_gw(w)
        if np.linalg.norm(gk, ord=2) < self.epsilon:
            self.w = w
            return
        k = 0
        for _ in range(self.max_iter):
            # search direction p_k = -G_k g_k
            pk = -Gk.dot(gk)

            # 1-D line search along p_k for the step size
            def _f(x):
                z = w + np.dot(x, pk).T[0]
                return self.calc_f(z)

            _lambda = fminbound(_f, -self.distance, self.distance)
            delta_k = _lambda * pk
            w += delta_k.T[0]
            gk1 = calc_gw(w)
            if np.linalg.norm(gk1, ord=2) < self.epsilon:
                break
            # DFP update of the inverse-Hessian approximation
            yk = gk1 - gk
            Pk = delta_k.dot(delta_k.T) / (delta_k.T.dot(yk))
            Qk = -Gk.dot(yk).dot(yk.T).dot(Gk) / (yk.T.dot(Gk).dot(yk))
            Gk = Gk + Pk + Qk
            gk = gk1
            k += 1
        self.w = w
        self.n_iter_ = k

    def predict(self, x):
        """Return P_w(y|x) for every label y."""
        return {y: self.calc_pw_yx(x, y, self.w) for y in self._y}
Trying it out (it runs a bit slowly):
dataset = np.array([['no', 'sunny', 'hot', 'high', 'FALSE'],
['no', 'sunny', 'hot', 'high', 'TRUE'],
['yes', 'overcast', 'hot', 'high', 'FALSE'],
['yes', 'rainy', 'mild', 'high', 'FALSE'],
['yes', 'rainy', 'cool', 'normal', 'FALSE'],
['no', 'rainy', 'cool', 'normal', 'TRUE'],
['yes', 'overcast', 'cool', 'normal', 'TRUE'],
['no', 'sunny', 'mild', 'high', 'FALSE'],
['yes', 'sunny', 'cool', 'normal', 'FALSE'],
['yes', 'rainy', 'mild', 'normal', 'FALSE'],
['yes', 'sunny', 'mild', 'normal', 'TRUE'],
['yes', 'overcast', 'mild', 'high', 'TRUE'],
['yes', 'overcast', 'hot', 'normal', 'FALSE'],
['no', 'rainy', 'mild', 'high', 'TRUE']])
X_train = dataset[:, 1:]
y_train = dataset[:, 0]
mae = MaxEntDFP(epsilon=1e-4, max_iter=1000, distance=0.01)
mae.fit(X_train, y_train)
print("模型训练迭代次数:{}次".format(mae.n_iter_))
print("模型权重:{}".format(mae.w))
result = mae.predict(['overcast', 'mild', 'high', 'FALSE'])
print("预测结果:", result)