Statistical Learning Methods, Chapter 6: Exercises in Practice

Theory Tree

1. Logistic regression model: the logistic distribution, the binomial logistic regression model, and the multinomial logistic regression model.
2. Model parameter estimation: via the log-likelihood function.
3. The maximum entropy principle behind the maximum entropy model: the definition of information entropy.
4. Definition of the maximum entropy model: information entropy weighted by the empirical sample distribution.
5. Learning the maximum entropy model: a constrained optimization problem, converted into its dual problem.
   The method of Lagrange multipliers: solve for P first, then for w. This is essentially equivalent to maximum likelihood estimation.
6. Optimization algorithms for the maximum entropy model: improved iterative scaling (IIS) or quasi-Newton methods. The key formulas are sketched just below.
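For reference, the core formulas behind items 3-5, in the book's notation: the conditional entropy being maximized, and the parametric model that solving the dual problem yields.

H(P) = -\sum_{x,y} \tilde{P}(x) \, P(y \mid x) \, \log P(y \mid x)

P_w(y \mid x) = \frac{\exp\left(\sum_i w_i f_i(x,y)\right)}{Z_w(x)}, \qquad Z_w(x) = \sum_y \exp\left(\sum_i w_i f_i(x,y)\right)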

Exercises

6.2 Write out the gradient descent algorithm for learning the logistic regression model.
Answer:
Imports

import numpy as np
from scipy.optimize import fminbound

Following the book's formulas, written in an sklearn-style interface:
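Concretely, the class maximizes the log-likelihood by gradient ascent with a one-dimensional line search. The two standard formulas it implements (as `like_func` and `grad` below) are:

L(w) = \sum_{i=1}^{N} \left[ y_i \, (w \cdot x_i) - \log\left(1 + \exp(w \cdot x_i)\right) \right]

\nabla L(w) = \sum_{i=1}^{N} x_i \left( y_i - \sigma(w \cdot x_i) \right), \qquad \sigma(z) = \frac{1}{1 + e^{-z}}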

class MyLogisticRegression:
    def __init__(self, max_iter=10000, distance=3, epsilon=1e-6):
        self.max_iter = max_iter
        self.epsilon = epsilon
        self.w = None
        self.distance = distance
        self._X = None
        self._y = None
        
    @staticmethod
    def preprocessing(X):
        # Append a constant column of ones so the bias is absorbed into w
        row = X.shape[0]
        bias = np.ones(row).reshape(row, 1)
        return np.hstack((X, bias))

    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))
    
    def grad(self, w):
        # Gradient of the log-likelihood: sum_i x_i * (y_i - sigmoid(w . x_i))
        z = np.dot(self._X, w.T)
        grad = self._X * (self._y - self.sigmoid(z))
        grad = grad.sum(axis=0)
        return grad

    def like_func(self, w):
        # Log-likelihood: sum_i [y_i * (w . x_i) - log(1 + exp(w . x_i))]
        z = np.dot(self._X, w.T)
        f = self._y * z - np.log(1 + np.exp(z))
        return np.sum(f)
    
    def fit(self, data_x, data_y):
        self._X = self.preprocessing(data_x)
        self._y = data_y.T
        w = np.array([[0] * self._X.shape[1]], dtype=float)  # np.float was removed in NumPy 1.24
        k = 0
        fw = self.like_func(w)
        for _ in range(self.max_iter):
            grad = self.grad(w)
            if (np.linalg.norm(grad, axis=0, keepdims=True) < self.epsilon).all():
                self.w = w
                break
        
            # One-dimensional line search for the step size along the gradient
            def f(x):
                z = w - np.dot(x, grad)
                return -self.like_func(z)

            _lambda = fminbound(f, -self.distance, self.distance)
            w1 = w - np.dot(_lambda, grad)
            fw1 = self.like_func(w1)

            if abs(fw1 - fw) < self.epsilon or \
                    (np.linalg.norm((w1 - w), axis=0, keepdims=True) < self.epsilon).all():
                self.w = w1
                break

            k += 1
            w, fw = w1, fw1

        self.grad_ = grad
        self.n_iter_ = k
        self.coef_ = self.w[0][:-1]
        self.intercept_ = self.w[0][-1]

    def predict(self, x):
        p = self.sigmoid(np.dot(self.preprocessing(x), self.w.T))
        p[np.where(p >= 0.5)] = 1
        p[np.where(p < 0.5)] = 0
        return p

    def score(self, X, y):
        y_c = self.predict(X)
        error_rate = np.sum(np.abs(y_c - y.T)) / y_c.shape[0]
        return 1 - error_rate

Let's try it out:

X_train = np.array([[3, 3, 3], [4, 3, 2], [2, 1, 2], [1, 1, 1], [-1, 0, 1], [2, -2, 1]])
y_train = np.array([[1, 1, 1, 0, 0, 0]])
clf = MyLogisticRegression(epsilon=1e-6)
clf.fit(X_train, y_train)
print("迭代次数:{}次".format(clf.n_iter_))
print("梯度:{}".format(clf.grad_))
print("权重:{}".format(clf.w[0]))
print("模型准确率:%0.2f%%" % (clf.score(X_train, y_train) * 100))

6.3 Write out the DFP algorithm for learning the maximum entropy model.
Answer:
Imports

import copy
from collections import defaultdict
import numpy as np
from scipy.optimize import fminbound

Following the book's formulas:
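Two formulas drive the implementation: the dual objective that `calc_f` evaluates (minimized over w), and the DFP update of the inverse-Hessian approximation G_k used inside `fit`:

f(w) = \sum_{x} \tilde{P}(x) \log Z_w(x) - \sum_{x,y} \tilde{P}(x,y) \sum_{i=1}^{n} w_i f_i(x,y)

G_{k+1} = G_k + \frac{\delta_k \delta_k^{\top}}{\delta_k^{\top} y_k} - \frac{G_k y_k y_k^{\top} G_k}{y_k^{\top} G_k y_k}, \qquad \delta_k = w_{k+1} - w_k, \; y_k = g_{k+1} - g_k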

class MaxEntDFP:
    def __init__(self, epsilon, max_iter=1000, distance=0.01):
        self.distance = distance
        self.epsilon = epsilon
        self.max_iter = max_iter
        self.w = None
        self._dataset_X = None
        self._dataset_y = None
        self._y = set()
        self._xyID = {}
        self._IDxy = {}
        self._pxy_dic = defaultdict(int)
        self._N = 0
        self._n = 0
        self.n_iter_ = 0

    def init_params(self, X, y):
        self._dataset_X = copy.deepcopy(X)
        self._dataset_y = copy.deepcopy(y)
        self._N = X.shape[0]

        for i in range(self._N):
            xi, yi = X[i], y[i]
            self._y.add(yi)
            for _x in xi:
                self._pxy_dic[(_x, yi)] += 1

        self._n = len(self._pxy_dic)
        self.w = np.zeros(self._n)

        for i, xy in enumerate(self._pxy_dic):
            self._pxy_dic[xy] /= self._N
            self._xyID[xy] = i
            self._IDxy[i] = xy

    def calc_zw(self, X, w):
        # Normalization factor: Z_w(x) = sum_y exp(sum_i w_i * f_i(x, y))
        zw = 0.0
        for y in self._y:
            zw += self.calc_ewf(X, y, w)
        return zw

    def calc_ewf(self, X, y, w):
        sum_wf = self.calc_wf(X, y, w)
        return np.exp(sum_wf)

    def calc_wf(self, X, y, w):
        sum_wf = 0.0
        for x in X:
            if (x, y) in self._pxy_dic:
                sum_wf += w[self._xyID[(x, y)]]
        return sum_wf

    def calc_pw_yx(self, X, y, w):
        # Model conditional probability P_w(y|x) = exp(sum_i w_i f_i(x,y)) / Z_w(x)
        return self.calc_ewf(X, y, w) / self.calc_zw(X, w)

    def calc_f(self, w):
        # Dual objective: f(w) = sum_x P~(x) log Z_w(x) - sum_{x,y} P~(x,y) sum_i w_i f_i(x,y)
        fw = 0.0
        for i in range(self._n):
            x, y = self._IDxy[i]
            for dataset_X in self._dataset_X:
                if x not in dataset_X:
                    continue
                # Z_w must be evaluated on the sample's whole feature vector,
                # not on the single feature value x
                fw += np.log(self.calc_zw(dataset_X, w)) - \
                    self._pxy_dic[(x, y)] * self.calc_wf(dataset_X, y, w)

        return fw

    def fit(self, X, y):
        self.init_params(X, y)

        def calc_dfw(i, w):
            # i-th component of the gradient: E_P[f_i] - E_P~[f_i]
            def calc_ewp(i, w):
                ep = 0.0
                x, y = self._IDxy[i]
                for dataset_X in self._dataset_X:
                    if x not in dataset_X:
                        continue
                    ep += self.calc_pw_yx(dataset_X, y, w) / self._N
                return ep

            def calc_ep(i):
                (x, y) = self._IDxy[i]
                return self._pxy_dic[(x, y)]

            return calc_ewp(i, w) - calc_ep(i)

        def calc_gw(w):
            return np.array([[calc_dfw(i, w) for i in range(self._n)]]).T

        # Initialize the inverse-Hessian approximation as the identity matrix
        Gk = np.eye(len(self.w), dtype=float)

        w = self.w
        gk = calc_gw(w)
        if np.linalg.norm(gk, ord=2) < self.epsilon:
            self.w = w
            return

        k = 0
        for _ in range(self.max_iter):
            pk = -Gk.dot(gk)

            def _f(x):
                z = w + np.dot(x, pk).T[0]
                return self.calc_f(z)

            _lambda = fminbound(_f, -self.distance, self.distance)

            delta_k = _lambda * pk
            w += delta_k.T[0]

            gk1 = calc_gw(w)
            if np.linalg.norm(gk1, ord=2) < self.epsilon:
                self.w = w
                break
            yk = gk1 - gk
            # DFP update: G_{k+1} = G_k + dd^T/(d^T y) - G y y^T G / (y^T G y)
            Pk = delta_k.dot(delta_k.T) / (delta_k.T.dot(yk))
            Qk = Gk.dot(yk).dot(yk.T).dot(Gk) / (yk.T.dot(Gk).dot(yk)) * (-1)
            Gk = Gk + Pk + Qk
            gk = gk1
            k += 1

        self.w = w
        self.n_iter_ = k

    def predict(self, x):
        result = {}
        for y in self._y:
            prob = self.calc_pw_yx(x, y, self.w)
            result[y] = prob

        return result

Let's try it out; it runs a bit slowly:

dataset = np.array([['no', 'sunny', 'hot', 'high', 'FALSE'],
                    ['no', 'sunny', 'hot', 'high', 'TRUE'],
                    ['yes', 'overcast', 'hot', 'high', 'FALSE'],
                    ['yes', 'rainy', 'mild', 'high', 'FALSE'],
                    ['yes', 'rainy', 'cool', 'normal', 'FALSE'],
                    ['no', 'rainy', 'cool', 'normal', 'TRUE'],
                    ['yes', 'overcast', 'cool', 'normal', 'TRUE'],
                    ['no', 'sunny', 'mild', 'high', 'FALSE'],
                    ['yes', 'sunny', 'cool', 'normal', 'FALSE'],
                    ['yes', 'rainy', 'mild', 'normal', 'FALSE'],
                    ['yes', 'sunny', 'mild', 'normal', 'TRUE'],
                    ['yes', 'overcast', 'mild', 'high', 'TRUE'],
                    ['yes', 'overcast', 'hot', 'normal', 'FALSE'],
                    ['no', 'rainy', 'mild', 'high', 'TRUE']])
X_train = dataset[:, 1:]
y_train = dataset[:, 0]

mae = MaxEntDFP(epsilon=1e-4, max_iter=1000, distance=0.01)
mae.fit(X_train, y_train)
print("模型训练迭代次数:{}次".format(mae.n_iter_))
print("模型权重:{}".format(mae.w))

result = mae.predict(['overcast', 'mild', 'high', 'FALSE'])
print("预测结果:", result)