Logistic Regression Model

From linear regression we know the linear regression equation is: $$y = \theta_0 + \theta_1 x_1 + \cdots + \theta_n x_n = \Theta^T X$$
Logistic Regression simply applies the sigmoid function on top of this linear equation: $ y = \frac{1}{1 + e^{-z}} $
The Logistic Regression hypothesis is therefore: $$ h_\theta (x) = \frac{1}{1 + e^{-\Theta^T X}} $$
So the output $h_\theta (x)$ is a value in $(0, 1)$. When $h_\theta (x) > 0.5$ we predict the positive class, i.e. $y = 1$; when $h_\theta (x) < 0.5$ we predict the negative class, i.e. $y = 0$; when $h_\theta (x) = 0.5$ either class may be chosen.
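A minimal sketch of the hypothesis in NumPy (the feature vector and weights below are made-up values for illustration, not from the text):

import numpy as np

def sigmoid(z):
    return 1.0 / (1 + np.exp(-z))

# hypothetical sample: x0 = 1 is the bias feature
x = np.array([1.0, 2.0, -1.5])        # [x0, x1, x2]
theta = np.array([0.5, -0.3, 0.8])    # [theta0, theta1, theta2]
h = sigmoid(np.dot(theta, x))         # h_theta(x), a value in (0, 1)
print(h, int(h > 0.5))                # probability and the 0/1 decision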

Loss Function

By maximum likelihood estimation from probability theory, we want the $\Theta$ that maximizes: $$ L(x; \Theta) = \prod_i p(y_i | x_i, \Theta)$$
Taking the logarithm of both sides:
$$ \log L(x; \Theta) = \sum_i \log p(y_i | x_i, \Theta) $$
Interpreting $h_\theta(x_i)$ as $P(y_i = 1 \mid x_i)$, the Logistic Regression hypothesis gives: $$ p(y_i | x_i, \Theta) = h_\theta(x_i)^{y_i} \cdot (1-h_\theta(x_i))^{1-y_i} $$
Substituting in and simplifying, we get: $$ J(\Theta) = \log L(x; \Theta) = \sum_i \left[ y_i\log h_\theta(x_i) + (1-y_i)\log\big(1-h_\theta(x_i)\big) \right]$$
Differentiating, using the fact that the sigmoid satisfies $g'(z) = g(z)\big(1-g(z)\big)$, gives: $$ \frac {\partial J(\Theta)}{\partial \theta_j} = \sum_i \big(y_i-h_\theta(x_i)\big)\,x_{ij} $$
Since $J(\Theta)$ is a log-likelihood to be maximized, the code below updates the weights by gradient ascent: $\theta_j \leftarrow \theta_j + \alpha \sum_i (y_i - h_\theta(x_i))\,x_{ij}$.
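As a sanity check on this gradient, here is a minimal sketch that compares the analytic gradient with a central finite-difference approximation of $J(\Theta)$ on a tiny made-up dataset (the data and the starting point $\Theta = 0$ are illustrative assumptions):

import numpy as np

def sigmoid(z):
    return 1.0 / (1 + np.exp(-z))

def log_likelihood(X, y, theta):
    h = sigmoid(X.dot(theta))
    return np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))

# hypothetical toy data: 4 samples, 3 features (bias column included)
X = np.array([[1.0,  0.5,  1.2],
              [1.0, -1.0,  0.3],
              [1.0,  2.0, -0.7],
              [1.0,  0.1,  0.9]])
y = np.array([1, 0, 1, 1])
theta = np.zeros(3)

analytic = X.T.dot(y - sigmoid(X.dot(theta)))        # the derived gradient
numeric = np.zeros(3)
eps = 1e-6
for j in range(3):
    d = np.zeros(3); d[j] = eps
    numeric[j] = (log_likelihood(X, y, theta + d)
                  - log_likelihood(X, y, theta - d)) / (2 * eps)
print(np.allclose(analytic, numeric))                 # expect True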

PS: these symbols are such a pain to type, I never want to write them again

Implementation Code

import numpy as np
import matplotlib.pyplot as plt

def loadData():
    # load the 2-D test set; each row is "x1 x2 label"
    dataMat = []
    labelMat = []
    with open('dataSet/testSet.txt') as fr:
        for line in fr.readlines():
            lineArr = line.strip().split()
            # prepend a constant 1.0 as the bias feature x0
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

def sigmoid(x):
    # clip the argument to avoid overflow warnings from np.exp
    # on large negative inputs (e.g. with the horse-colic data)
    return 1.0 / (1 + np.exp(-np.clip(x, -500, 500)))

def GradientDecent0(data, label):
    # batch gradient ascent on the log-likelihood J(Theta);
    # despite the name, the "+" update below makes this ascent
    dataMatrix = np.mat(data)
    labelMatrix = np.mat(label).transpose()
    m, n = np.shape(dataMatrix)
    alpha = 0.001                 # learning rate
    maxCycles = 500               # number of full passes
    weights = np.ones((n, 1))
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)   # predictions for all m samples
        error = labelMatrix - h             # y - h_theta(x)
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights

def GradientDecent1(data, label):
    # stochastic gradient ascent: update on one sample at a time
    data = np.array(data)
    m, n = np.shape(data)
    alpha = 0.01
    w = np.ones((n, 1))
    for j in range(500):
        for i in range(m):
            h = sigmoid(np.dot(data[i], w))
            error = label[i] - h
            w = w + alpha * error * np.transpose([data[i]])
    return w

def GradientDecent2(data, label, num_iter=500):
    # stochastic gradient ascent with a decaying learning rate and
    # a random sample order within each pass
    data = np.array(data)
    m, n = np.shape(data)
    w = np.ones((n, 1))
    for i in range(num_iter):
        index = list(range(m))
        for j in range(m):
            alpha = 4 / (1.0 + i + j) + 0.01        # decays but never reaches 0
            pos = int(np.random.uniform(0, len(index)))
            idx = index[pos]                        # a not-yet-visited sample
            h = sigmoid(np.dot(data[idx], w))
            error = label[idx] - h
            w = w + alpha * error * np.transpose([data[idx]])
            del index[pos]                          # visit each sample once per pass
    return w

def plotBestFit(weights):
    # accept either np.mat (from GradientDecent0) or np.ndarray weights
    weights = np.asarray(weights).flatten()
    data, label = loadData()
    dataArr = np.array(data)
    n = np.shape(dataArr)[0]
    xcord1, ycord1 = [], []
    xcord2, ycord2 = [], []
    for i in range(n):
        if int(label[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = np.arange(-3.0, 3.0, 0.1)
    # decision boundary: w0 + w1*x1 + w2*x2 = 0  =>  x2 = -(w0 + w1*x1) / w2
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()

def prediction(data, w):
    # classify as positive when the predicted probability reaches 0.5
    prob = sigmoid(np.dot(data, w))
    return 1 if float(prob) >= 0.5 else 0

def dataClean():
    # train on the horse-colic training set (21 features per row,
    # last column is the label), then evaluate on the test set
    train_data, train_label = [], []
    with open('dataSet/horse_train.txt') as fr_train:
        for line in fr_train.readlines():
            ln = line.strip().split('\t')
            lnArr = [float(ln[i]) for i in range(21)]
            train_data.append(lnArr)
            train_label.append(float(ln[-1]))
    w = GradientDecent0(train_data, train_label)
    print(w)
    cnt_error, num_data = 0, 0
    with open('dataSet/horse_test.txt') as fr_test:
        for line in fr_test.readlines():
            num_data += 1
            ln = line.strip().split('\t')
            lnArr = [float(ln[i]) for i in range(21)]
            if prediction(lnArr, w) != float(ln[-1]):
                cnt_error += 1
    error_rate = cnt_error * 1.0 / num_data
    print("error_rate: %f" % error_rate)
    return error_rate

dataClean()
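
To also visualize the decision boundary learned on the 2-D test set (this assumes dataSet/testSet.txt is present in the layout loadData expects), one could run:

data, label = loadData()
weights = GradientDecent2(data, label)   # any of the three trainers works here
plotBestFit(weights)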