Logistic Regression Model
From linear regression, we know the linear regression equation is: $$y = \theta_0 + \theta_1 x_1 + \cdots + \theta_n x_n = \Theta^T X$$
Logistic Regression simply applies the sigmoid function on top of this linear model: $ \sigma(z) = \frac{1}{1 + e^{-z}} $
The Logistic Regression hypothesis is therefore: $$ h_\theta (x) = \frac{1}{1 + e^{-\Theta^T X}} $$
So $h_\theta (x)$ always lies in $(0,1)$. When $h_\theta (x) > 0.5$ we predict the positive class ($y=1$); when $h_\theta (x) < 0.5$ we predict the negative class ($y=0$); when $h_\theta (x) = 0.5$ either label may be assigned.
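As a quick sanity check, here is a minimal numeric sketch of the sigmoid squashing and the 0.5 decision threshold (this snippet and its names are illustrative only, not part of the original implementation below):

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1 + np.exp(-z))

    z = np.array([-5.0, -0.5, 0.0, 0.5, 5.0])
    print(sigmoid(z))        # [0.0067 0.3775 0.5 0.6225 0.9933] -- always in (0, 1)
    print(sigmoid(z) > 0.5)  # [False False False True True] -- the class decision; z = 0 sits exactly on the boundary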
Loss Function
By maximum likelihood estimation from probability theory, the parameters should maximize the likelihood: $$ \max_\Theta L(x; \Theta) = \prod_i p(y_i \mid x_i, \Theta)$$
Taking the logarithm of both sides:
$$ \log L(x; \Theta) = \sum_i \log p(y_i \mid x_i, \Theta) $$
Since $y_i \in \{0, 1\}$, the Logistic Regression equation gives a Bernoulli probability: $$ p(y_i \mid x_i, \Theta) = h_\theta(x_i)^{y_i} \cdot (1-h_\theta(x_i))^{1-y_i} $$
Substituting and simplifying gives: $$ J(\Theta) = \log L(x; \Theta) = \sum_i \Big[ y_i\log h_\theta(x_i) + (1-y_i)\log\big(1-h_\theta(x_i)\big) \Big]$$
Differentiating, and using the sigmoid identity $\sigma'(z) = \sigma(z)\big(1-\sigma(z)\big)$, gives: $$ \frac {\partial J(\Theta)}{\partial \theta_j} = \sum_i \big(y_i-h_\theta(x_i)\big)\, x_i^{(j)} $$
Because we are maximizing $J(\Theta)$, the update rule is gradient ascent, $\theta_j := \theta_j + \alpha \sum_i \big(y_i-h_\theta(x_i)\big)\, x_i^{(j)}$, which is exactly what the code below implements.
PS: these symbols are a pain to type; I never want to write them out again.
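Before moving on to the implementation, the gradient above can be sanity-checked numerically. This is a minimal sketch of my own (the toy data, names, and finite-difference check are not part of the original code):

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1 + np.exp(-z))

    # Toy problem: 5 samples, 3 features (the first column can be read as the bias term).
    rng = np.random.default_rng(0)
    X = rng.normal(size=(5, 3))
    y = rng.integers(0, 2, size=5)
    theta = rng.normal(size=3)

    def log_likelihood(theta):
        h = sigmoid(X @ theta)
        return np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))

    # Analytic gradient from the derivation: X^T (y - h).
    grad = X.T @ (y - sigmoid(X @ theta))

    # Central finite differences along each coordinate, for comparison.
    eps = 1e-6
    num_grad = np.array([
        (log_likelihood(theta + eps * e) - log_likelihood(theta - eps * e)) / (2 * eps)
        for e in np.eye(3)
    ])
    print(np.max(np.abs(grad - num_grad)))  # should be on the order of 1e-9 or smaller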
Implementation
import numpy as np
import matplotlib.pyplot as plt

def loadData():
    """Load the 2-D test set; prepend a constant 1.0 as the bias term."""
    dataMat = []
    labelMat = []
    fr = open('dataSet/testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    fr.close()
    return dataMat, labelMat

def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))
def GradientDecent0(data, label):
    """Batch gradient ascent on the log-likelihood: each update uses the whole data set."""
    dataMatrix = np.mat(data)
    labelMatrix = np.mat(label).transpose()
    m, n = np.shape(dataMatrix)
    alpha = 0.001      # learning rate
    maxCycles = 500
    weights = np.ones((n, 1))
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)     # predictions for all m samples at once
        error = labelMatrix - h               # (y - h), as in the derivation above
        weights = weights + alpha * dataMatrix.transpose() * error  # ascent step
    return weights
def GradientDecent1(data, label):
    """Stochastic gradient ascent: one update per sample, samples visited in order."""
    data = np.array(data)
    m, n = np.shape(data)
    alpha = 0.01
    w = np.ones((n, 1))
    for j in range(500):
        for i in range(m):
            h = sigmoid(np.dot(data[i], w))
            error = label[i] - h
            w = w + alpha * error * np.transpose([data[i]])
    return w
def GradientDecent2(data, label, num_iter=500):
    """Improved stochastic gradient ascent: decaying step size plus random
    sample order, so each sample is used exactly once per pass."""
    data = np.array(data)
    m, n = np.shape(data)
    w = np.ones((n, 1))
    for i in range(num_iter):
        index = list(range(m))
        for j in range(m):
            alpha = 4 / (1.0 + i + j) + 0.01   # step size decays but never reaches 0
            pos = int(np.random.uniform(0, len(index)))
            id = index[pos]                    # pick a not-yet-used sample at random
            h = sigmoid(np.dot(data[id], w))
            error = label[id] - h
            w = w + alpha * error * np.transpose([data[id]])
            del index[pos]
    return w
def plotBestFit(weights):
    """Scatter the two classes and draw the decision boundary w0 + w1*x1 + w2*x2 = 0."""
    weights = np.asarray(weights)   # accept both np.mat (GradientDecent0) and array weights
    data, label = loadData()
    dataArr = np.array(data)
    n = np.shape(dataArr)[0]
    xcord1, ycord1 = [], []
    xcord2, ycord2 = [], []
    for i in range(n):
        if int(label[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = np.arange(-3.0, 3.0, 0.1)
    # On the boundary sigmoid(w^T x) = 0.5, i.e. w0 + w1*x1 + w2*x2 = 0.
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()
def prediction(data, w):
    """Classify one sample: positive iff sigmoid(w^T x) >= 0.5."""
    prob = sigmoid(np.dot(data, w))
    if prob[0] >= 0.5:
        return 1
    else:
        return 0
def dataClean():
    """Train on the horse colic training set and report the error rate on the test set."""
    fr_train = open('dataSet/horse_train.txt')
    fr_test = open('dataSet/horse_test.txt')
    train_data, train_label = [], []
    for line in fr_train.readlines():
        ln = line.strip().split('\t')
        lnArr = [float(ln[i]) for i in range(21)]   # 21 features; the last column is the label
        train_data.append(lnArr)
        train_label.append(float(ln[-1]))
    w = GradientDecent0(train_data, train_label)
    print(w)
    cnt_error, num_data = 0, 0
    for line in fr_test.readlines():
        num_data += 1
        ln = line.strip().split('\t')
        lnArr = [float(ln[i]) for i in range(21)]
        if prediction(lnArr, w) != float(ln[-1]):
            cnt_error += 1
    fr_train.close()
    fr_test.close()
    error_rate = cnt_error * 1.0 / num_data
    print("error_rate: %f" % error_rate)
    return error_rate

dataClean()
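For the 2-D test set, the pieces above fit together like this (a minimal usage sketch, assuming dataSet/testSet.txt is present; any of the three optimizers can be swapped in):

    data, label = loadData()
    weights = GradientDecent0(data, label)   # or GradientDecent1(data, label) / GradientDecent2(data, label)
    plotBestFit(weights)                     # plotBestFit converts np.mat weights to an array itself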