机器学习实战之Logistic回归算法

本文总结了一下逻辑回归,逻辑回归用于分类,通过寻找最佳拟合参数,使用的是最优化算法。

总结了一下基于逻辑回归和Sigmoid函数的分类。

Sigmoid函数:

1531379081537

x是分类器的输入数据,是样本的特征值,w就是我们要找的最佳系数。

下面使用了批量梯度上升算法和随机梯度上升算法进行求最佳系数。

梯度下降不再详述,在下面链接可以找到梯度下降详解。

http://hanyaopeng.coding.me/2018/06/11/ml-wuenda-1/

逻辑回归的梯度下降推导看如下链接:

https://blog.csdn.net/xiaoxiangzi222/article/details/55097570

批量梯度下降算法

批量梯度下降算法是每次迭代使用所有样本对θ进行更新。

1531379943396

随机梯度下降算法

随机梯度下降算法是每次仅使用一个样本对θ进行更新。

1531379533652

确定好算法后便可对数据进行处理,进而用梯度上升算法进行训练测试。代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from numpy import *


def loadDataSet():
    """Load the testSet.txt sample data for the 2-D demo.

    Each line holds two feature values and an integer class label,
    whitespace separated.

    Returns:
        dataMat: list of [1.0, x1, x2] rows (the leading 1.0 is the
            bias/intercept term).
        labelMat: list of int class labels (0 or 1).
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the file handle is closed; the original leaked it.
    # NOTE(review): hard-coded, machine-specific path — kept as-is.
    with open('G:\机器学习\机器学习实战\机器学习实战(中文版+英文版+源代码)\machinelearninginaction\Ch05\\testSet.txt') as fr:
        for line in fr.readlines():
            # strip() only removes leading/trailing whitespace, never
            # characters in the middle of the line.
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):
    """Logistic (sigmoid) activation: maps any real input into (0, 1)."""
    denominator = 1.0 + exp(-inX)
    return 1.0 / denominator


# Batch gradient ascent: every sample contributes to every weight update.
def gradAscent(dataMatIn, classLabels):
    """Fit logistic-regression weights with batch gradient ascent.

    Args:
        dataMatIn: 2-D list/array of samples (rows), bias column included.
        classLabels: sequence of 0/1 labels, one per sample.

    Returns:
        (n, 1) numpy matrix of weights after 500 fixed-step updates.
    """
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()  # labels as a column vector
    m, n = shape(dataMatrix)
    alpha = 0.001     # fixed learning rate
    maxCycles = 500   # fixed number of full passes
    weights = ones((n, 1))
    cycle = 0
    while cycle < maxCycles:
        predictions = sigmoid(dataMatrix * weights)
        residual = labelMat - predictions
        # Gradient of the log-likelihood (derivation omitted, as in the book).
        weights = weights + alpha * dataMatrix.transpose() * residual
        cycle += 1
    return weights


# Plot the two classes and the fitted decision boundary.
def plotBestFit(weights):
    """Scatter the testSet.txt samples by class and draw the decision line.

    Args:
        weights: length-3 coefficient vector/matrix [w0, w1, w2].
    """
    import matplotlib.pyplot as plt
    weights = array(weights)  # accept either a numpy matrix or an array
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    class1_x, class1_y = [], []
    class0_x, class0_y = [], []
    for idx in range(shape(dataArr)[0]):
        if int(labelMat[idx] == 1):
            class1_x.append(dataArr[idx, 1])
            class1_y.append(dataArr[idx, 2])
        else:
            class0_x.append(dataArr[idx, 1])
            class0_y.append(dataArr[idx, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(class1_x, class1_y, s=30, c='red', marker='s')
    ax.scatter(class0_x, class0_y, s=30, c='green')
    # Decision boundary: w0 + w1*x + w2*y = 0  =>  y = -(w0 + w1*x) / w2.
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


# Plain stochastic gradient ascent: one pass, one sample per update.
def stocGradAscent0(dataMatrix, classLabels):
    """Fit weights with single-pass stochastic gradient ascent.

    Args:
        dataMatrix: 2-D numpy array of samples (bias column included).
        classLabels: sequence of 0/1 labels, one per sample.

    Returns:
        length-n numpy array of weights.
    """
    m, n = shape(dataMatrix)
    alpha = 0.01  # fixed step size
    weights = ones(n)  # start from all ones
    for sampleIdx in range(m):
        prediction = sigmoid(sum(dataMatrix[sampleIdx] * weights))
        residual = classLabels[sampleIdx] - prediction
        weights = weights + alpha * residual * dataMatrix[sampleIdx]
    return weights


# Improved stochastic gradient ascent: more passes, decaying alpha,
# and random sample selection without replacement within each pass.
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Fit weights with the improved stochastic gradient ascent.

    Improvements over stocGradAscent0:
      * numIter full passes over the data instead of one;
      * alpha decays with iteration but never reaches 0 thanks to the
        +0.0001 constant;
      * within each pass samples are drawn randomly without replacement.

    Bug fix vs. the original: the random draw is a *position* in
    dataIndex, so it must be mapped through dataIndex before indexing
    dataMatrix. The original used the position directly as a row index,
    which skewed sampling toward low-numbered rows as dataIndex shrank
    and defeated the without-replacement intent.

    Args:
        dataMatrix: 2-D numpy array of samples (bias column included).
        classLabels: sequence of 0/1 labels, one per sample.
        numIter: number of passes over the data (default 150).

    Returns:
        length-n numpy array of weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)  # initialize to all ones
    for j in range(numIter):
        # Python 3: range() has no del support, so materialize a list.
        dataIndex = list(range(m))
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.0001  # decays, never hits 0
            randPos = int(random.uniform(0, len(dataIndex)))
            sampleIdx = dataIndex[randPos]  # map position -> actual row
            h = sigmoid(sum(dataMatrix[sampleIdx] * weights))
            error = classLabels[sampleIdx] - h
            weights = weights + alpha * error * dataMatrix[sampleIdx]
            del dataIndex[randPos]  # without replacement within this pass
    return weights


# Predict the class of a single sample.
def classifyVector(inX, weights):
    """Return 1.0 if sigmoid(w·x) > 0.5, else 0.0.

    Args:
        inX: numpy array of feature values (bias column included).
        weights: numpy array of trained weights.
    """
    return 1.0 if sigmoid(sum(inX * weights)) > 0.5 else 0.0

# Train on the book's horse-colic data and evaluate on its test set.
def colicTest():
    """Train on horseColicTraining.txt, report error rate on horseColicTest.txt.

    Each line holds 21 tab-separated features followed by the label.
    Training uses stocGradAscent1 with 500 passes.

    Returns:
        float test-set error rate in [0, 1].
    """
    def _parseLine(line):
        # One sample: 21 features, then the class label in column 21.
        currLine = line.strip().split('\t')
        return [float(currLine[i]) for i in range(21)], float(currLine[21])

    trainingSet = []
    trainingLabels = []
    # 'with' guarantees the handles are closed; the original leaked both.
    # NOTE(review): hard-coded, machine-specific paths — kept as-is.
    with open('G:\机器学习\机器学习实战\机器学习实战(中文版+英文版+源代码)\machinelearninginaction\Ch05\horseColicTraining.txt') as frTrain:
        for line in frTrain.readlines():
            features, label = _parseLine(line)
            trainingSet.append(features)
            trainingLabels.append(label)
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
    errorCount = 0
    numTestVec = 0.0
    with open('G:\机器学习\机器学习实战\机器学习实战(中文版+英文版+源代码)\machinelearninginaction\Ch05\horseColicTest.txt') as frTest:
        for line in frTest.readlines():
            numTestVec += 1.0
            features, label = _parseLine(line)
            if int(classifyVector(array(features), trainWeights)) != int(label):
                errorCount += 1
    errorRate = (float(errorCount) / numTestVec)
    print("测试错误率为: %f" % errorRate)
    return errorRate


def multiTest():
    """Run colicTest ten times and print the mean error rate."""
    numTests = 10
    errorRates = [colicTest() for _ in range(numTests)]
    meanRate = sum(errorRates) / float(numTests)
    print("经过 %d 次迭代后错误率均值为:%f" % (numTests, meanRate))



if __name__ == '__main__':
    # Alternative demo (kept from the original): plot the decision
    # boundary found by batch gradient ascent on the 2-D test set.
    # dataArr, labelMat = loadDataSet()
    # plotBestFit(gradAscent(dataArr, labelMat))
    multiTest()
测试结果如下图:

1531379770481

-------------The End-------------