机器学习实战（二）k-近邻算法

k-近邻算法：KNN是通过测量不同特征值之间的距离进行分类。它的思路是：如果一个样本在特征空间中的k个最相似(即特征空间中最邻近)的样本中的大多数属于某一个类别，则该样本也属于这个类别。K通常是不大于20的整数。KNN算法中，所选择的邻居都是已经正确分类的对象。该方法在定类决策上只依据最邻近的一个或者几个样本的类别来决定待分样本所属的类别。

from numpy import *
import operator
# 它可以列出给定目录下的文件名
from os import listdir
"""
科学计算包Numpy
运算符模块operator
"""
def createDataSet():
    """Return a tiny hand-made sample set for trying out the classifier.

    Returns:
        A tuple (group, labels): a 4x2 NumPy array of points and the list of
        class labels ('A' or 'B') belonging to the rows in the same order.
    """
    # Two points near (1, 1) are labelled 'A', two near the origin 'B'.
    labels = list('AABB')
    points = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    return array(points), labels
"""
读取文件并转成矩阵
"""
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is stripped and split on tabs. Its first three fields
    become one row of the feature matrix and its last field is converted with
    int() and used as the class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple (returnMat, classLabelVector): a NumPy array with one row of
        three floats per data line, and the list of integer labels in the
        same order.

    Raises:
        ValueError: If a feature field is not a number or the last field is
            not an integer.
    """
    # The with-block closes the file even if parsing fails later on.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    # Blank lines (typically a trailing newline at the end of the file) carry
    # no data; skipping them keeps the matrix size equal to the record count.
    records = [line for line in stripped if line]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, record in enumerate(records):
        fields = record.split('\t')
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
"""
归一化特征值
"""    
def autoNorm(dataSet):
    """Rescale every feature column of dataSet linearly into the range [0, 1].

    Applies newValue = (oldValue - min) / (max - min) column by column.

    Args:
        dataSet: 2-D NumPy array with one sample per row and one feature per
            column.

    Returns:
        A tuple (normDataSet, ranges, minVals): the rescaled array, the
        per-column value ranges (max - min) and the per-column minimums.
        A column whose values are all equal has range 0 and is mapped to 0.
    """
    # min(0) / max(0) reduce along the rows, giving one value per column.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Dividing a constant column by its zero range would give 0/0 = NaN for
    # the whole column; divide by 1 there instead so the column becomes 0.
    # The returned ranges stay untouched.
    safeRanges = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row, so no
    # explicit tile() copies are needed.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

"""
分类器测试代码
"""
def datingClassTest(filename=r'F:\study\datingTestSet2.txt', hoRatio=0.10, k=3):
    """Measure the classifier's error rate on a hold-out slice of the dating data.

    The file is loaded with file2matrix and normalised with autoNorm. The
    first int(m * hoRatio) rows are then classified with classify0 against
    the remaining rows. Every prediction and the final error rate are printed.

    Args:
        filename: Path of the tab-separated data file. The default is a raw
            string so the backslashes are taken literally.
        hoRatio: Fraction of the rows (taken from the start) used as test
            vectors.
        k: Number of nearest neighbours passed to classify0.

    Raises:
        ZeroDivisionError: If the hold-out slice is empty, i.e.
            int(m * hoRatio) is 0.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat = autoNorm(datingDataMat)[0]  # ranges and minimums are not needed here
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # Rows [0, numTestVecs) are the test vectors, rows [numTestVecs, m) the
    # training samples.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat, trainingLabels, k)
        # print(...) with a single pre-formatted string behaves the same on
        # Python 2 and Python 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is %f" % (errorCount / float(numTestVecs)))


"""
预测函数
"""
def classifyPerson(filename=r'F:\study\datingTestSet2.txt'):
    """Ask for three feature values on the console and print the predicted class.

    The answers are normalised with the minimums and ranges of the data set
    loaded from filename, classified with classify0 (k=3) against that
    normalised data set, and the matching entry of the result list is printed.

    Args:
        filename: Path of the tab-separated data file. The default is a raw
            string so the backslashes are taken literally.

    Raises:
        ValueError: If an answer cannot be converted to float.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    try:
        readLine = raw_input  # Python 2
    except NameError:
        readLine = input  # Python 3: input() already returns the raw string
    percentTats = float(readLine("percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liter of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(filename)
    # The normalised matrix must be the one handed to classify0 below; the
    # original stored it under a misspelt name and then read `normMat`.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the query exactly like the training data before measuring distances.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # Labels are used as 1-based positions in resultList. The two spaces after
    # the colon reproduce the original output.
    print("You will probably like this person:  %s" % resultList[classifierResult - 1])

"""
把32x32的二进制图像矩阵转换为1x1024的向量
"""
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted with
    int() and stored row after row.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A NumPy array of shape (1, 1024) where element 32*i + j holds the
        value of character j on line i.

    Raises:
        IndexError: If a line has fewer than 32 characters or the file has
            fewer than 32 lines.
        ValueError: If a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The with-block closes the file even if a malformed line raises.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

"""
手写数字识别系统的测试代码
"""
def handwritingClassTest(trainingDir='F:\\study\\trainingDigits',
                         testDir='F:\\study\\testDigits', k=3):
    """Run the handwritten-digit classifier over a test directory and print the errors.

    Every file in trainingDir is converted with img2vector into one row of
    the training matrix. Every file in testDir is then classified with
    classify0 and compared with the digit encoded in its file name. Each
    prediction, the error count and the error rate are printed.

    Args:
        trainingDir: Directory holding the training images.
        testDir: Directory holding the test images.
        k: Number of nearest neighbours passed to classify0.

    Raises:
        ZeroDivisionError: If testDir contains no files.
    """
    # Local import: only listdir is imported from os at file level.
    from os import path

    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)
    # One 1024-element row per training image.
    trainingMat = zeros((m, 1024))
    hwLabels = []
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitLabelFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector(path.join(trainingDir, fileNameStr))

    testFileList = listdir(testDir)
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _digitLabelFromFileName(fileNameStr)
        vectorUnderTest = img2vector(path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        # print(...) with a single pre-formatted string behaves the same on
        # Python 2 and Python 3.
        print("the classifier came back with: %d, the real answer is : %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is : %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))


def _digitLabelFromFileName(fileNameStr):
    """Return the integer in front of the first '_' of a name such as '0_9.txt'."""
    fileStr = fileNameStr.split('.')[0]  # drop the extension
    return int(fileStr.split('_')[0])

# Build the four-point sample set and its labels for the classify0 call further down.
group, labels = createDataSet()
# print group,labels

[[ 1.   1.1]
 [ 1.   1. ]
 [ 0.   0. ]
 [ 0.   0.1]] ['A', 'A', 'B', 'B']

def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest rows of dataSet.

    Distances are Euclidean. If several labels receive the same number of
    votes, the one whose first vote came from the nearest neighbour wins, so
    the result does not depend on dictionary ordering.

    Args:
        inX: The point to classify; a sequence or array with one value per
            feature (a 1 x n array works as well).
        dataSet: 2-D NumPy array with one training sample per row.
        labels: Sequence holding the label of every row of dataSet.
        k: Number of nearest neighbours that vote. Values larger than the
            number of rows are reduced to the number of rows.

    Returns:
        The label with the most votes.

    Raises:
        ValueError: If no neighbour can vote (k < 1 or dataSet has no rows).
    """
    dataSetSize = dataSet.shape[0]
    k = min(k, dataSetSize)
    if k < 1:
        raise ValueError("classify0 needs k >= 1 and a non-empty dataSet")

    # Euclidean distance from inX to every row: broadcasting subtracts inX
    # from each row without building a tiled copy first.
    diffMat = asarray(inX) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    # argsort returns row indices ordered by ascending distance, not the
    # distances themselves.
    sortedDistIndicies = distances.argsort()

    # Count the votes of the k closest rows and remember the order in which
    # each label first appears (nearest first).
    classCount = {}
    voteOrder = []
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        if voteIlabel not in classCount:
            voteOrder.append(voteIlabel)
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() returns the first maximal element, so ties go to the label that
    # was seen first, i.e. the one with the nearest voter.
    return max(voteOrder, key=classCount.get)

# Classify the point [0, 0] against the sample set using its 3 nearest neighbours.
classify0([0,0],group,labels,3)

'B'

"""
读取文件数据
"""
# Load the dating data set. The raw string keeps the backslashes of the
# Windows path literally instead of relying on '\s' and '\d' being
# unrecognised escape sequences.
datingDataMat, datingLabels = file2matrix(r'F:\study\datingTestSet2.txt')

"""
执行可视化
"""
# matplotlib is used below to draw a scatter plot of the dating data.
import matplotlib
import matplotlib.pyplot as plt
"""
可视化
"""
# Plot column 1 of the feature matrix against column 2 on a single subplot.
# The class labels scaled by 15.0 are passed as the third and fourth
# positional arguments of scatter().
# NOTE(review): in matplotlib's scatter signature these positions are marker
# size and colour - confirm against the installed version.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(datingDataMat[:,1], datingDataMat[:,2],15.0*array(datingLabels),15.0*array(datingLabels))
plt.show()

图像输出

"""
执行归一化结果
"""
# Normalise the dating features; autoNorm also returns the per-column ranges
# and minimums. The commented-out names below are the three results.
normMat, ranges, minVales = autoNorm(datingDataMat)
#normMat
#ranges
#minVales

[  9.12730000e+04   2.09193490e+01   1.69436100e+00]

"""
执行分类器测试程序
"""
# Run the hold-out test of the classifier on the dating data; it prints every
# prediction and the total error rate.
datingClassTest()

'\n\xe6\x89\xa7\xe8\xa1\x8c\xe5\x88\x86\xe7\xb1\xbb\xe5\x99\xa8\xe6\xb5\x8b\xe8\xaf\x95\xe7\xa8\x8b\xe5\xba\x8f\n'

"""
预测当前人员在哪个分类，执行预测函数
"""
# Prompt for the three feature values on the console and print the prediction.
classifyPerson()

percentage of time spent playing video games?56
frequent flier miles earned per year?1654
liter of ice cream consumed per year?0.65
[  9.12730000e+04   2.09193490e+01   1.69436100e+00]
You will probably like this person:  in large doses

"""
读取图像
"""
# The backslashes in the path are escaped as \\; otherwise the file cannot be found.
testVector = img2vector('F:\\study\\trainingDigits\\0_9.txt')
# Show the first 31 values of the vector, i.e. elements 0 to 30 of the first image row.
testVector[0,0:31]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.])

"""
手写数字识别测试执行
"""
# Run the handwritten-digit test; it prints every prediction, the number of
# errors and the error rate.
handwritingClassTest()

the classifier came back with: 0, the real answer is : 0
...    
the classifier came back with: 0, the real answer is : 0
the classifier came back with: 0, the real answer is : 0
the classifier came back with: 1, the real answer is : 1
the classifier came back with: 1, the real answer is : 1
the classifier came back with: 2, the real answer is : 2
the classifier came back with: 2, the real answer is : 2
...    
the classifier came back with: 2, the real answer is : 2
the classifier came back with: 3, the real answer is : 3
the classifier came back with: 3, the real answer is : 3
the classifier came back with: 9, the real answer is : 3
...    
the classifier came back with: 3, the real answer is : 3
the classifier came back with: 3, the real answer is : 3
the classifier came back with: 4, the real answer is : 4
the classifier came back with: 4, the real answer is : 4
the classifier came back with: 5, the real answer is : 5
the classifier came back with: 5, the real answer is : 5
the classifier came back with: 5, the real answer is : 5
the classifier came back with: 6, the real answer is : 6
the classifier came back with: 7, the real answer is : 7
the classifier came back with: 6, the real answer is : 8
the classifier came back with: 8, the real answer is : 8
the classifier came back with: 9, the real answer is : 9
the classifier came back with: 9, the real answer is : 9

the total number of errors is : 11

the total error rate is: 0.011628