机器学习实战-kNN

from numpy import *
import operator
def createDataSet():
    """Build the four-point toy training set used to try out the kNN classifier.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 4x2 array of sample
        points and ``labels`` is the list of class labels, one per row
        of ``group``.
    """
    # Keep each point next to its label so the pairing is obvious.
    samples = (
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    )
    group = array([point for point, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean, measured between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify; it must have as many features as
            a row of ``dataSet``.
        dataSet: Training samples, one row per sample (array or array-like).
        labels: Class label for each row of ``dataSet``, in row order.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes among the ``k`` nearest rows.

    Raises:
        IndexError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    dataSet = asarray(dataSet)
    # Broadcasting subtracts every training row from inX, so no explicit
    # tile() copy of inX is needed.
    diffMat = asarray(inX) - dataSet
    # axis=1 sums the squared differences across each row, giving one
    # squared distance per training sample.
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    # argsort returns row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        # get(..., 0) starts the tally at zero for a label not seen yet.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() works on Python 2 and 3; iteritems() was removed in Python 3.
    # Sort the (label, votes) pairs by vote count, highest first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Each non-blank line is split on tabs. The first three fields become one
    row of the feature matrix and the last field is read as the integer
    class label. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` tuple: ``returnMat`` is an N x 3
        float array and ``classLabelVector`` is a list of N ints, where N is
        the number of non-blank lines.

    Raises:
        ValueError: If a feature field is not a valid float or the last
            field is not a valid integer.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        # strip() removes the trailing newline and surrounding whitespace.
        rows = [line.strip() for line in fr]
    # Drop blank lines (e.g. a trailing empty line) instead of failing on them.
    rows = [row for row in rows if row]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # Index -1 picks the last column, which holds the class label.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
    #将画布分成1行1列,现在画出来的图在从左到右从上到下的第1块
>>> ax.scatter(datingDataMat[:,1], datingDataMat[:,2])
    #scatter是散点图,两个参数是横纵坐标
>>> plt.show()