from numpy import *
import operator
def createDataSet():
    """Build the small four-point sample set used to try out the classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array of points
        and ``labels`` is a list holding one class label per row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest rows.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: The point to classify; one value per column of ``dataSet``.
        dataSet: Array of training points, one row per sample.
        labels: Sequence of class labels, indexed by row of ``dataSet``.
        k: Number of nearest rows that take part in the vote. If ``k`` is
            larger than the number of rows, every row votes.

    Returns:
        The label with the highest vote count among the nearest rows.

    Raises:
        IndexError: If no row votes (for example ``k`` is 0).
    """
    # Broadcasting subtracts every row of dataSet from inX; the sign is
    # irrelevant because the differences are squared next.
    diffMat = inX - dataSet
    sqDiffMat = diffMat ** 2
    # axis=1 adds up the columns within each row, giving one squared
    # distance per training sample.
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    # argsort returns the row indices ordered by ascending distance.
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        # get(label, 0) yields 0 the first time a label is seen.
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() is available on both Python 2 and 3 (iteritems() is Python 2
    # only). Sort the (label, count) pairs by count, largest first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    # Label of the pair with the highest count.
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Each non-blank line supplies one sample: its first three fields are read
    as floats and its last field as an integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        array with one row per sample and three columns, and
        ``classLabelVector`` is the list of integer labels in file order.

    Raises:
        ValueError: If a field cannot be converted to a number.
    """
    # The with-block closes the file even if parsing fails later.
    with open(filename) as fr:
        # strip() drops the trailing newline; blank lines are skipped so they
        # do not produce empty rows.
        rows = [line.strip().split('\t') for line in fr if line.strip()]
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # Index -1 picks the last field on the line, whatever its position.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
# 111 splits the figure into a 1-row, 1-column grid and draws in cell 1 (cells are counted left to right, top to bottom)
>>> ax.scatter(datingDataMat[:,1], datingDataMat[:,2])
# scatter draws a scatter plot; its two arguments are the x and y coordinates
>>> plt.show()