# The following example is used for text analysis.
import operator
import numpy as np
import feedparser as fp
def loadDataSet():
    """Return a small hand-labelled corpus of tokenised posts.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` is a list of six
        token lists and ``classVec`` is the parallel list of labels, where
        0 marks a non-abusive post and 1 marks an abusive post.
    """
    # Each entry pairs a label with the tokens of one post.
    labelled_posts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'helo', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [tokens for _, tokens in labelled_posts]
    classVec = [label for label, _ in labelled_posts]
    return postingList, classVec
def createVocabList(dataSet):
    """Collect every distinct word that appears in the data set.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing each distinct token once. The order comes from
        iterating a ``set`` and is therefore unspecified.
    """
    vocabulary = set()
    for doc in dataSet:
        vocabulary.update(doc)
    return list(vocabulary)
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a presence/absence vector over the vocabulary.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of tokens making up one document.

    Returns:
        A list the same length as ``vocabList`` holding 1 where the
        vocabulary word occurs in ``inputSet`` and 0 elsewhere. Tokens that
        are not in the vocabulary are ignored.
    """
    # Build the word -> position table once instead of scanning the
    # vocabulary twice (``in`` + ``.index``) for every token. ``setdefault``
    # keeps the first position of a repeated word, as ``list.index`` does.
    first_index = {}
    for position, word in enumerate(vocabList):
        first_index.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = first_index.get(word)
        if position is not None:
            returnVec[position] = 1
    return returnVec
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a word-count vector over the vocabulary.

    Unlike a presence/absence encoding, each occurrence of a word increments
    its slot, so repeated words are counted.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of tokens making up one document.

    Returns:
        A list the same length as ``vocabList`` holding, for each vocabulary
        word, the number of times it occurs in ``inputSet``. Tokens that are
        not in the vocabulary are ignored.
    """
    # Build the word -> position table once instead of scanning the
    # vocabulary twice (``in`` + ``.index``) for every token. ``setdefault``
    # keeps the first position of a repeated word, as ``list.index`` does.
    first_index = {}
    for position, word in enumerate(vocabList):
        first_index.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = first_index.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec
def trainNB0(trainMatrix, tarinCategory):
    """Estimate naive-Bayes parameters from word vectors and their labels.

    The shared denominator p(w) of Bayes' rule is not computed because it is
    the same for both classes. Per-word conditional probabilities are
    returned as logarithms so that a classifier can add them instead of
    multiplying the raw probabilities.

    Args:
        trainMatrix: Sequence of word vectors, one row per training document.
        tarinCategory: Sequence of labels, one per row; rows labelled 1 count
            toward class 1 and every other label counts toward class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log conditional
        probabilities for class 0 and class 1, and the fraction
        ``sum(tarinCategory) / number of rows``.
    """
    doc_total = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    prior_class1 = sum(tarinCategory) / float(doc_total)

    # Slot 0 accumulates class 0, slot 1 accumulates class 1. Counts start at
    # 1 and denominators at 2.0 so that a word never seen in a class does not
    # get probability 0.
    word_counts = [np.ones(vocab_size), np.ones(vocab_size)]
    token_totals = [2.0, 2.0]

    for row_index, row in enumerate(trainMatrix):
        slot = 1 if tarinCategory[row_index] == 1 else 0
        word_counts[slot] += row
        token_totals[slot] += sum(row)

    log_probs_class1 = np.log(word_counts[1] / token_totals[1])
    log_probs_class0 = np.log(word_counts[0] / token_totals[0])
    return log_probs_class0, log_probs_class1, prior_class1
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for a word vector.

    Because the inputs are log probabilities, adding them corresponds to
    multiplying the underlying probabilities.

    Args:
        vec2Classify: Word vector of the document to classify.
        p0Vec: Per-word log conditional probabilities for class 0.
        p1Vec: Per-word log conditional probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    score_class1 = np.log(pClass1) + sum(vec2Classify * p1Vec)
    score_class0 = np.log(1.0 - pClass1) + sum(vec2Classify * p0Vec)
    return 1 if score_class1 > score_class0 else 0
def testingNB():
    """Train on the built-in sample posts and print two sample predictions.

    Builds presence/absence vectors for the posts from ``loadDataSet``,
    trains with ``trainNB0``, then prints the predicted class for two fixed
    test entries.
    """
    posts, labels = loadDataSet()
    vocabulary = createVocabList(posts)
    feature_rows = [setOfWords2Vec(vocabulary, post) for post in posts]
    p0V, p1V, pAb = trainNB0(np.array(feature_rows), np.array(labels))

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(vocabulary, testEntry))
        print(testEntry, 'classified as: ',
              classifyNB(thisDoc, p0V, p1V, pAb))
def textParse(bigString):
    """Split raw text into lower-cased tokens using a regular expression.

    Args:
        bigString: The text to tokenise.

    Returns:
        A list of lower-cased tokens, keeping only tokens longer than two
        characters.
    """
    import re
    # Split on runs of one or more non-word characters. The previous pattern
    # ``\W*`` can match the empty string; since Python 3.7 ``re.split`` also
    # splits on empty matches, which cut the text into single characters so
    # that the length filter below discarded everything.
    listOfToken = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfToken if len(tok) > 2]
def spamTest():
    """Train and evaluate a spam filter with a random hold-out split.

    Reads files 1.txt through 25.txt from each of ``f:/email/spam`` (label 1)
    and ``f:/email/ham`` (label 0), holds out 10 randomly chosen documents
    for testing, trains on the remainder using word-count vectors, and prints
    the error rate on the held-out documents.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam first, then ham, so documents alternate between the labels.
        for pathPattern, label in (('f:/email/spam/%d.txt', 1),
                                   ('f:/email/ham/%d.txt', 0)):
            # ``with`` closes each file promptly instead of leaving the
            # handles open until garbage collection.
            with open(pathPattern % i) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)

    vocabList = createVocabList(docList)

    # Move 10 randomly drawn document indices from the training set into
    # the test set.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
def calcMostFreq(vocabList, fullText):
    """Return the 30 most frequent vocabulary words in the text.

    Args:
        vocabList: Iterable of vocabulary words to rank.
        fullText: Iterable of all tokens in the corpus (with repeats).

    Returns:
        A list of at most 30 ``(word, count)`` tuples sorted by descending
        count. Words with equal counts keep their ``vocabList`` order because
        the sort is stable.
    """
    from collections import Counter

    # Count the corpus once; calling ``fullText.count`` per vocabulary word
    # would rescan the whole text for every word.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1),
                        reverse=True)
    return sortedFreq[:30]
def localWords(feed1, feed0):
    """Train and evaluate a classifier that tells two RSS feeds apart.

    Entry summaries from ``feed1`` are labelled 1 and those from ``feed0``
    are labelled 0. The 30 most frequent words are removed from the
    vocabulary, a random hold-out set is classified, and the error rate is
    printed.

    Args:
        feed1: Parsed feed, indexable as ``feed1['entries'][i]['summary']``.
        feed0: Parsed feed with the same structure.

    Returns:
        ``(vocabList, p0V, p1V)``: the pruned vocabulary and the per-word log
        conditional probabilities for class 0 and class 1.

    Raises:
        IndexError: Expected when either feed has no entries, since no
            training documents are then passed to ``trainNB0``.
    """
    docList = []
    classList = []
    fullText = []
    # Use the same number of entries from both feeds so classes are balanced.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = createVocabList(docList)
    # Drop the 30 most frequent words from the vocabulary.
    frequentWords = {word for word, _ in calcMostFreq(vocabList, fullText)}
    vocabList = [word for word in vocabList if word not in frequentWords]

    trainingSet = list(range(2 * minLen))
    # Hold out 20 documents, but always leave at least one for training.
    # Drawing a fixed 20 from a smaller corpus used to index an empty list.
    testSize = min(20, max(len(trainingSet) - 1, 0))
    testSet = []
    for _ in range(testSize):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
def getTopWords(ny, sf):
    """Write the words most associated with each feed to ``f:/test.txt``.

    Trains via ``localWords(ny, sf)`` (so ``ny`` is class 1 and ``sf`` is
    class 0), keeps words whose log conditional probability exceeds -6.0,
    and writes up to ten of the highest-scoring words per class, the SF
    section first and the NY section second.

    Args:
        ny: Parsed feed passed to ``localWords`` as the class-1 feed.
        sf: Parsed feed passed to ``localWords`` as the class-0 feed.
    """
    vocabList, p0V, p1V = localWords(ny, sf)

    topNY = []
    topSF = []
    for word, logProb0, logProb1 in zip(vocabList, p0V, p1V):
        if logProb0 > -6.0:
            topSF.append((word, logProb0))
        if logProb1 > -6.0:
            topNY.append((word, logProb1))

    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)[:10]
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)[:10]

    # Open the output only after training has succeeded, and let ``with``
    # close it so the data is flushed and the handle is not leaked.
    with open('f:/test.txt', 'w') as fw:
        fw.write('SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF*\n')
        for item in sortedSF:
            fw.write(item[0] + '\n')
        fw.write('NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY*\n')
        for item in sortedNY:
            fw.write(item[0] + '\n')
# Parse the New York and San Francisco Bay Area Craigslist RSS feeds (in that
# order), then report the top words for each.
ny, sf = (
    fp.parse(feedUrl)
    for feedUrl in (
        'http://newyork.craigslist.org/stp/index.rss',
        'http://sfbay.craigslist.org/stp/index.rss',
    )
)
getTopWords(ny, sf)