資源簡介
bayes.py 為主體代碼,可在終端中使用 python 調用該程序,代碼中包含中文注釋;資源中也包含測試集與訓練集。

代碼片段和文件信息
from math import log
import operator

import numpy as np
# NOTE: the star import must stay after `from math import log`; it rebinds
# `log` (and the builtin `sum`) to NumPy's array-aware versions.
from numpy import *
def loadDataSet():
    """Return a small hand-labelled corpus of tokenized posts.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        token lists, one per post. ``classVec`` holds one label per post:
        1 marks an abusive post, 0 a normal one.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive text, 0 = normal speech
    return postingList, classVec
def createVocabList(dataSet):
    """Build the list of distinct words that occur in any document.

    Args:
        dataSet: Iterable of documents, each an iterable of words.

    Returns:
        A list containing every distinct word exactly once. The list is
        sorted so that the word-to-column mapping is the same on every run
        (plain set iteration order is not stable across interpreter runs).
    """
    # set().union() with no arguments yields an empty set, so an empty
    # dataSet simply produces an empty vocabulary.
    vocabSet = set().union(*dataSet)
    return sorted(vocabSet)
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of words making up the document.

    Returns:
        A list of ints, the same length as ``vocabList``, with 1 where the
        vocabulary word occurs in ``inputSet`` and 0 elsewhere. Words that
        are not in the vocabulary are reported on stdout and otherwise
        ignored.
    """
    returnVec = [0] * len(vocabList)
    # Map each word to its first position once, instead of scanning the
    # vocabulary list twice (``in`` + ``index``) for every input word.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive-Bayes parameters for a two-class word model.

    Args:
        trainMatrix: 2-D array-like with one row per document and one column
            per vocabulary word (presence flags or counts).
        trainCategory: 1-D array-like of labels, one per document; 1 marks
            the abusive class, anything else is treated as class 0.

    Returns:
        A ``(p0Vect, p1Vect, pAbusive)`` tuple: the per-word
        log-probabilities for class 0 and class 1, and the prior
        probability of class 1.

    Raises:
        ValueError: If ``trainMatrix`` contains no documents.
    """
    trainMatrix = np.asarray(trainMatrix, dtype=float)
    trainCategory = np.asarray(trainCategory)
    numTrainDocs = len(trainMatrix)
    if numTrainDocs == 0:
        raise ValueError("trainMatrix must contain at least one document")

    pAbusive = trainCategory.sum() / float(numTrainDocs)
    isAbusive = trainCategory == 1
    abusiveDocs = trainMatrix[isAbusive]
    normalDocs = trainMatrix[~isAbusive]

    # Laplace smoothing: start every word count at 1 and every denominator
    # at 2 so that an unseen word never yields probability 0 (log -> -inf)
    # and a class with no words never divides by zero.
    p1Num = 1.0 + abusiveDocs.sum(axis=0)
    p0Num = 1.0 + normalDocs.sum(axis=0)
    p1Denom = 2.0 + abusiveDocs.sum()
    p0Denom = 2.0 + normalDocs.sum()

    # Return log-probabilities: classifyNB sums these and adds the log
    # prior, and working in log space avoids underflow on long documents.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the higher log-posterior score.

    Args:
        vec2Classify: 1-D array-like word vector of the document to classify.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    docVector = np.asarray(vec2Classify)
    # Score = sum of the log-likelihoods of the words present + log prior.
    p1 = np.dot(docVector, p1Vec) + np.log(pClass1)
    p0 = np.dot(docVector, p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0
def testingNB():
    """Train on the toy corpus and print the predicted class of two samples.

    Builds the vocabulary and the presence-vector training matrix from
    ``loadDataSet()``, fits the model with ``trainNB0`` and prints the
    ``classifyNB`` result for ``['love', 'my', 'dalmation']`` and
    ``['stupid']``.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid']):
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ',
              classifyNB(thisDoc, p0V, p1V, pAb))
def bagOfWords2Vec(vocabList, inputSet):
    """Convert a document into a word-count vector over the vocabulary.

    Unlike a presence (set-of-words) vector, where each word can
    contribute at most 1, every occurrence of a word is counted here.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of words making up the document.

    Returns:
        A list of ints, the same length as ``vocabList``, holding how many
        times each vocabulary word occurs in ``inputSet``. Words that are
        not in the vocabulary are silently ignored.
    """
    returnVec = [0] * len(vocabList)
    # Map each word to its first position once, instead of scanning the
    # vocabulary list twice (``in`` + ``index``) for every input word.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec
#使用貝葉斯算法實現垃圾郵件過濾
#將一個大字符串解析為字符串列表
def?textParse(bigString):
????import?re
????listOfTokens?=?re.spli
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2017-12-14 21:14  bayes\
     文件        5313  2017-12-14 20:56  bayes\bayes.py
     目錄           0  2011-12-20 11:36  bayes\email\
     目錄           0  2017-12-14 16:46  bayes\email\ham\
     文件         148  2010-10-23 17:11  bayes\email\ham\1.txt
     文件          86  2017-12-14 16:44  bayes\email\ham\10.txt
     文件         130  2017-12-14 16:44  bayes\email\ham\11.txt
     文件         184  2017-12-14 16:45  bayes\email\ham\12.txt
     文件         174  2010-10-23 17:13  bayes\email\ham\13.txt
     文件         172  2017-12-14 16:45  bayes\email\ham\14.txt
     文件         531  2017-12-14 16:37  bayes\email\ham\15.txt
     文件          91  2017-12-14 16:45  bayes\email\ham\16.txt
     文件         466  2017-12-14 16:45  bayes\email\ham\17.txt
     文件         177  2017-12-14 16:45  bayes\email\ham\18.txt
     文件         161  2017-12-14 16:46  bayes\email\ham\19.txt
     文件         239  2017-12-14 16:42  bayes\email\ham\2.txt
     文件         208  2010-10-23 09:26  bayes\email\ham\20.txt
     文件         236  2017-12-14 16:46  bayes\email\ham\21.txt
     文件         332  2017-12-14 16:46  bayes\email\ham\22.txt
     文件         607  2017-12-14 16:46  bayes\email\ham\23.txt
     文件          42  2010-10-23 09:33  bayes\email\ham\24.txt
     文件          89  2010-10-23 09:34  bayes\email\ham\25.txt
     文件         373  2017-12-14 16:43  bayes\email\ham\3.txt
     文件         213  2017-12-14 16:43  bayes\email\ham\4.txt
     文件         114  2010-10-23 17:11  bayes\email\ham\5.txt
     文件        1467  2017-12-14 15:44  bayes\email\ham\6.txt
     文件         109  2010-10-23 17:12  bayes\email\ham\7.txt
     文件         638  2010-10-23 08:58  bayes\email\ham\8.txt
     文件         148  2017-12-14 16:44  bayes\email\ham\9.txt
     目錄           0  2017-12-14 16:40  bayes\email\spam\
     文件         238  2010-10-23 08:28  bayes\email\spam\1.txt
............此處省略24個文件信息
評論
共有 條評論