資源簡介
樸素貝葉斯算法實現的郵件分類器,其中包括代碼和數據。文章鏈接為:https://blog.csdn.net/J__Max/article/details/82965180

代碼片段和文件信息
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Project: NB
# Author: jiangnan
# Mail: jiangnanmax@gmail.com
# Date: 2018/9/26
import numpy as np
def loadFile(filename):
    """
    Load a labelled mail data file.

    Every non-blank line is expected to hold an integer class label followed
    by the tokens of one mail, all separated by single spaces.

    :param filename:
        Path of the data file to read.
    :return:
        contentList - list of token lists, one per mail (label removed)
        classVec - list of integer class labels, parallel to contentList
    :raises ValueError:
        If the first field of a non-blank line is not an integer.
    """
    contentList = []
    classVec = []
    # The context manager closes the file even when parsing fails part-way.
    with open(filename) as fh:
        for line in fh:
            line = line.strip('\n')
            if not line:
                # Tolerate empty lines (e.g. a trailing blank line) instead
                # of crashing on int('').
                continue
            # Split on single spaces: first field is the label, the rest
            # are the mail's tokens.
            content = line.split(' ')
            classVec.append(int(content[0]))
            contentList.append(content[1:])
    return contentList, classVec
def createVocabList(dataSet):
    """
    Build the vocabulary of distinct tokens found in the training data.

    :param dataSet:
        Iterable of token lists, one per mail.
    :return:
        List of the distinct tokens, in order of first appearance. The
        fixed order keeps word vectors reproducible between runs.
    """
    seen = set()    # fast membership test
    vocab = []      # preserves first-seen order
    for content in dataSet:
        for word in content:
            if word not in seen:
                seen.add(word)
                vocab.append(word)
    return vocab
def Words_to_Vec(vocabList, wordsSet):
    """
    Turn one mail's tokens into a 0/1 vector over the vocabulary.

    Position i of the result is 1 when vocabList[i] occurs in wordsSet and
    0 otherwise. Tokens that are not in the vocabulary are reported on
    stdout and otherwise ignored.

    :param vocabList:
        The vocabulary (list of tokens).
    :param wordsSet:
        Tokens of a single mail.
    :return:
        List of 0/1 flags with the same length as vocabList.
    """
    # Map each token to its first position once, so each lookup is O(1)
    # instead of an `in` scan followed by a list.index() scan.
    position = {}
    for idx, word in enumerate(vocabList):
        position.setdefault(word, idx)

    returnVec = [0] * len(vocabList)
    for word in wordsSet:
        idx = position.get(word)
        if idx is None:
            print("The word %s is not in the VocabList!" % word)
        else:
            returnVec[idx] = 1
    return returnVec
def trainNB(trainMat, trainLabel):
    """
    Train a two-class naive Bayes model on 0/1 word vectors.

    Counts start at 1 and denominators at 2 (Laplace smoothing) so that no
    conditional probability is zero; the results are returned as logarithms
    so a classifier can add them instead of multiplying tiny numbers.

    :param trainMat:
        Training matrix: one word vector per mail, as produced by
        Words_to_Vec. Must contain at least one row.
    :param trainLabel:
        Class label of every row of trainMat (as returned by loadFile).
        Rows labelled 1 are counted for class 1; every other row is
        counted for class 0.
    :return:
        p0Vec - array of log P(word | class 0)
        p1Vec - array of log P(word | class 1)
        pNotAbusive - sum(trainLabel) / number of rows, i.e. the prior
                      probability of class 1 when labels are 0/1
    :raises IndexError:
        If trainMat is empty.
    :raises ValueError:
        If trainMat and trainLabel differ in length.
    """
    numTraindocs = len(trainMat)
    numWords = len(trainMat[0])     # length of every word vector
    if len(trainLabel) != numTraindocs:
        raise ValueError("trainMat and trainLabel must have the same length")

    pNotAbusive = sum(trainLabel) / float(numTraindocs)

    # Laplace smoothing: every word starts with a count of 1 ...
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    # ... and every class denominator starts at 2.
    p0Denom = 2.0
    p1Denom = 2.0

    for vec, label in zip(trainMat, trainLabel):
        row = np.asarray(vec, dtype=float)
        if label == 1:
            p1Num += row            # per-word counts for class 1
            p1Denom += row.sum()    # total words seen in class 1
        else:
            p0Num += row            # per-word counts for class 0
            p0Denom += row.sum()    # total words seen in class 0

    # Work in log space to avoid underflow when many factors are combined.
    p1Vec = np.log(p1Num / p1Denom)
    p0Vec = np.log(p0Num / p0Denom)
    return p0Vec, p1Vec, pNotAbusive
def?classifyNB(vec2Classify?p0Vec?p1Vec?pClass0):
????“““
????函數說明:
????????樸素貝葉斯分類函數
????:param?vec2Classify:
????????待分類的詞條向量
????:param?p0Vec:
????????侮辱類的條件概率數組
????:param?p1Vec:
????????非侮辱類的條件概率數組
????:param?pClass0:
????????文檔屬于侮辱類的概率
????:return:
????????0?-?文檔屬于侮辱類
????????1?-?文檔屬于分侮辱類
????“““
????p1?=?sum(vec2Classify?*?p1Vec)?+?np.log(pClass
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-10-08 11:01  NB\
     文件        6148  2018-10-08 11:01  NB\.DS_Store
     目錄           0  2018-10-08 11:01  __MACOSX\
     目錄           0  2018-10-08 11:01  __MACOSX\NB\
     文件         172  2018-10-08 11:01  __MACOSX\NB\._.DS_Store
     文件     2375391  2015-04-16 14:04  NB\spam_test.txt
     文件         172  2015-04-16 14:04  __MACOSX\NB\._spam_test.txt
     文件    10309828  2015-04-16 14:04  NB\spam_train.txt
     文件         172  2015-04-16 14:04  __MACOSX\NB\._spam_train.txt
     文件        5970  2018-10-07 12:30  NB\NB.py
     文件         172  2018-10-08 11:01  __MACOSX\._NB
- 上一篇:LadEditor.zip
- 下一篇:PUNTutorial教程譯文.pdf
評論
共有 條評論