資源簡介
基於樸素貝葉斯的垃圾郵件分類,對垃圾郵件的分類有較好的效果,達到99%。

代碼片段和文件信息
#?-*-?coding:?utf-8?-*-
import?numpy?as?np
def textParser(text):
    """
    Split an SMS message into lowercase alphabetic tokens.

    Every character that is not an ASCII letter is treated as a delimiter,
    so digits, punctuation and whitespace never appear in the output.

    :param text: raw message text
    :return: list of lowercase words, with empty fragments removed
    """
    import re
    # The ``|\d`` alternative is redundant (digits are already non-letters)
    # but is kept so the pattern stays identical to the original.
    regEx = re.compile(r'[^a-zA-Z]|\d')
    # Adjacent delimiters make split() yield empty strings; drop them and
    # normalise the remaining words to lowercase.
    return [word.lower() for word in regEx.split(text) if word]
def loadSMSData(fileName):
    """
    Load a labelled SMS corpus from a tab-separated text file.

    Each usable line has the form ``<label><TAB><message>`` where the label
    is ``ham`` (normal SMS) or ``spam``. Lines that have no message field
    or carry any other label are skipped, so the two returned lists always
    have the same length and stay aligned index by index.

    :param fileName: path of the corpus file
    :return: tuple ``(smsWords, classCategory)``; ``smsWords`` is a list of
             token lists produced by ``textParser`` and ``classCategory``
             is the matching list of labels (1 = spam, 0 = ham)
    """
    labelCodes = {'ham': 0, 'spam': 1}
    classCategory = []  # class labels: 1 means spam SMS, 0 means normal SMS
    smsWords = []
    # The context manager closes the file even if parsing raises.
    with open(fileName) as f:
        for line in f:
            linedatas = line.strip().split('\t')
            # Skip blank/malformed lines and unknown labels; appending the
            # words without a label would misalign the two lists.
            if len(linedatas) < 2 or linedatas[0] not in labelCodes:
                continue
            classCategory.append(labelCodes[linedatas[0]])
            # Tokenise the message text.
            smsWords.append(textParser(linedatas[1]))
    return smsWords, classCategory
def createVocabularyList(smsWords):
    """
    Build the vocabulary (the set of distinct words) of a tokenised corpus.

    :param smsWords: iterable of token lists, one per message
    :return: list of the unique words, sorted so that the order (and thus
             every feature-vector index derived from it) is reproducible
             from run to run
    """
    # A single union over all messages avoids rebuilding the set per message.
    vocabularySet = set().union(*smsWords)
    return sorted(vocabularySet)
def getVocabularyList(fileName):
    """
    Read the vocabulary from a vocabulary-list file.

    Only the first line of the file is read; it is expected to hold the
    words separated by tab characters.

    :param fileName: path of the vocabulary file
    :return: list of words; an empty list when the first line is empty
    """
    # The context manager closes the file even if reading raises.
    with open(fileName) as fr:
        firstLine = fr.readline().strip()
    # ''.split('\t') would yield [''], i.e. a bogus one-word vocabulary.
    if not firstLine:
        return []
    return firstLine.split('\t')
def setOfWordsToVecTor(vocabularyList, smsWords):
    """
    Convert one tokenised message into a word-count vector.

    Position ``i`` of the result holds how many times ``vocabularyList[i]``
    occurs in ``smsWords``. Words that are not in the vocabulary are
    ignored.

    :param vocabularyList: list of vocabulary words
    :param smsWords: list of tokens of a single message
    :return: list of counts with the same length as ``vocabularyList``
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list twice (``in`` + ``index``) for every token.
    wordIndex = {}
    for position, word in enumerate(vocabularyList):
        # setdefault keeps the first occurrence, matching list.index().
        wordIndex.setdefault(word, position)

    vocabMarked = [0] * len(vocabularyList)
    for smsWord in smsWords:
        position = wordIndex.get(smsWord)
        if position is not None:
            vocabMarked[position] += 1
    return vocabMarked
def setOfWordsListToVecTor(vocabularyList, smsWordsList):
    """
    Convert a list of tokenised messages into a list of word-count vectors.

    :param vocabularyList: list of vocabulary words
    :param smsWordsList: list of token lists, one per message
    :return: list with one count vector per message, in input order; each
             vector is produced by ``setOfWordsToVecTor``
    """
    return [setOfWordsToVecTor(vocabularyList, smsWords)
            for smsWords in smsWordsList]
def?trainingNaiveBayes(trainMarkedWords?trainCategory):
????“““
????訓(xùn)練數(shù)據(jù)集中獲取語料庫中詞匯的spamicity:P(Wi|S)
????:param?trainMarkedWords:?按照語料庫標(biāo)記的數(shù)據(jù),二維數(shù)組
????:param?trainCategory:
????:return:
????“““
????numTrainDoc?=?len(trainMarkedWords)
????numWords?=?len(trainMarkedWords[0])
????#?是垃圾郵件的先驗(yàn)概率P(S)
????pSpam?=?sum(trainCategory)?/?float(numTrainDoc)
????#?統(tǒng)計(jì)語料庫中詞匯在S和H中出現(xiàn)的次數(shù)
????wordsInSpamNum?=?np.ones(numWords)
????wordsInHealthNum?=?np.ones(numWords)
????spamWordsNum?=?2.0
????healthWordsNum?=?2.0
????for?i?in?range(0?numTrainDoc):
????????if?trainCategory[i]?==?1:??#?如果是垃圾SMS或郵件
????????????wordsInSpamNum?+=?trainMarkedWords[i]
????????????spamWordsNum?+=?sum(trainMarkedWords[i])??#?統(tǒng)計(jì)Spam中語料庫中詞匯出現(xiàn)的總次數(shù)
????????else:
????????????wordsInHealthNum?+=?trainMarkedWords[i]
????????????healthWo
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       5111  2018-05-07 16:36  SMS\NaiveBayes\NaiveBayes.py
     文件       4842  2017-04-22 10:48  SMS\NaiveBayes\NaiveBayes.pyc
     文件         14  2017-04-19 16:33  SMS\NaiveBayes\pSpam.txt
     文件     210195  2017-04-19 16:33  SMS\NaiveBayes\pWordsHealthy.txt
     文件     210195  2017-04-19 16:33  SMS\NaiveBayes\pWordsSpamicity.txt
     文件     198723  2017-04-22 13:55  SMS\NaiveBayes\ROC Curve.png
     文件       3876  2017-04-22 13:53  SMS\NaiveBayes\SenSpeciList0.csv
     文件       3876  2017-04-22 13:53  SMS\NaiveBayes\SenSpeciList1.csv
     文件       3876  2017-04-22 13:54  SMS\NaiveBayes\SenSpeciList2.csv
     文件       3876  2017-04-22 13:54  SMS\NaiveBayes\SenSpeciList3.csv
     文件       3952  2017-04-22 13:55  SMS\NaiveBayes\SenSpeciList4.csv
     文件     477907  2011-03-15 22:36  SMS\NaiveBayes\SMSSpamCollection.txt
     文件       3239  2018-05-07 16:36  SMS\NaiveBayes\test.py
     文件       1942  2017-04-20 11:15  SMS\NaiveBayes\test.pyc
     文件        802  2018-05-07 16:36  SMS\NaiveBayes\TestPlot.py
     文件       1141  2018-05-07 16:36  SMS\NaiveBayes\training.py
     文件      54677  2017-04-19 16:33  SMS\NaiveBayes\vocabularyList.txt
     文件         58  2017-04-19 16:32  SMS\NaiveBayes\__init__.py
     目錄          0  2017-04-23 08:39  SMS\NaiveBayes
     目錄          0  2017-04-23 08:39  SMS
----------- ---------  ---------- -----  ----
              1188302                    20
- 上一篇:雙閉環(huán)逆變器
- 下一篇:微機(jī)原理.docx
評論
共有 條評論