91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

  • 大小: 352KB
    文件類型: .rar
    金幣: 2
    下載: 0 次
    發布日期: 2021-05-15
  • 語言: 其他
  • 標籤:

資源簡介

基于樸素貝葉斯的垃圾郵件分類,對垃圾郵件的分類有較好的效果,達到99%。

資源截圖

代碼片段和文件信息

# -*- coding: utf-8 -*-

import?numpy?as?np

def textParser(text):
    """Split an SMS message into lowercase alphabetic words.

    Every character that is not an ASCII letter acts as a separator, so
    digits, punctuation and whitespace are discarded and never appear in
    the result.

    :param text: raw message text
    :return: list of lowercase words in order of appearance; the empty
        strings produced by adjacent separators are dropped
    """
    import re

    # Non-letters (digits included) are delimiters, leaving only words.
    regEx = re.compile(r'[^a-zA-Z]|\d')
    return [word.lower() for word in regEx.split(text) if word]


def loadSMSData(fileName):
    """Load a labelled SMS corpus from a tab-separated text file.

    Each usable line holds a label (``ham`` or ``spam``), a TAB, and the
    message text. Lines with no TAB-separated message or with any other
    label are skipped, so the two returned lists always stay aligned
    index-for-index.

    :param fileName: path of the corpus file
    :return: tuple ``(smsWords, classCategory)`` where ``smsWords`` is a
        list of word lists (one per message, produced by ``textParser``)
        and ``classCategory`` is the matching list of labels
        (1 = spam, 0 = ham)
    """
    classCategory = []  # class labels: 1 means spam, 0 means ham
    smsWords = []
    # The context manager closes the file even if parsing raises.
    with open(fileName) as f:
        for line in f:
            linedatas = line.strip().split('\t')
            if len(linedatas) < 2:
                # Blank or malformed line: there is no message to parse.
                continue
            if linedatas[0] == 'ham':
                label = 0
            elif linedatas[0] == 'spam':
                label = 1
            else:
                # Unknown label: skip the whole line instead of recording
                # words that have no category.
                continue
            classCategory.append(label)
            # Tokenize the message text.
            smsWords.append(textParser(linedatas[1]))
    return smsWords, classCategory


def createVocabularyList(smsWords):
    """Build the vocabulary of distinct words used across all messages.

    :param smsWords: iterable of word lists, one per message
    :return: list containing each distinct word exactly once; the order
        is whatever set iteration yields and is not guaranteed
    """
    vocabularySet = set()
    for words in smsWords:
        # Merge in place rather than allocating a new set per message.
        vocabularySet.update(words)
    return list(vocabularySet)


def getVocabularyList(fileName):
    """Read a vocabulary from the first line of a text file.

    Only the first line is read; it is stripped of surrounding whitespace
    and split on TAB characters.

    :param fileName: path of the vocabulary file
    :return: list of the TAB-separated entries on the first line
    """
    # The context manager closes the file even if reading raises.
    with open(fileName) as fr:
        return fr.readline().strip().split('\t')


def setOfWordsToVecTor(vocabularyList, smsWords):
    """Count how often each vocabulary word occurs in one message.

    Despite the "set of words" name, the result holds occurrence counts,
    not 0/1 flags. Words that are not in the vocabulary are ignored.

    :param vocabularyList: list of vocabulary words
    :param smsWords: list of words from a single message
    :return: list of ints with the same length as ``vocabularyList``;
        entry ``i`` is the number of times ``vocabularyList[i]`` appears
        in ``smsWords``
    """
    # Build the word -> position map once instead of scanning the list for
    # every word. setdefault keeps the first position of a duplicated
    # vocabulary entry, matching list.index().
    positionOf = {}
    for position, vocabWord in enumerate(vocabularyList):
        positionOf.setdefault(vocabWord, position)

    vocabMarked = [0] * len(vocabularyList)
    for smsWord in smsWords:
        position = positionOf.get(smsWord)
        if position is not None:
            vocabMarked[position] += 1
    return vocabMarked


def setOfWordsListToVecTor(vocabularyList, smsWordsList):
    """Convert every message into a vocabulary count vector.

    :param vocabularyList: list of vocabulary words
    :param smsWordsList: list of word lists, one per message
    :return: list with one vector per message, in input order, each
        produced by ``setOfWordsToVecTor``
    """
    return [
        setOfWordsToVecTor(vocabularyList, smsWords)
        for smsWords in smsWordsList
    ]


def?trainingNaiveBayes(trainMarkedWords?trainCategory):
????“““
????訓(xùn)練數(shù)據(jù)集中獲取語料庫中詞匯的spamicity:P(Wi|S)
????:param?trainMarkedWords:?按照語料庫標(biāo)記的數(shù)據(jù),二維數(shù)組
????:param?trainCategory:
????:return:
????“““
????numTrainDoc?=?len(trainMarkedWords)
????numWords?=?len(trainMarkedWords[0])
????#?是垃圾郵件的先驗(yàn)概率P(S)
????pSpam?=?sum(trainCategory)?/?float(numTrainDoc)

????#?統(tǒng)計(jì)語料庫中詞匯在S和H中出現(xiàn)的次數(shù)
????wordsInSpamNum?=?np.ones(numWords)
????wordsInHealthNum?=?np.ones(numWords)
????spamWordsNum?=?2.0
????healthWordsNum?=?2.0
????for?i?in?range(0?numTrainDoc):
????????if?trainCategory[i]?==?1:??#?如果是垃圾SMS或郵件
????????????wordsInSpamNum?+=?trainMarkedWords[i]
????????????spamWordsNum?+=?sum(trainMarkedWords[i])??#?統(tǒng)計(jì)Spam中語料庫中詞匯出現(xiàn)的總次數(shù)
????????else:
????????????wordsInHealthNum?+=?trainMarkedWords[i]
????????????healthWo

 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----

     文件       5111  2018-05-07 16:36  SMS\NaiveBayes\NaiveBayes.py

     文件       4842  2017-04-22 10:48  SMS\NaiveBayes\NaiveBayes.pyc

     文件         14  2017-04-19 16:33  SMS\NaiveBayes\pSpam.txt

     文件     210195  2017-04-19 16:33  SMS\NaiveBayes\pWordsHealthy.txt

     文件     210195  2017-04-19 16:33  SMS\NaiveBayes\pWordsSpamicity.txt

     文件     198723  2017-04-22 13:55  SMS\NaiveBayes\ROC Curve.png

     文件       3876  2017-04-22 13:53  SMS\NaiveBayes\SenSpeciList0.csv

     文件       3876  2017-04-22 13:53  SMS\NaiveBayes\SenSpeciList1.csv

     文件       3876  2017-04-22 13:54  SMS\NaiveBayes\SenSpeciList2.csv

     文件       3876  2017-04-22 13:54  SMS\NaiveBayes\SenSpeciList3.csv

     文件       3952  2017-04-22 13:55  SMS\NaiveBayes\SenSpeciList4.csv

     文件     477907  2011-03-15 22:36  SMS\NaiveBayes\SMSSpamCollection.txt

     文件       3239  2018-05-07 16:36  SMS\NaiveBayes\test.py

     文件       1942  2017-04-20 11:15  SMS\NaiveBayes\test.pyc

     文件        802  2018-05-07 16:36  SMS\NaiveBayes\TestPlot.py

     文件       1141  2018-05-07 16:36  SMS\NaiveBayes\training.py

     文件      54677  2017-04-19 16:33  SMS\NaiveBayes\vocabularyList.txt

     文件         58  2017-04-19 16:32  SMS\NaiveBayes\__init__.py

     目錄          0  2017-04-23 08:39  SMS\NaiveBayes

     目錄          0  2017-04-23 08:39  SMS

----------- ---------  ---------- -----  ----

              1188302                    20


評論

共有 條評論

相關(guān)資源