-
大小: 56KB 文件類型: .rar 金幣: 2 下載: 0 次 發布日期: 2021-06-02
- 語言: 其他
- 標簽: python bayes classification
資源簡介
簡單的郵件分類,使用bayes分類器,100行正常郵件和100行垃圾郵件。

代碼片段和文件信息
import numpy as np
from numpy import *
def textParse(bigString):
    """Split raw text into tokens and keep those longer than two characters.

    Args:
        bigString: The text of one document (one e-mail) as a ``str``.

    Returns:
        A list of the tokens whose length is greater than 2, in the order
        they appear in ``bigString``. Case is left untouched.
    """
    import re

    # Split on runs of non-word characters (\W+), not on \W*: a pattern that
    # can match the empty string makes re.split() (Python 3.7+) cut between
    # every single character, so no token would survive the length filter.
    listOfTokens = re.split(r"\W+", bigString)
    return [tok for tok in listOfTokens if len(tok) > 2]
def createVocabList(dataSet):
    """Build the vocabulary: every distinct token found in any document.

    Args:
        dataSet: An iterable of documents, each an iterable of tokens.

    Returns:
        A sorted list of the unique tokens. Sorting makes the column order of
        the resulting feature vectors reproducible from run to run (plain
        ``list(set(...))`` order depends on string hashing).
    """
    vocabSet = set()
    # One in-place union over all documents instead of rebuilding the set
    # with ``|`` for every document.
    vocabSet.update(*dataSet)
    return sorted(vocabSet)
def bagOfWord2Vec(vocabList, inputSet):
    """Convert a document into a bag-of-words count vector.

    Args:
        vocabList: The vocabulary as a list of tokens; position ``i`` of the
            result counts occurrences of ``vocabList[i]``.
        inputSet: The tokens of one document (repetitions are counted).

    Returns:
        A list of ints with the same length as ``vocabList``. Tokens that are
        not in the vocabulary are ignored.
    """
    # Map each word to its column once, so every lookup is O(1) instead of
    # the linear ``in`` + ``list.index`` scans. ``setdefault`` keeps the
    # first position if a word appears twice, which is what ``index`` gives.
    column = {}
    for idx, word in enumerate(vocabList):
        column.setdefault(word, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = column.get(word)
        if idx is not None:
            returnVec[idx] += 1
    return returnVec
def trainNB(trainMat, trainGategory):
    """Estimate naive-Bayes parameters from bag-of-words training data.

    Args:
        trainMat: 2-D array-like with one row per e-mail and one column per
            vocabulary word (word counts).
        trainGategory: 1-D array-like of class labels, one per row; a label
            equal to 1 marks spam, any other value is treated as normal mail.

    Returns:
        A tuple ``(p0Vec, p1Vec, pAbusive)`` where ``p0Vec`` / ``p1Vec`` are
        arrays of log word probabilities for normal / spam mail and
        ``pAbusive`` is the spam prior, computed as the sum of the labels
        divided by the number of e-mails.

    Raises:
        ValueError: If ``trainMat`` is not a non-empty 2-D matrix.
    """
    matrix = np.asarray(trainMat, dtype=float)
    labels = np.asarray(trainGategory)
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError("trainMat must be a non-empty 2-D matrix")

    numTrain = matrix.shape[0]  # total number of e-mails
    pAbusive = labels.sum() / float(numTrain)  # prior probability of spam

    isSpam = labels == 1
    spamRows = matrix[isSpam]
    hamRows = matrix[~isSpam]

    # Smoothing: every word count starts at 1 and every denominator at 2.0,
    # so a word never seen in a class does not produce log(0).
    p1Num = 1.0 + spamRows.sum(axis=0)  # per-word counts over all spam
    p0Num = 1.0 + hamRows.sum(axis=0)  # per-word counts over all normal mail
    p1Denom = 2.0 + spamRows.sum()  # total number of words seen in spam
    p0Denom = 2.0 + hamRows.sum()  # total number of words seen in normal mail

    # Logs are returned so that the classifier can add scores instead of
    # multiplying many small probabilities.
    p1Vec = np.log(p1Num / p1Denom)
    p0Vec = np.log(p0Num / p0Denom)
    return p0Vec, p1Vec, pAbusive
# classify function
def classfy(vec2classfy, p0Vec, p1Vec, pClass1):
    """Classify one bag-of-words vector with the trained naive-Bayes model.

    Args:
        vec2classfy: 1-D array-like of word counts for the document.
        p0Vec: Log word probabilities for class 0.
        p1Vec: Log word probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0 log
        score, otherwise 0 (ties go to class 0).
    """
    counts = np.asarray(vec2classfy)
    # Log-space scores: sum(count * log P(word | class)) + log P(class).
    p1 = np.dot(counts, p1Vec) + np.log(pClass1)
    p0 = np.dot(counts, p0Vec) + np.log(1 - pClass1)
    return 1 if p1 > p0 else 0
def testmain():
    """Train the spam classifier on the bundled data and evaluate it.

    Reads 100 lines from ``spam_100.utf8`` (label 1) and 100 lines from
    ``ham_100.utf8`` (label 0) as training documents, one e-mail per line,
    then reads 100 lines from ``test.utf8``; the first 50 are labelled 0 and
    the next 50 are labelled 1. Every misclassified test document is printed.

    Returns:
        The fraction of test documents that were misclassified.
    """
    def readDocs(path, count):
        # Tokenise the first ``count`` lines of ``path``; the ``with`` block
        # guarantees the handle is closed (the files used to be left open).
        with open(path, 'rb') as handle:
            return [textParse(handle.readline().decode('utf-8'))
                    for _ in range(count)]

    spamDocs = readDocs('spam_100.utf8', 100)
    hamDocs = readDocs('ham_100.utf8', 100)
    docList = spamDocs + hamDocs
    classList = [1] * len(spamDocs) + [0] * len(hamDocs)
    vocabList = createVocabList(docList)

    # select test data
    tdocList = readDocs('test.utf8', 100)
    tclassList = [0] * 50 + [1] * 50

    trainMat = [bagOfWord2Vec(vocabList, doc) for doc in docList]
    p0, p1, pSpam = trainNB(np.array(trainMat), np.array(classList))

    errCount = 0
    for doc, label in zip(tdocList, tclassList):
        wordVec = bagOfWord2Vec(vocabList, doc)
        if classfy(np.array(wordVec), p0, p1, pSpam) != label:
            errCount += 1
            print("classcification error:", doc)
    # The error counter was previously computed and then discarded.
    return errCount / float(len(tdocList))
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        740  2018-05-13 20:14  bayes\.idea\bayes.iml
     文件       1053  2018-05-13 19:00  bayes\.idea\modules.xml
     文件      32174  2018-05-13 20:22  bayes\.idea\workspace.xml
     文件       3491  2018-05-02 14:31  bayes\bayes.py
     文件      59489  2018-04-22 20:15  bayes\ham_100.utf8
     文件      44997  2018-04-22 20:15  bayes\spam_100.utf8
     文件      38749  2018-04-22 20:15  bayes\test.utf8
     目錄          0  2018-05-13 20:22  bayes\.idea
     目錄          0  2018-05-02 14:31  bayes
----------- ---------  ---------- -----  ----
               180693                    9
評論
共有 條評論