Resource Description
Python source code that uses the naive Bayes algorithm to filter spam e-mail.
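
For reference, the classifier in the code below follows the standard naive Bayes decision rule: given a 0/1 word-indicator vector $x$ for a message, Laplace-smoothed word probabilities $P(w_i \mid c)$, and class prior $P(c)$, the predicted label is the class with the larger log score (this is a sketch of the rule the code computes, not part of the author's code):

\[
\hat{c} \;=\; \arg\max_{c \in \{\text{ham},\, \text{spam}\}} \Big( \log P(c) \;+\; \sum_{i:\, x_i = 1} \log P(w_i \mid c) \Big)
\]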

Code Snippet and File Information
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 11 00:02:15 2019
@author: 自律給我自由
"""
import xlrd
import re
import jieba
import math
import random
import numpy as np
import os
file = 'chinesespam.xlsx'
wb = xlrd.open_workbook(filename=file)
ws = wb.sheet_by_name('chinesespam.')
dataset = []
dataset1 = []
dataset2 = []
temp = []
vocabList = []
Percision = 0.0
Percision_All = 0.0
Recall = 0.0
Recall_All = 0.0
General = 0.0
General_All = 0.0
# column 0 holds the label: 'ham' -> 0, anything else (spam) -> 1
for i in range(1, ws.nrows):
    temp.append(ws.cell(i, 0).value)
    if 'ham' == temp[i-1]:
        dataset1.append(0)
    else:
        dataset1.append(1)

# column 1 holds the message text
for r in range(1, ws.nrows):
    dataset2.append(ws.cell(r, 1).value)

def textHandle(bigString):
    # Tokenize. Some examples found online also build a stop-word list and drop
    # words that add nothing to the meaning, such as "一天".
    list1 = jieba.lcut(bigString)
    newList = [re.sub(r'\W*', '', s) for s in list1]
    # drop characters that are not letters, digits, underscores, or Chinese characters
    return [tok.lower() for tok in newList if len(tok) > 0]
for i in range(len(dataset2)):   # store the tokenized messages in a list
    dataset.append(textHandle(dataset2[i]))

# for testing: print(dataset)
def createVocabList(dataSet):
    vocabSet = set([])  # create an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # take the union of the two sets
    return list(vocabSet)  # return the result as a list

vocabList = createVocabList(dataset)   # the vocabulary built from the tokenized messages
# print(createVocabList(dataset))  # for testing
def setOfWords2Vec(vocabList, inputSet):
    returnVec = np.zeros(len(vocabList))  # create an all-zero array
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1  # set to 1 if the word occurs
        else:
            print('the word: %s is not in my Vocabulary!' % word)
    return returnVec  # return the 0/1 vector
# for testing:
# for i in range(149):
#     print(setOfWords2Vec(vocabList, dataset[i]))
# for testing: print(len(vocabList))  # 8127
# to do next: reduce the ~8000 dimensions; after some thought, decided to use the set-of-words model
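
# As a quick illustration of setOfWords2Vec (a made-up three-word vocabulary for this
# example, not drawn from chinesespam.xlsx), every vocabulary position that occurs in
# the input is set to 1:
# toyVocab = ['免費', '會議', '發票']
# print(setOfWords2Vec(toyVocab, ['發票', '免費']))   # -> [1. 0. 1.]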
def trainNB(trainDataSet, trainLabels):
    numTrains = len(trainDataSet)    # number of training samples
    numWords = len(trainDataSet[0])  # length of each training vector
    pClass1 = sum(trainLabels) / float(numTrains)  # probability that a message is spam
    p0Num = np.ones(numWords)  # word counts in ham messages
    p1Num = np.ones(numWords)  # word counts in spam messages
    p0SumWords = 2.0           # total word count in ham messages
    p1SumWords = 2.0           # total word count in spam messages
    for i in range(numTrains):
        if trainLabels[i] == 1:
            p1Num += trainDataSet[i]    # accumulate spam word counts
        else:
            p0Num += trainDataSet[i]    # accumulate ham word counts
    p0SumWords = sum(p0Num)
    p1SumWords = sum(p1Num)
    p0Vect = p0Num / p0SumWords  # per-word probability in ham messages
    p1Vect = p1Num / p1SumWords  # per-word probability in spam messages
    return pClass1, p0Vect, p1Vect
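
# A tiny worked example (made-up vectors over a three-word vocabulary, assuming the
# trainNB above) shows the effect of the np.ones initialisation, i.e. Laplace smoothing:
# every word keeps a non-zero probability in both classes even if it never appears there.
# trainMat = [np.array([1, 0, 1]), np.array([0, 1, 1])]   # hypothetical ham and spam vectors
# labels = [0, 1]                                         # first message ham, second spam
# pSpam, p0V, p1V = trainNB(trainMat, labels)
# print(pSpam)   # 0.5
# print(p0V)     # [0.4 0.2 0.4]  ->  (1+1)/5, (1+0)/5, (1+1)/5
# print(p1V)     # [0.2 0.4 0.4]  ->  (1+0)/5, (1+1)/5, (1+1)/5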

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    temp0 = vec2Classify * p0Vec
    temp1 = vec2Classify * p1Vec
    temp00 = []
    temp11 = []
    for x in temp0:
        if x > 0:
            temp00.append(math.log(x))
        else:
            temp00.append(0)
    for x in temp1:
        if x > 0:
            temp11.append(math.log(x))
        else:
            temp11.append(0)
    p1 = sum(temp11) + math.log(pClass1)
    p0 = sum(temp00) + math.log(1.0 - pClass1)
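
The posted snippet is cut off at this point; the end of classifyNB and the evaluation code that presumably uses the Percision, Recall and General variables declared at the top are not shown. Below is a minimal sketch of how those missing pieces might look, assuming the definitions above and a random hold-out split. The comparison step, the split size, and the helper name classify are assumptions for illustration, not the author's code.

def classify(vec, p0Vec, p1Vec, pClass1):
    # hypothetical stand-in for the truncated end of classifyNB:
    # compare the two log scores and return 1 for spam, 0 for ham
    p1 = np.sum(np.log(p1Vec[vec > 0])) + math.log(pClass1)
    p0 = np.sum(np.log(p0Vec[vec > 0])) + math.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0

# assumed evaluation: hold out 30 messages at random, train on the rest
testIdx = random.sample(range(len(dataset)), 30)
trainIdx = [i for i in range(len(dataset)) if i not in testIdx]

trainMat = [setOfWords2Vec(vocabList, dataset[i]) for i in trainIdx]
trainLbl = [dataset1[i] for i in trainIdx]
pSpam, p0V, p1V = trainNB(trainMat, trainLbl)

tp = fp = fn = 0
for i in testIdx:
    pred = classify(setOfWords2Vec(vocabList, dataset[i]), p0V, p1V, pSpam)
    if pred == 1 and dataset1[i] == 1:
        tp += 1
    elif pred == 1 and dataset1[i] == 0:
        fp += 1
    elif pred == 0 and dataset1[i] == 1:
        fn += 1

Percision = tp / float(tp + fp) if tp + fp else 0.0   # precision on the spam class
Recall = tp / float(tp + fn) if tp + fn else 0.0      # recall on the spam class
print('precision: %.3f  recall: %.3f' % (Percision, Recall))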
 Attribute      Size   Date         Time    Name
 ---------   -------   ----------   -----   ----
 File           6937   2019-04-14   10:32   bayes.py
 File         114184   2019-04-14   00:44   chinesespam.xlsx
 ---------   -------   ----------   -----   ----
 Total        121121                        2 files