資源簡介
包含25封正常郵件、25封垃圾郵件以及分類器源代碼，適合ML初學者使用。

代碼片段和文件信息
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 26 16:45:49 2018
@author: user
"""
#出現只算1次
import math
import re
from string import punctuation
class?EmailClassifier(object):
????def?__init__(selfham_path=“ham/“spam_path=“spam/“tr_num=16):
????????self.ham_path=ham_path
????????self.spam_path=spam_path
????????self.tr_num=tr_num
????def?getPosibility(self):
????????self.ham_in_fre={}
????????self.spam_in_fre={}??
????????self.ham_notin_fre={}
????????self.spam_notin_fre={}
????????self.ham_in_dir={}
????????self.spam_in_dir={}
????????self.ham_notin_dir={}
????????self.spam_notin_dir={}
????????for?i?in?range(self.tr_num):
????????????f=open(self.ham_path+str(i+1)+“.txt““r“)
????????????message=f.readlines()
????????????f.close()
????????????tmp=re.sub(r‘[{}]+‘.format(punctuation)‘‘?str(message)).lower().split()?????????????#將字符串變?yōu)榱斜?br/>????????????t=[]
????????????for?w?in?tmp:???????????????????#將列表變?yōu)樽值?br/>????????????????if?w?in?t:
????????????????????continue
????????????????else:
????????????????????t.append(w)
????????????for?w?in?t:?????????????????????#將該信件內(nèi)容統(tǒng)計(jì)并入字典
????????????????if?w?in?self.ham_in_dir:
????????????????????self.ham_in_dir[w]+=1
????????????????else:
????????????????????self.ham_in_dir[w]=1
????????????????????self.spam_in_dir[w]=0
????????????f=open(self.spam_path+str(i+1)+“.txt““r“)
????????????message=f.readlines()
????????????f.close()
????????????tmp=re.sub(r‘[{}]+‘.format(punctuation)‘‘?str(message)).lower().split()?????????????#將字符串變?yōu)榱斜?br/>????????????t=[]???????????
????????????for?w?in?tmp:???????????????????#將列表變?yōu)樽值?br/>????????????????if?w?in?t:
????????????????????continue
????????????????else:
????????????????????t.append(w)
????????????for?w?in?t:?????????????????????#將該信件內(nèi)容統(tǒng)計(jì)并入字典
????????????????if?w?in?self.spam_in_dir:
????????????????????self.spam_in_dir[w]+=1
????????????????else:
????????????????????self.ham_in_dir[w]=0
????????????????????self.spam_in_dir[w]=1??????????????????#將該信件內(nèi)容統(tǒng)計(jì)并入字典
????????self.p_ham=0.5
????????self.p_spam=0.5
????????for?w?in?self.ham_in_dir:
????????????self.ham_in_fre[w]=(self.ham_in_dir[w]+1)/(self.tr_num+2)
????????????self.ham_notin_fre[w]=(self.tr_num-self.ham_in_dir[w]+1)/(self.tr_num+2)
????????????self.spam_in_fre[w]=(self.spam_in_dir[w]+1)/(self.tr_num+2)
????????????self.spam_notin_fre[w]=(self.tr_num-self.spam_in_dir[w]+1)/(self.tr_num+2)
????def?test(self):
????????correct_number=0
????????print(“ham中:“)
????????for?i?in?range(25-self.tr_num):
????????????f=open(self.ham_path+str(i+self.tr_num+1)+“.txt““r“)
????????????message=f.readlines()
????????????f.close()
????????????
????????????tmp=re.sub(r‘[{}]+‘.format(punctuation)‘‘?str(message)).lower().split()?????????????#將字符串變?yōu)榱斜?br/>????????????t=[]
????????????for?w?in?tmp:???????????????????#將列表變?yōu)樽值?br/>????????????????if?w?in?t:
????????????????????continue
????????????????else:
????????????????????t.append(w)
????????????p_h=math.log10(s
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       5140  2018-02-27 19:47  EmailClassifier.py
     文件        148  2010-10-23 17:11  ham\1.txt
     文件         86  2010-10-23 17:13  ham\10.txt
     文件        130  2010-10-23 17:13  ham\11.txt
     文件        182  2010-10-23 09:16  ham\12.txt
     文件        174  2010-10-23 17:13  ham\13.txt
     文件        172  2010-10-23 17:13  ham\14.txt
     文件        531  2010-10-23 09:21  ham\15.txt
     文件         90  2010-10-23 09:21  ham\16.txt
     文件        464  2010-10-23 09:22  ham\17.txt
     文件        175  2010-10-23 09:23  ham\18.txt
     文件        161  2010-10-23 17:14  ham\19.txt
     文件        234  2010-10-23 08:48  ham\2.txt
     文件        208  2010-10-23 09:26  ham\20.txt
     文件        234  2010-10-23 09:27  ham\21.txt
     文件        330  2010-10-23 09:28  ham\22.txt
     文件        607  2018-02-27 19:40  ham\23.txt
     文件         42  2010-10-23 09:33  ham\24.txt
     文件         89  2010-10-23 09:34  ham\25.txt
     文件        371  2010-10-23 08:49  ham\3.txt
     文件        207  2010-10-23 08:50  ham\4.txt
     文件        114  2010-10-23 17:11  ham\5.txt
     文件       1464  2010-10-23 17:12  ham\6.txt
     文件        109  2010-10-23 17:12  ham\7.txt
     文件        638  2010-10-23 08:58  ham\8.txt
     文件        146  2010-10-23 09:01  ham\9.txt
     文件        238  2010-10-23 08:28  spam\1.txt
     文件        217  2010-10-23 08:36  spam\10.txt
     文件        414  2010-10-23 08:37  spam\11.txt
     文件        188  2010-10-23 08:37  spam\12.txt
............此處省略26個文件信息
評論
共有 條評(píng)論