91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

  • 大小: 39KB
    文件類型: .zip
    金幣: 2
    下載: 0 次
    發布日期: 2021-06-05
  • 語言: 其他
  • 標簽: 垃圾郵件

資源簡介

垃圾郵件分類實驗數據,包含25個垃圾郵件和25個正常郵件的txt文件,可用于機器學習實驗

資源截圖

代碼片段和文件信息

import?numpy?as?np
import?re
import?os
import?random
import?numpy?as?np

# Process the files under the given path
def load_data(folder_path):
    """Load and tokenize the spam/ham e-mail corpus stored under *folder_path*.

    For every ``i`` from 1 to 25 the files ``spam/<i>.txt`` and
    ``ham/<i>.txt`` are read (in that order) and converted to word lists
    with ``doc2words_list``.

    Args:
        folder_path: Directory that contains the ``spam`` and ``ham``
            sub-directories.

    Returns:
        A ``(doc_list, label)`` tuple. ``doc_list`` holds one word list per
        message; ``label`` holds the matching class ids (1 for spam, 0 for
        ham), so the two lists alternate spam, ham, spam, ham, ...

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    doc_list = []
    label = []
    for i in range(1, 26):
        for sub_dir, class_id in (('spam', 1), ('ham', 0)):
            # Build the path explicitly instead of calling os.chdir(), which
            # would change the working directory of the whole process.
            file_name = os.path.join(folder_path, sub_dir, '{0}.txt'.format(i))
            # NOTE(review): errors='ignore' assumes stray undecodable bytes in
            # a message should be dropped rather than abort the load - confirm.
            with open(file_name, errors='ignore') as handle:
                # Convert the file into a list of words
                words_list = doc2words_list(handle.read())
            # Collect every document together with its class id
            doc_list.append(words_list)
            label.append(class_id)
    return doc_list, label

# Convert a document into a list of words
def doc2words_list(doc):
    """Split *doc* into lower-cased tokens longer than two characters.

    The text is split on runs of non-word characters. The pattern is
    ``\\W+`` rather than ``\\W*``: a pattern that can match the empty string
    makes ``re.split`` (Python 3.7+) cut between every character, which
    would leave only one-character tokens and therefore an empty result.

    Args:
        doc: The raw document text.

    Returns:
        A list of lower-cased tokens whose length is greater than 2, in
        document order (duplicates are kept).
    """
    string_list = re.split(r'\W+', doc)
    # Very short tokens ("a", "to", ...) and the empty strings produced at
    # the text boundaries are discarded by the length filter.
    words_list = [word.lower() for word in string_list if len(word) > 2]
    return words_list

# Build the vocabulary
def create_vocab_list(data_set):
    """Return the distinct words that occur in any document of *data_set*.

    Args:
        data_set: An iterable of documents, each an iterable of words.

    Returns:
        A list containing every distinct word exactly once. The list is
        sorted so that the position of each word (and therefore every
        feature vector built from it) is reproducible between runs instead
        of depending on set iteration order.
    """
    vocab_set = set()
    for document in data_set:
        vocab_set.update(document)
    return sorted(vocab_set)


# Set-of-words model
def word2vec(vocab_list, input_set):
    """Encode a document as a 0/1 presence vector over *vocab_list*.

    Args:
        vocab_list: The vocabulary; position ``k`` of the result refers to
            ``vocab_list[k]``.
        input_set: The words of the document to encode.

    Returns:
        A list of ``len(vocab_list)`` integers where an entry is 1 if the
        corresponding vocabulary word occurs in *input_set* and 0 otherwise.
        Words missing from the vocabulary are reported on stdout and do not
        affect the vector.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list again for every word of the document.
    index_of = {}
    for position, word in enumerate(vocab_list):
        index_of.setdefault(word, position)

    return_vec = [0] * len(vocab_list)
    for word in input_set:
        position = index_of.get(word)
        if position is None:
            print('The word {0} is not in my vocab_list'.format(word))
        else:
            return_vec[position] = 1
    return return_vec


# Bag-of-words model
def bag_word2vec(vocab_list, input_set):
    """Encode a document as a word-count vector over *vocab_list*.

    Args:
        vocab_list: The vocabulary; position ``k`` of the result refers to
            ``vocab_list[k]``.
        input_set: The words of the document to encode (duplicates count).

    Returns:
        A list of ``len(vocab_list)`` integers where each entry is the
        number of times the corresponding vocabulary word occurs in
        *input_set*. Words missing from the vocabulary are ignored.
    """
    index_of = {}
    for position, word in enumerate(vocab_list):
        index_of.setdefault(word, position)

    return_vec = [0] * len(vocab_list)
    # Count the words of the document, not the words of the vocabulary:
    # looping over vocab_list would yield all ones for every document.
    for word in input_set:
        position = index_of.get(word)
        if position is not None:
            return_vec[position] += 1
    return return_vec

# Train the naive Bayes model
def train_bayes(train_mat, label):
    """Estimate the naive Bayes parameters from vectorized documents.

    Args:
        train_mat: Sequence of equally long feature vectors, one per
            training document.
        label: Class id of each training document; 1 selects class 1 and
            any other value is treated as class 0.

    Returns:
        A ``(p0, p1_vec, p0_vec)`` tuple:

        * ``p0`` - prior probability of class 0, i.e. the fraction of
          training documents whose label is not 1.
        * ``p1_vec`` - log conditional probability of each word given
          class 1.
        * ``p0_vec`` - log conditional probability of each word given
          class 0.

    Raises:
        IndexError: If *train_mat* is empty or *label* is shorter than it.
    """
    num_train_docs = len(train_mat)
    num_words = len(train_mat[0])

    docs = np.asarray(train_mat, dtype=float)
    is_class1 = np.asarray(label)[:num_train_docs] == 1

    # Prior of class 0. The fraction of class-1 labels is P(class 1), so it
    # has to be complemented; bayes_classify adds log(p0) to the class-0
    # score and log(1 - p0) to the class-1 score.
    p0 = float(1.0 - is_class1.sum() / num_train_docs)

    # Per-class word counts start at 1 and the totals at 2.0 so that a word
    # never seen in a class does not get probability zero (log(0)).
    p1_num = np.ones(num_words) + docs[is_class1].sum(axis=0)
    p0_num = np.ones(num_words) + docs[~is_class1].sum(axis=0)
    p1_denom = 2.0 + docs[is_class1].sum()
    p0_denom = 2.0 + docs[~is_class1].sum()

    # Conditional probability: occurrences of a word in a class divided by
    # the number of words in that class; kept in log space to avoid
    # underflow when many of them are combined.
    p1_vec = np.log(p1_num / p1_denom)
    p0_vec = np.log(p0_num / p0_denom)
    return p0, p1_vec, p0_vec

# Naive Bayes classifier
def bayes_classify(test_arr, p0_vec, p1_vec, p0):
    """Classify one feature vector with the trained naive Bayes parameters.

    Args:
        test_arr: Feature vector of the document to classify.
        p0_vec: Log conditional word probabilities for class 0.
        p1_vec: Log conditional word probabilities for class 1.
        p0: Prior probability of class 0; ``1 - p0`` is used as the prior
            of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties go to class 0).
    """
    # Log scores: sum of the log likelihoods of the present words plus the
    # log prior. Separate names keep the ``p0`` argument from being
    # overwritten by a score.
    log_score1 = np.dot(test_arr, p1_vec) + np.log(1 - p0)
    log_score0 = np.dot(test_arr, p0_vec) + np.log(p0)
    return 1 if log_score1 > log_score0 else 0


if?__name__?==?‘__main__‘:

folder_path?=?‘/Users/akira/Documents/books/MLinAction/machinelearninginaction/Ch04/email/‘
doc_list?label?=?load_data(folder_path)
vocab_list?=?create_vocab_list(doc_list)

#?產生交叉驗證集
all_set?=?set(range(50))
test_set?=?set()
for?i?in?range(10):
test_set.add(random.randint(0?49))
train_set?=?all_set?-?test_set

#?單詞轉換成詞集向量
word_set?=?[]
for?i?in?range(50):
word_set.append(word2vec(vocab_list?doc_list[i]))

#?產生訓練集和測試集
train_mat?=?[]
test_mat?=?[]
train_label?=?[]
test_label?=?[]
print(‘train_set:‘?train_set)
print(‘test_set‘?test_set)
for?i?in?train_set:
train_mat.append(word_set[i])
train_label.append(label[i])
for?i?in?test_set:
test_mat.append(word_set[i])
test_label.append(label[i])


#?訓練并測試
p0?p1_vec?p0_vec?=?train_bayes(train_mat?train_label)
error_cnt?=?0
for?i?in?range(len(test_mat)):
label_bayes?=?bayes_classify(test_mat[i]?p0_vec?p1_vec?p

 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-03-19 17:35  upload\
     文件        6148  2018-03-19 17:35  upload\.DS_Store
     目錄           0  2018-03-19 17:35  __MACOSX\
     目錄           0  2018-03-19 17:35  __MACOSX\upload\
     文件         120  2018-03-19 17:35  __MACOSX\upload\._.DS_Store
     文件        3421  2018-03-19 17:32  upload\wml_bayes.py
     文件         176  2018-03-19 17:32  __MACOSX\upload\._wml_bayes.py
     目錄           0  2018-03-19 17:35  upload\email\
     目錄           0  2018-03-19 17:35  upload\email\spam\
     文件         338  2010-10-23 08:39  upload\email\spam\15.txt
     目錄           0  2018-03-19 17:35  __MACOSX\upload\email\
     目錄           0  2018-03-19 17:35  __MACOSX\upload\email\spam\
     文件         212  2010-10-23 08:39  __MACOSX\upload\email\spam\._15.txt
     文件         210  2010-10-23 08:38  upload\email\spam\14.txt
     文件         212  2010-10-23 08:38  __MACOSX\upload\email\spam\._14.txt
     文件         338  2010-10-23 08:40  upload\email\spam\16.txt
     文件         212  2010-10-23 08:40  __MACOSX\upload\email\spam\._16.txt
     文件         254  2018-03-19 16:10  upload\email\spam\17.txt
     文件         252  2010-10-23 08:38  upload\email\spam\13.txt
     文件         212  2010-10-23 08:38  __MACOSX\upload\email\spam\._13.txt
     文件         188  2010-10-23 08:37  upload\email\spam\12.txt
     文件         212  2010-10-23 08:37  __MACOSX\upload\email\spam\._12.txt
     文件         217  2010-10-23 08:36  upload\email\spam\10.txt
     文件         212  2010-10-23 08:36  __MACOSX\upload\email\spam\._10.txt
     文件         414  2010-10-23 08:37  upload\email\spam\11.txt
     文件         212  2010-10-23 08:37  __MACOSX\upload\email\spam\._11.txt
     文件         169  2010-10-23 08:34  upload\email\spam\9.txt
     文件         212  2010-10-23 08:34  __MACOSX\upload\email\spam\._9.txt
     文件         338  2010-10-23 08:33  upload\email\spam\8.txt
     文件         212  2010-10-23 08:33  __MACOSX\upload\email\spam\._8.txt
     文件         238  2010-10-23 08:31  upload\email\spam\5.txt
............此處省略85個文件信息

評論

共有 條評論