資源簡介
北郵計算機研一《數據挖掘》文本分類實驗.zip

代碼片段和文件信息
"""
Bag of Words
"""
import?os
import?numpy?as?np
from?gensim?import?corpora
from?loadFile?import?load_file
def build_dictionary(corpus, len_dict, dir='./data/'):
    """
    Build a gensim dictionary from a corpus, or load a previously saved one.

    The dictionary is cached on disk as ``{dir}dictionary{len_dict}.dict``.
    If that file already exists it is loaded and ``corpus`` is not used;
    otherwise the dictionary is built from ``corpus``, trimmed and saved.

    :param corpus: the corpus (already word-segmented)
    :param len_dict: requested dictionary size, passed as ``keep_n`` to
        ``filter_extremes``
    :param dir: location where the dictionary is stored; it is concatenated
        directly with the file name, so it should end with a path separator
    :return: the dictionary object
    """
    dict_path = '{}dictionary{}.dict'.format(dir, len_dict)
    if os.path.exists(dict_path):
        # Report the file that is actually loaded; the message previously
        # pointed at an unrelated "bow...mm" file name.
        print('load existing dictionary from {}'.format(dict_path))
        dictionary = corpora.Dictionary.load(dict_path)
    else:
        dictionary = corpora.Dictionary(corpus)
        # Only keep_n is given, so filter_extremes' other thresholds keep
        # their library defaults.
        dictionary.filter_extremes(keep_n=len_dict)
        print('saving dictionary to {}'.format(dict_path))
        dictionary.save(dict_path)
    return dictionary
# ====================== Bag of Words =======================
def build_bow(corpus, dictionary, dir='./data/', suffix='train'):
    """
    Build the bag-of-words vectors for a corpus, or load a saved copy.

    Each document is converted with ``dictionary.doc2bow`` into a sparse
    vector whose elements give the number of times a word occurs in that
    document. The result is cached on disk in Matrix Market format as
    ``{dir}bow{len(dictionary)}_{suffix}.mm``.

    :param corpus: the corpus (an iterable of documents)
    :param dictionary: the dictionary used to convert documents
    :param dir: location where the ``.mm`` file is stored; it is concatenated
        directly with the file name, so it should end with a path separator
    :param suffix: tag appended to the file name to tell corpora apart
        (e.g. ``'train'`` / ``'test'``)
    :return: the bag-of-words corpus; a ``corpora.MmCorpus`` when loaded from
        disk, a list of ``doc2bow`` results when freshly built
    """
    bow_path = '{}bow{}_{}.mm'.format(dir, len(dictionary), suffix)
    if os.path.exists(bow_path):
        print('load existing bow from {}'.format(bow_path))
        bow = corpora.MmCorpus(bow_path)
    else:
        bow = [dictionary.doc2bow(doc) for doc in corpus]
        print('saving bow to {}'.format(bow_path))
        corpora.MmCorpus.serialize(bow_path, bow)
    return bow
if __name__ == '__main__':
    # Script entry point: build (or load) the dictionary and the train/test
    # bag-of-words corpora, then save the labels as .npy files.
    len_dictionary = 10000
    x_train, y_train, x_test, y_test = load_file(dir='./data/new_cuted_all_data/')
    dictionary = build_dictionary(x_train, len_dictionary)

    # The suffix was misspelled 'trian', which produced a cache file that
    # does not match build_bow's default suffix 'train'.
    train_bow = build_bow(x_train, dictionary, suffix='train')
    print('bow', type(train_bow), len(train_bow))
    test_bow = build_bow(x_test, dictionary, suffix='test')
    print('bow', type(test_bow), len(test_bow))

    # Store the labels as NumPy arrays next to the bow files.
    y_train = np.array(y_train)
    np.save('./data/y_train.npy', y_train)
    y_test = np.array(y_test)
    np.save('./data/y_test.npy', y_test)
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2019-09-06 02:44  DataMining-master\
     文件        2395  2019-09-06 02:44  DataMining-master\BowGensim.py
     目錄           0  2019-09-06 02:44  DataMining-master\CNKI\
     目錄           0  2019-09-06 02:44  DataMining-master\CNKI\CNKI\
     文件           0  2019-09-06 02:44  DataMining-master\CNKI\CNKI\__init__.py
     文件         128  2019-09-06 02:44  DataMining-master\CNKI\CNKI\items.py
     文件        6017  2019-09-06 02:44  DataMining-master\CNKI\CNKI\middlewares.py
     文件         761  2019-09-06 02:44  DataMining-master\CNKI\CNKI\pipelines.py
     文件        3174  2019-09-06 02:44  DataMining-master\CNKI\CNKI\settings.py
     目錄           0  2019-09-06 02:44  DataMining-master\CNKI\CNKI\spiders\
     文件         161  2019-09-06 02:44  DataMining-master\CNKI\CNKI\spiders\__init__.py
     文件        2911  2019-09-06 02:44  DataMining-master\CNKI\CNKI\spiders\spiders.py
     文件         251  2019-09-06 02:44  DataMining-master\CNKI\scrapy.cfg
     文件         969  2019-09-06 02:44  DataMining-master\NaiveBayes.py
     文件         412  2019-09-06 02:44  DataMining-master\README.md
     文件        1290  2019-09-06 02:44  DataMining-master\SVM.py
     目錄           0  2019-09-06 02:44  DataMining-master\data\
     文件         580  2019-09-06 02:44  DataMining-master\data\readme.md
     文件        1720  2019-09-06 02:44  DataMining-master\features.py
     文件        1213  2019-09-06 02:44  DataMining-master\loadFile.py
     目錄           0  2019-09-06 02:44  DataMining-master\notebook\
     文件       11221  2019-09-06 02:44  DataMining-master\notebook\report.ipynb
     文件         115  2019-09-06 02:44  DataMining-master\requirements.txt
評論
共有 條評論