-
大小: 4KB 文件類型: .py 金幣: 1 下載: 0 次 發布日期: 2021-05-04
- 語言: Python
- 標簽: 20newsgroup python
資源簡介
http://blog.csdn.net/abcjennifer/article/details/23615947
代碼片段和文件信息
# First extract the 20 newsgroups dataset to /scikit_learn_data.
from sklearn.datasets import fetch_20newsgroups

# To load every category instead:
# newsgroup_train = fetch_20newsgroups(subset='train')

# Restrict the run to a subset of the categories.
categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
]
newsgroup_train = fetch_20newsgroups(subset='train', categories=categories)
def calculate_result(actual, pred):
    """Print precision, recall and F1 score for a set of predictions.

    Args:
        actual: Ground-truth labels; passed as the first argument to each
            ``metrics`` scoring function.
        pred: Predicted labels; passed as the second argument.

    NOTE(review): this relies on a module-level ``metrics`` name that is not
    imported in the visible part of the file -- presumably
    ``sklearn.metrics``; confirm it is imported before this is called.
    """
    # average='weighted' is spelled out: it was the default in older
    # scikit-learn releases, while newer ones default to 'binary' and raise
    # on multiclass labels (this script loads five categories).
    m_precision = metrics.precision_score(actual, pred, average='weighted')
    m_recall = metrics.recall_score(actual, pred, average='weighted')
    m_f1 = metrics.f1_score(actual, pred, average='weighted')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('predict info:')
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.3f}'.format(m_f1))

# Print the category names.
from pprint import pprint

pprint(list(newsgroup_train.target_names))
# newsgroup_train.data holds the original documents, but we need to extract
# TF-IDF vectors in order to model the text data.
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer

# vectorizer = TfidfVectorizer(sublinear_tf=True,
#                              max_df=0.5,
#                              stop_words='english')
# However, the Tf-Idf feature extractor makes the training set and the
# testing set have a divergent number of features (because they have
# different vocabulary in their documents), so we use HashingVectorizer.
# NOTE(review): `non_negative` is only accepted by older scikit-learn
# releases -- confirm against the installed version before running.
vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                               n_features=100)
fea_train = vectorizer.fit_transform(newsgroup_train.data)
# fea_train is the returned feature matrix, [n_samples, n_features].
print('Size of fea_train:' + repr(fea_train.shape))
# Original author's note: 11314 documents, 130107 vectors for all categories.
print('The average feature sparsity is {0:.3f}%'.format(
    fea_train.nnz / float(fea_train.shape[0] * fea_train.shape[1]) * 100))
#####
評論
共有 條評論