資源簡介
Python數據分析與機器學習-新聞分類任務
Python數據分析與機器學習-新聞分類任務

代碼片段和文件信息
import pandas as pd
import jieba
import numpy

# Show full frames when printing.  (The scraped original also set
# 'display.height', an option that was removed from pandas; 'max_rows',
# 'max_columns' and 'width' cover the same need.)
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

# Tab-separated news dump with four columns: category, theme, URL, raw text.
df_news = pd.read_table("./data/val.txt",
                        names=['category', 'theme', 'URL', 'content'],
                        encoding="utf-8")
df_news = df_news.dropna()  # drop rows with missing values
# print(df_news.shape)  # (5000, 4)

content = df_news["content"].values.tolist()  # list of raw article strings
# print(content[1000])

content_S = []  # tokenised articles: one list of words per article
for line in content:
    current_segment = jieba.lcut(line)
    # Skip degenerate segmentations (empty or newline-only articles).
    if len(current_segment) > 1 and current_segment != "\r\n":
        content_S.append(current_segment)
# print(content_S[1000])

# NOTE: original had pd.Dataframe, which raises AttributeError.
df_content = pd.DataFrame({"content_S": content_S})
# print(df_content.head())

# quoting=3 (csv.QUOTE_NONE): stopword file contains raw quote characters.
stopwords = pd.read_csv("stopwords.txt", index_col=False, sep="\t", quoting=3,
                        names=['stopword'], encoding='utf-8')
def drop_stopwords(contents, stopwords):
    """Remove stop words from tokenised articles.

    Args:
        contents: list of articles, each a list of words.
        stopwords: iterable of stop words to discard.

    Returns:
        A tuple ``(contents_clean, all_words)`` where ``contents_clean`` is
        the list of articles with stop words removed (article order and
        in-article word order preserved) and ``all_words`` is a flat list of
        every kept word across all articles.
    """
    # Hoist into a set once: O(1) membership instead of scanning the
    # stopword list for every single word.
    stopword_set = set(stopwords)
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = [word for word in line if word not in stopword_set]
        contents_clean.append(line_clean)
        all_words.extend(line_clean)
    return contents_clean, all_words
contents = df_content["content_S"].values.tolist()
stopwords = stopwords["stopword"].values.tolist()
contents_clean, all_words = drop_stopwords(contents, stopwords)

# NOTE: original had pd.Dataframe (AttributeError); must be pd.DataFrame.
df_content = pd.DataFrame({"contents_clean": contents_clean})
df_all_words = pd.DataFrame({'all_words': all_words})

# Per-word frequency table.  The original's agg({"count": numpy.size})
# dict-renaming form was removed from pandas; groupby(...).size() produces
# the same per-group counts.
words_count = df_all_words.groupby('all_words').size().reset_index(name="count")
words_count = words_count.sort_values(by=["count"], ascending=False)
# print(words_count.head().values)
'''
[['中' 5199]
 ['中國' 3115]
 ['說' 3055]
 ['S' 2646]
 ['萬' 2390]]
'''
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

# Word cloud of the 100 most frequent words; simhei.ttf supplies a CJK font
# so Chinese characters render instead of boxes.
wordcloud = WordCloud(font_path="./data/simhei.ttf", background_color="white",
                      max_font_size=80)
word_frequence = {x[0]: x[1] for x in words_count.head(100).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.show()
'''TF-IDF: extract keywords'''
import jieba.analyse

index = 2400
print(df_news['content'][index])
content_S_str = "".join(content_S[index])  # rejoin tokens into one string
print("  ".join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))

'''LDA: topic model'''
from gensim import corpora, models, similarities
import gensim

# Build the word-to-id mapping (bag-of-words vocabulary).
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
# num_topics: number of latent topics to learn.
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                      num_topics=20)

# Top words of topic #1.
print(lda.print_topic(1, topn=5))

# NOTE(review): the scraped source was truncated mid-statement here; the
# loop body below is the conventional completion (print every topic) —
# confirm against the original file.
for topic in lda.print_topics(num_topics=20, num_words=5):
    print(topic)
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????目錄???????????0??2018-01-06?09:43??chapter16\
?????目錄???????????0??2018-01-05?20:38??chapter16\data\
?????文件????10044356??2017-06-14?23:56??chapter16\data\simhei.ttf
?????文件?????9948878??2017-07-25?08:24??chapter16\data\val.txt
?????文件??????365370??2018-01-06?09:13??chapter16\show_Chinese.png
?????文件???????17672??2017-06-14?23:55??chapter16\stopwords.txt
?????文件????????5463??2017-03-03?12:00??chapter16\中文停用詞庫.txt
?????文件????????6038??2017-03-03?07:38??chapter16\哈工大停用詞表.txt
?????文件????????8571??2017-03-03?12:00??chapter16\四川大學機器智能實驗室停用詞庫.txt
?????文件????????5644??2018-01-06?09:43??chapter16\新聞分類任務.py
評論
共有 條評論