資源簡介
微博情感傾向性分析,自己做的作業,代碼已調通,可用。

代碼片段和文件信息
# _*_ coding: utf-8 _*_
# 好評 good.txt(1列)、壞評 bad.txt(1列)、停用詞 stop.txt(1列)
# 獲取文本字符串
import?nltk
from?nltk.collocations?import??BigramCollocationFinder
from?nltk.metrics?import??BigramAssocMeasures
# 安裝結巴:進入 D:\software\Python\Python35\scripts,執行 pip3 install jieba 即可;卸載使用 pip3 uninstall jieba 即可
import?jieba
def text(pos_path='d:/positive.txt', neg_path='d:/negative.txt'):
    """Return the raw text of the positive and negative corpus files,
    concatenated in that order.

    Parameters
    ----------
    pos_path, neg_path : str
        Paths to the positive / negative review files (one review per
        line, UTF-8).  Defaults preserve the original hard-coded paths,
        so existing ``text()`` callers keep working.

    Returns
    -------
    str
        The full contents of both files joined together.
    """
    parts = []
    for path in (pos_path, neg_path):
        # 'with' guarantees the handle is closed even if reading fails;
        # a single read() replaces the original quadratic `str += line` loop
        # but yields the identical concatenated string.
        with open(path, 'r', encoding='utf-8') as f:
            parts.append(f.read())
    return ''.join(parts)
# 返回分詞列表,如:[['我','愛','北京','天安門'],['你','好'],['hello']],一條評論一個列表
import?os
import?codecs
def read_file(filename, stop_path='d:/stop.txt'):
    """Segment every review in *filename* with jieba, dropping stop words.

    Parameters
    ----------
    filename : str
        UTF-8 file with one review per line; only the first
        tab-separated field of each line is treated as review text.
    stop_path : str
        Path to the stop-word list (one word per line).  Default keeps
        the original hard-coded location, so existing callers work.

    Returns
    -------
    list[list[str]]
        One inner list per review, holding the review's unique tokens
        after stop-word removal.  Token order within a review is
        unspecified because a set difference is used (as in the
        original implementation).
    """
    # Build the stop-word set once; 'with' closes the handle, which the
    # original code leaked.
    with codecs.open(stop_path, 'r', 'utf-8') as sf:
        stop_words = set(line.strip() for line in sf)
    reviews = []
    with codecs.open(filename, 'r', 'utf-8') as f:
        for line in f:
            review_text = line.split('\t')[0]
            # cut_all=False: jieba's precise (default) segmentation mode
            tokens = jieba.cut(review_text, cut_all=False)
            reviews.append(list(set(tokens) - stop_words))
    return reviews
# 安裝 nltk:進入 D:\software\Python\Python35\scripts,執行 pip3 install nltk 即可
from?nltk.probability?import??FreqDistConditionalFreqDist
from?nltk.metrics?import??BigramAssocMeasures
#獲取信息量最高(前number個)的特征(卡方統計)
def?jieba_feature(number):
?????posWords?=?[]
?????negWords?=?[]
?????for?items?in?read_file(‘d:/positive.txt‘):#把集合的集合變成集合
?????????for?item?in?items:
????????????posWords.append(item)
?????for?items?in?read_file(‘d:/negative.txt‘):
?????????for?item?in?items:
????????????negWords.append(item)
?????word_fd?=?FreqDist()?#可統計所有詞的詞頻
?????cond_word_fd?=?ConditionalFreqDist()?#可統計積極文本中的詞頻和消極文本中的詞頻
?????for?word?in?posWords:
?????????word_fd[word]?+=?1
?????????cond_word_fd[‘pos‘][word]?+=?1
?????for?word?in?negWords:
?????????word_fd[word]?+=?1
?????????cond_word_fd[‘neg‘][word]?+=?1
?????pos_word_count?=?cond_word_fd[‘pos‘].N()?#積極詞的數量
?????neg_word_count?=?cond_word_fd[‘neg‘].N()?#消極詞的數量
?????total_word_count?=?pos_word_count?+?neg_word_count
?????word_scores?=?{}#包括了每個詞和這個詞的信息量
?????for?word?freq?in?word_fd.items():
?????????pos_score?=?BigramAssocMeasures.chi_sq(cond_word_fd[‘pos‘][word]??(freq?pos_word_count)?total_word_count)?#計算積極詞的卡方統計量,這里也可以計算互信息等其它統計量
?????????neg_score?=?BigramAssocMeasures.chi_sq(cond_word_fd[‘neg‘][word]??(freq?neg_word_count)?total_word_count)?#同理
?????????word_scores[word]?=?pos_score?+?neg_score?#一個詞的信息量等于積極卡方統計量加上消極卡方統計量
?????best_vals?=?sorted(word_scores.items()?key=lambda?item:item[1]??reverse=True)[:number]?#把詞按信息量倒序排序。number是特征的維度,是可以不斷調整直至最優的
?????best_words?=?set([w?for?ws?in?best
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件???????7057??2017-05-14?14:06??微博情感傾向性分析\代碼\emotion.py
?????文件?????????65??2017-05-15?15:32??微博情感傾向性分析\代碼\readme.txt
?????文件???????6099??2017-05-13?12:06??微博情感傾向性分析\代碼\senti_python.py
?????文件????2027971??2017-05-13?16:58??微博情感傾向性分析\數據集\negative.txt
?????文件????1810399??2017-05-13?16:47??微博情感傾向性分析\數據集\positive.txt
?????文件??????15183??2016-10-24?18:26??微博情感傾向性分析\數據集\stop.txt
?????目錄??????????0??2017-05-15?15:52??微博情感傾向性分析\代碼
?????目錄??????????0??2017-05-15?15:52??微博情感傾向性分析\數據集
?????目錄??????????0??2017-11-13?14:56??微博情感傾向性分析
-----------?---------??----------?-----??----
??????????????3866774????????????????????9
- 上一篇:硬件課程設計-頻率發生器
- 下一篇:在線卡密生成無需數據庫
評論
共有 條評論