資源簡介
內含手機中文評論數據集(商品編號和評論),貝葉斯算法中文評論分類代碼,數據集+代碼
代碼片段和文件信息
import jieba
import pandas as pd
from sklearn import metrics, naive_bayes
from sklearn.feature_extraction.text import CountVectorizer  # term-frequency counting
# Data file handle shared with main(); it is opened at import time and left
# open so that main() can keep reading from it.
# NOTE(review): the file name is reproduced as found here; confirm it matches
# the data file on disk.
file = open('商品編號和評論.txt', 'r', encoding='UTF-8')
# Consume the first line; main() continues reading from the second line.
rows = file.readline()
def _read_reviews(handle, max_lines):
    """Collect review and score strings from up to *max_lines* lines of *handle*."""
    reviews = []
    scores = []
    for _ in range(max_lines):
        line = handle.readline()
        if not line:  # end of file: nothing more to read
            break
        # NOTE(review): the delimiter was lost in the mangled source; a comma
        # is assumed -- confirm against the data file.
        fields = line.split(',')
        # Only well-formed five-field records are used; others are skipped.
        if len(fields) == 5:
            reviews.append(fields[2])
            scores.append(fields[3])
    return reviews, scores


def _segment(text, stopwords):
    """Return *text* as space-separated jieba tokens with stop characters removed."""
    kept = ''.join(
        token for token in jieba.lcut(text, cut_all=False)
        if token not in stopwords
    )
    # Segment again after the stop characters are gone, then join with spaces
    # so CountVectorizer can split the result into tokens.
    return ' '.join(jieba.lcut(kept, cut_all=False))


def main(max_lines=10001, test_size=1000):
    """Train and evaluate a Bernoulli naive Bayes classifier on the reviews.

    Reads records from the module-level ``file`` handle, maps each score to a
    class label with ``score_get``, segments the review text with jieba,
    vectorizes it with ``CountVectorizer`` and fits ``BernoulliNB`` on all but
    the last ``test_size`` rows. The predictions for the held-out rows, the
    learned vocabulary and the accuracy are printed.

    Args:
        max_lines: Maximum number of lines to read from ``file``.
        test_size: Number of trailing rows held out for evaluation.

    Raises:
        ValueError: If ``test_size`` is not positive or there are not more
            usable rows than ``test_size``.
    """
    reviews, scores = _read_reviews(file, max_lines)
    df = pd.DataFrame(
        {'reviews': reviews, 'score': scores},
        columns=['reviews', 'score'],
    )
    print(df)

    # Keep only the first run of digits of each score; rows without any digit
    # are dropped because they cannot be converted to int.
    digits = df['score'].str.extract(r'(\d+)', expand=False)
    df = df.assign(score=digits).dropna(subset=['score'])
    df = df.assign(score=df['score'].astype(int).apply(score_get))

    if test_size <= 0 or len(df) <= test_size:
        raise ValueError(
            'need more than test_size=%d usable rows with test_size > 0, got %d'
            % (test_size, len(df))
        )

    stopwords = {',', '!', '。', '、', '?', '~'}
    corpus = [_segment(text, stopwords) for text in df['reviews'].values]

    vectorizer = CountVectorizer()
    # The term-count matrix stays sparse; BernoulliNB accepts sparse input.
    X = vectorizer.fit_transform(corpus)
    labels = df['score'].values

    x_train, x_test = X[:-test_size], X[-test_size:]
    y_train, y_test = labels[:-test_size], labels[-test_size:]

    nb = naive_bayes.BernoulliNB()
    nb.fit(x_train, y_train)
    nb_pre = nb.predict(x_test)
    print(nb_pre)

    # get_feature_names() is gone from recent scikit-learn releases; prefer
    # the replacement and fall back to the old name on older versions.
    get_names = (
        getattr(vectorizer, 'get_feature_names_out', None)
        or vectorizer.get_feature_names
    )
    for w in get_names():
        print(w)

    accuracy = metrics.accuracy_score(y_test, nb_pre)
    print('分類準確率:', accuracy)
def score_get(x):
    """Map a numeric score to a class label.

    Args:
        x: Numeric score.

    Returns:
        1 when ``x`` is at most 3, otherwise 2.
    """
    return 1 if x <= 3 else 2
if __name__ == '__main__':
    # Run the pipeline only when this file is executed as a script.
    main()
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        135  2018-12-12 15:25  suanfa\.idea\encodings.xml
     文件        295  2018-12-12 15:25  suanfa\.idea\misc.xml
     文件        264  2018-12-12 15:25  suanfa\.idea\modules.xml
     文件        438  2018-12-12 15:25  suanfa\.idea\suanfa.iml
     文件       9188  2018-12-12 15:25  suanfa\.idea\workspace.xml
     文件       2112  2018-12-13 10:43  suanfa\suanfa.py
     文件   75224050  2018-12-13 09:29  suanfa\商品編號和評論.txt
     目錄          0  2018-12-13 10:44  suanfa\.idea
     目錄          0  2018-12-13 10:44  suanfa
----------- ---------  ---------- -----  ----
             75236482                    9
- 上一篇:體育用品售賣網站
- 下一篇:TMDB電影數據分析
評論
共有 條評論