資源簡介
從讀取文件、去除停用詞、正則化、到詞向量、再到預測分類。最後用ROC曲線評估,一整套(帶數據集),下載即運行。如果好用,還請給個好評。

代碼片段和文件信息
import?re
import?jieba
import?pandas?as?pd
#?導(dǎo)入文本特征向量轉(zhuǎn)化模塊
from?sklearn.feature_extraction.text?import?CountVectorizer
from?sklearn.naive_bayes?import?MultinomialNB
from?sklearn.metrics?import?classification_report
def text_save(filename, data):
    """Append each element of *data* to *filename* as one line of text.

    Every element is converted with ``str()``; square brackets, single
    quotes and commas are then stripped from the result and a newline is
    appended, so a row such as ``['a', 'b']`` is written as ``a b``.

    Args:
        filename: Path of the output file. It is opened in append mode,
            so existing content is kept.
        data: Sequence of rows to write (lists, strings, numbers, ...).
    """
    # The context manager guarantees the handle is closed even when a
    # write fails part-way through.
    with open(filename, 'a') as out:
        for row in data:
            # Drop the list brackets produced by str(list); optional,
            # depending on the shape of the data.
            line = str(row).replace('[', '').replace(']', '')
            # Drop single quotes and commas, then terminate the line.
            line = line.replace("'", '').replace(',', '') + '\n'
            out.write(line)
    print("保存文件成功")
def?process():
????#?-------------第一部分:讀取數(shù)據(jù)--------------------------
????fulltrain?=?pd.read_csv(‘DataC/microwave.csv‘?encoding=‘gb18030‘)
????datatrain?=?fulltrain[‘star_rating‘].values??#?轉(zhuǎn)換為數(shù)組
????#?print(type(datatrain))
????#?#?print(datatrain.dtype)
????for?i?in?range(0?len(datatrain)):
????????if?datatrain[i]?>=?3:??#?大于等于3的變成1,其余為0
????????????datatrain[i]?=?1
????????else:
????????????datatrain[i]?=?0
????#?print(data)
????dataxtrain?=?fulltrain[‘review_body‘].values
????print(len(dataxtrain))
????#?-------------第二部分:數(shù)據(jù)處理--------------------------
????pattern?=?r“(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)|([a-zA-Z]+.\w+\.+[a-zA-Z0-9\/_]+)“
????restrain?=?[]
????for?i?in?dataxtrain:
????????tmp?=?‘‘.join(str(str(i).replace(‘\n‘?‘‘)))??#?去掉換行符
????????#?print(tmp)
????????tmp?=?re.sub(pattern?‘‘?tmp)??#?去掉網(wǎng)站
????????#?print(tmp)
????????remove_digits?=?str.maketrans(‘‘?‘‘?‘0123456789‘)??#?去除數(shù)字
????????tmp?=?tmp.translate(remove_digits)
????????#?print(tmp)
????????restrain.append(tmp)
????print(“train總計(jì):“?len(restrain))
????#?-----------------第三部分:分詞,去除停用詞--------------------
????stop_word?=?{}.fromkeys([‘,‘?‘。‘?‘!‘?‘this‘?‘me‘?‘very‘?‘is‘?‘、‘?‘:‘?‘;‘?‘(‘?‘)‘?‘-‘?‘:‘])
????print(“中文分詞后結(jié)果:“)
????corpustrain?=?[]
????for?a?in?restrain:
????????seg_list?=?jieba.cut(a.strip()?cut_all=False)??#?精確模式
????????final?=?‘‘
????????for?seg?in?seg_list:
????????????if?seg?not?in?stop_word:??#?非停用詞,保留
????????????????final?+=?seg
????????seg_list?=?jieba.cut(final?cut_all=False)
????????output?=?‘?‘.join(list(seg_list))
????????#?print(output)
????????corpustrain.append(output)
????print(len(corpustrain))
????#?-------------------第四部分將文本中的詞語轉(zhuǎn)換為詞頻矩陣--------------------------------
????vectorizer?=?CountVectorizer()
????#?計(jì)算各詞語出現(xiàn)的次數(shù)
????Xtrain?=?vectorizer.fit_transform(corpustrain)
????#?獲取詞袋中所有文本關(guān)鍵詞
????#?word?=?vectorizer.get_feature_names()
????#?#?查看詞頻結(jié)果
????#?#?print(len(word))
????#?for?w?in?word:
????#?????print(wend=“?“)
????#?print(“?“)
????#?print(“詞頻矩陣:“)
????Xtrain?=?Xtrain.toarray()
????#?print(“矩陣len:“l(fā)en(X))
????#?np.set_printoptions(threshold=np.inf)
????#?print(X)
????#?-----------------------------第五部分-數(shù)據(jù)分析---------------------
????print(“數(shù)據(jù)分析:“)
????x_train?=?Xtrain[:1300]
????y_train?=?datatrain[:130
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       4548  2020-03-07 18:00  BayesianModel\BayesianMicrowave.py
     文件    1035040  2020-03-06 10:05  BayesianModel\DataC\microwave.csv
     文件       1402  2020-03-12 12:14  BayesianModel\ROC.py
     目錄          0  2020-03-12 12:15  BayesianModel\DataC
     目錄          0  2020-03-12 12:15  BayesianModel
----------- ---------  ---------- -----  ----
              1040990                    5
評論
共有 條評論