91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

  • 大小: 320KB
    文件類型: .zip
    金幣: 2
    下載: 1 次
    發布日期: 2021-06-05
  • 語言: Python
  • 標簽: python??VSM??

資源簡介

基於 Python 3 編寫的 VSM 模型構建樣例,包含分詞處理後的輸入文件以及停用詞文件。

資源截圖

代碼片段和文件信息

# -*- coding: utf-8 -*-
import json
import math
from collections import Counter

import numpy as np


def read_file(path):
    """Read a UTF-8 text file into a list of token lists.

    Every tab is replaced by a single space, leading/trailing carriage
    returns and newlines are stripped, and the line is split on single
    spaces. Consecutive spaces therefore produce empty-string tokens.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one list of string tokens per line, or ``None`` when
        the file cannot be opened or decoded (``"read error"`` is printed
        in that case).
    """
    try:
        with open(path, "r", encoding="utf-8") as fin:
            return [
                line.replace("\t", " ").strip("\r\n").split(" ")
                for line in fin
            ]
    except (OSError, UnicodeError):
        # Reading stays best-effort: report the failure and hand back None
        # instead of raising. Only I/O and decoding errors are caught so
        # that programming errors and KeyboardInterrupt still propagate.
        print("read error\n")
        return None


def save_file(path, data):
    """Write ``str(data)`` to a UTF-8 text file, replacing any existing file.

    Args:
        path: Destination file path.
        data: Any object; its ``str()`` representation is what gets written.
    """
    with open(path, "w", encoding="utf-8") as fout:
        fout.write(str(data))


# Remove stop words
def wipe_stopwords(filepath, data):
    """Drop the leading token and all stop words from every tokenised line.

    The first token of each line is always discarded. Of the remaining
    tokens, those listed in the stop-word file are removed, as are empty
    strings, bare newlines and the ideographic space (U+3000).

    Args:
        filepath: Path of a UTF-8 file with one stop word per line;
            surrounding whitespace on each line is ignored.
        data: List of token lists.

    Returns:
        A new list of token lists, one per input line, holding the tokens
        that were kept in their original order.
    """
    with open(filepath, "r", encoding="utf-8") as fin:
        # A set gives O(1) membership tests instead of scanning a list
        # once per token.
        stop_words = {line.strip() for line in fin}

    # Blank-like tokens are never kept, whether or not the file lists them.
    rejected = stop_words | {"", "\n", "\u3000"}

    return [
        [word for word in line[1:] if word not in rejected]
        for line in data
    ]


# TF-IDF computation
def calc_tfidf(data):
    """Compute a TF-IDF weight for every term of every document.

    For a term with raw count ``c`` in a document whose most frequent term
    has count ``max_c`` and which holds ``v`` distinct terms:

        tf     = (0.5 + 0.5 * c / max_c) * (1 / v)
        idf    = ln((N + 1) / n)
        tf-idf = tf * idf

    where ``N`` is the number of documents and ``n`` is the number of
    documents that contain the term.

    Args:
        data: List of documents, each a list of string tokens.

    Returns:
        A list with one dict per document mapping term -> tf-idf weight,
        ordered by descending weight. An empty document yields ``{}``.
    """
    # Normalised term frequency per document.
    tf_list = []
    for doc in data:
        # Rank terms by raw count, highest first. The sort is stable, so
        # ties keep their first-seen order.
        ranked = sorted(Counter(doc).items(), key=lambda kv: kv[1], reverse=True)
        max_tf = ranked[0][1] if ranked else 1  # placeholder for an empty doc
        vocab_size = len(ranked)
        tf_list.append({
            word: (0.5 + 0.5 * (count / max_tf)) * (1.0 / vocab_size)
            for word, count in ranked
        })

    # Document frequency: count each term once per document that contains
    # it, in a single pass over the corpus.
    doc_freq = Counter(word for weights in tf_list for word in weights)

    doc_count = len(tf_list)
    tf_idf_list = []
    for weights in tf_list:
        scores = {
            word: math.log((doc_count + 1) / doc_freq[word]) * tf
            for word, tf in weights.items()
        }
        ordered = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
        tf_idf_list.append(dict(ordered))

    return tf_idf_list


#?余弦值計算
def?calc_cos(data):
????cos_value?=?{}
????for?doc1?in?range(len(data)):??#?最終結果保存格式為:[文檔序號1-文檔序號2]:余弦值
????????for?doc2?in?range(doc1?+?1??len(d

 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-05-31 21:21  VSM\
     文件        3659  2018-05-28 10:20  VSM\Chinese-StopWords.txt
     文件      202230  2013-05-22 13:30  VSM\input.txt
     文件        4158  2018-05-31 21:17  VSM\main.py
     文件      306205  2018-05-30 10:27  VSM\result.json
     文件      506475  2018-05-29 13:20  VSM\tf_idf_result.txt

評論

共有 條評論