91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

資源簡介

該資源屬於代碼類,用C語言和Python實現了TF-IDF算法,適用於文本分類等任務中的特徵權重抽取。

資源截圖

代碼片段和文件信息

# -*- coding: utf-8 -*-
import math
import os
import fileinput

TEXT = 0  # number of documents in a category (incremented by save_words)
SUCCEED = 0  # incremented by save_weight for every weight line written
Docs = []  # one entry per loaded document: the list of lines read from it

# Input and output directories as cp936-encoded byte strings.
# NOTE(review): presumably cp936 is used so the names match a Chinese-locale
# Windows filesystem -- confirm before running elsewhere.
# Backslashes are escaped explicitly so the literals do not depend on Python
# leaving unknown escape sequences untouched.
Path = u"G:\\文本聚類數據集\\數據集".encode("cp936")
w_Path = u"G:\\文本聚類數據集\\特征數據集".encode("cp936")

##計算權重函數,tf為某詞在文章中出現的次數,df為包含該詞的文檔數,max文章中出現次數最多的詞條數
##返回值為TF-IDF權重

## Sort the feature vector by weight
def sort(terms, TF_IDF):
    """Reorder ``terms`` and ``TF_IDF`` in place by descending weight.

    The two lists are parallel: ``TF_IDF[i]`` is the weight of ``terms[i]``.
    After the call both lists are permuted identically so that the weights
    are in non-increasing order.

    Args:
        terms: list of terms; reordered in place.
        TF_IDF: list of weights, at least as long as ``terms``; its first
            ``len(terms)`` entries are reordered in place.

    Returns:
        None. Both lists are mutated.

    Raises:
        IndexError: if ``TF_IDF`` is shorter than ``terms``.
    """
    count = len(terms)
    # Rank positions by weight instead of hand-rolling a selection sort.
    # sorted() is stable, so equal weights keep their original relative order.
    order = sorted(range(count), key=lambda i: TF_IDF[i], reverse=True)
    # Slice assignment keeps the caller's list objects (in-place contract).
    terms[:] = [terms[i] for i in order]
    TF_IDF[:count] = [TF_IDF[i] for i in order]

def save_words(path):
    """Load one document and register it in the module-level corpus.

    Every line of the file at ``path`` is kept exactly as read (including
    its line terminator) and the resulting list is appended to ``Docs``.
    ``TEXT`` is incremented by one.

    Args:
        path: path of the document file to read.

    Returns:
        None. Mutates the globals ``Docs`` and ``TEXT``.
    """
    global Docs, TEXT
    # The context manager closes the file even if reading raises.
    with open(path, "r") as fp:
        # Lines are deliberately not stripped: the other functions in this
        # file compare raw lines against these entries.
        terms = fp.readlines()
    Docs.append(terms)
    TEXT += 1

def GenerateIDF(path):
    """Compute an IDF value for every distinct line (term) of a document.

    The file at ``path`` is read line by line; each distinct line is a term,
    kept in order of first appearance. For each term the document frequency
    ``df`` is the number of entries in the global ``Docs`` that contain that
    exact line, and the IDF is ``log(TEXT / df + 0.01)``.

    Args:
        path: path of the document whose terms are to be weighted.

    Returns:
        A tuple ``(IDF, terms)`` of two parallel lists: ``IDF[i]`` is the
        value computed for ``terms[i]``.

    Raises:
        ZeroDivisionError: if a term of the file occurs in no entry of
            ``Docs`` (``df`` is 0).
    """
    terms = []
    seen = set()
    with open(path, "r") as fp:
        for line in fp:
            # Keep only the first occurrence of each line.
            if line not in seen:
                seen.add(line)
                terms.append(line)

    # Build one set per document once, so each membership test is O(1)
    # instead of a scan over the document's lines.
    doc_sets = [set(doc) for doc in Docs]

    IDF = []
    for term in terms:
        df = sum(1 for doc in doc_sets if term in doc)
        IDF.append(math.log(float(TEXT) / float(df) + 0.01))
    return IDF, terms

def GenerateTF(path, terms):
    """Compute a term-frequency value for each entry of ``terms``.

    The file at ``path`` is read line by line. For every term, the number of
    lines equal to it is divided by ``len(terms)``.

    Args:
        path: path of the document to count occurrences in.
        terms: list of terms (raw lines, including their line terminator)
            to look up.

    Returns:
        A list of floats parallel to ``terms``; empty when ``terms`` is
        empty.
    """
    # Count every line once instead of rescanning the file contents for
    # each term.
    counts = {}
    with open(path, "r") as fp:
        for line in fp:
            counts[line] = counts.get(line, 0) + 1

    # The denominator is the number of terms passed in, not the number of
    # lines in the file.
    terms_count = len(terms)
    return [float(counts.get(term, 0)) / float(terms_count) for term in terms]

def save_weight(TF, IDF, terms, path):
    """Write the highest-weighted terms of a document to ``path``.

    The weight of each term is ``TF[i] * IDF[i]``. Terms are ordered by
    descending weight with ``sort`` and at most the first 200 are written,
    one per line, as ``"<stripped term> <weight>"``. The global ``SUCCEED``
    is incremented once per line written.

    Args:
        TF: list of term-frequency values.
        IDF: list of IDF values, expected to be parallel to ``TF``.
        terms: list of terms, expected to be parallel to ``TF``; reordered
            in place by ``sort``.
        path: output file path; opened with mode ``"w+"``, so existing
            content is truncated.

    Returns:
        None.
    """
    global SUCCEED
    TF_IDF = [float(tf) * float(idf) for tf, idf in zip(TF, IDF)]
    # Never try to write more entries than there are weights.
    top = min(200, len(TF))
    # Sort before opening the output so a failure here does not leave a
    # truncated file behind.
    sort(terms, TF_IDF)
    with open(path, "w+") as fp:
        for term, weight in zip(terms[:top], TF_IDF[:top]):
            fp.write(term.strip() + " " + str(weight) + '\n')
            SUCCEED += 1

def?read_dir(pathw_path):
????global?SUCCEED
????file_list?=?[]
????files?=?os.listdir(path)
????print?“please?wait......“
????for?f?in?files:
????????file_list.append(f)
????????r_name?=?path?+?‘\\‘?+?f
????????save_words(r_name)
????print?“sum?of?docs?is:%d“%TEXT
????for?i?in?file_list:
????????print?i
????????name?=?path?+?‘\\‘?+?i
????????w_name?=?w_path?+?‘\\‘?+?i
????????IDFterms

 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----

     文件       3457  2015-10-22 22:52  TFIDFMeasure.py

     文件       3455  2015-05-25 19:40  DFTF.CPP

----------- ---------  ---------- -----  ----

                 6912                    2


評論

共有 條評論