資源簡介
使用 Python 實現中文文本聚類,利用 k-means 算法,包含 jieba 分詞方法等

代碼片段和文件信息
'''
Created on Mar 24 2011
Ch 11 code
@author: Peter
'''
from?numpy?import?*
def loadDataSet():
    """Return the small hard-coded transaction list used to try out the module.

    Each inner list is one transaction made of integer item ids.

    NOTE(review): the item separators were lost in this copy of the file, so
    the rows below assume single-digit item ids (``[2345]`` is read as
    ``[2, 3, 4, 5]``) -- confirm against the original data.
    """
    return [
        [1, 2],
        [2, 3, 4, 5],
        [1, 3, 4, 6],
        [2, 1, 3, 4],
        [2, 1, 3, 6],
    ]
def createC1(dataSet):
    """Build the list of candidate 1-itemsets from a collection of transactions.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list with one single-element ``frozenset`` per distinct item,
        ordered by ascending item value.
    """
    unique_items = set()
    for transaction in dataSet:
        unique_items.update(transaction)
    # frozenset (not set) so every candidate can be used as a dict key.
    # A real list is returned rather than a lazy ``map`` object, so the
    # candidates can be iterated once per transaction by the caller.
    return [frozenset([item]) for item in sorted(unique_items)]
def scanD(D, Ck, minSupport):
    """Count candidate itemsets over the transactions and keep the frequent ones.

    Args:
        D: iterable of transactions; each must be acceptable to
            ``frozenset.issubset`` (e.g. a ``set``).
        Ck: iterable of candidate itemsets (``frozenset`` objects).
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A tuple ``(retList, supportData)``: ``retList`` holds the candidates
        whose support is at least ``minSupport``; ``supportData`` maps every
        candidate that occurred in at least one transaction to its support.
    """
    # Materialize both inputs: D is measured with len() and Ck is walked once
    # per transaction, neither of which works on a one-shot iterator.
    transactions = list(D)
    candidates = list(Ck)

    ssCnt = {}
    for tid in transactions:
        for can in candidates:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(transactions))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData
def aprioriGen(Lk, k):  # creates Ck
    """Generate candidate itemsets by merging pairs of itemsets from ``Lk``.

    Two itemsets are merged (set union) when their first ``k - 2`` items, taken
    in sorted order, are equal.

    Args:
        Lk: list of ``frozenset`` itemsets.
        k: target size used to derive the compared prefix length ``k - 2``.

    Returns:
        List of merged itemsets, one per matching pair ``(i, j)`` with ``i < j``.
    """
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        # Sort BEFORE slicing: set iteration order is arbitrary, so slicing the
        # unsorted list could compare different items for equal-prefix sets.
        prefix_i = sorted(Lk[i])[:k - 2]
        for j in range(i + 1, lenLk):
            prefix_j = sorted(Lk[j])[:k - 2]
            # Trace output kept from the original code.
            print("%s %s" % (prefix_i, prefix_j))
            if prefix_i == prefix_j:  # first k-2 elements are equal
                retList.append(Lk[i] | Lk[j])  # set union
    return retList
def apriori(dataSet, minSupport=0.5):
    """Run the level-wise frequent-itemset search over ``dataSet``.

    Args:
        dataSet: sequence of transactions, each an iterable of items.
        minSupport: minimum support passed through to ``scanD``.

    Returns:
        A tuple ``(L, supportData)``. ``L[i]`` is the list of itemsets that
        ``scanD`` kept at level ``i`` (the loop stops after appending an empty
        level, so the last entry of ``L`` is empty). ``supportData``
        accumulates the support dicts returned by every ``scanD`` call.
    """
    C1 = list(createC1(dataSet))
    # A concrete list, not a lazy map: D is scanned once per level.
    D = [set(transaction) for transaction in dataSet]
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while len(L[k - 2]) > 0:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)  # scan DB to get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
def generateRules(L, supportData, minConf=0.7):
    """Collect association rules from the frequent itemsets in ``L``.

    Args:
        L: list of itemset lists; index 0 is skipped so that only sets with
            two or more items are considered.
        supportData: dict of itemset supports (as returned by ``scanD``).
        minConf: minimum confidence passed through to the helpers.

    Returns:
        The list ``bigRuleList`` that is handed to ``calcConf`` /
        ``rulesFromConseq`` for them to fill in.
    """
    bigRuleList = []
    for i in range(1, len(L)):  # only get the sets with two or more items
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate candidate consequents of ``freqSet`` and record confident rules.

    For each ``conseq`` in ``H`` the confidence of the rule
    ``(freqSet - conseq) --> conseq`` is computed as
    ``supportData[freqSet] / supportData[freqSet - conseq]``.

    Args:
        freqSet: the frequent itemset (``frozenset``) being split into rules.
        H: iterable of candidate consequents (``frozenset`` objects).
        supportData: dict mapping itemsets to their support.
        brl: list that receives a ``(antecedent, consequent, confidence)``
            tuple for every rule meeting ``minConf``; mutated in place.
        minConf: minimum confidence for a rule to be kept.

    Returns:
        List of the consequents whose rule met ``minConf``.

    Raises:
        KeyError: if ``freqSet`` or an antecedent is missing from
            ``supportData``.
    """
    prunedH = []  # consequents that survive the confidence threshold
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]  # calc confidence
        if conf >= minConf:
            # A single formatted string prints the same text under both the
            # print statement and the print function.
            print("%s --> %s conf: %s" % (antecedent, conseq, conf))
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
def?rulesFromConseq(freqSet?H?supportData?brl?minConf=0.7):
????m?=?len(H[0])
????if?(len(freqSet)?>?(m?+?1)):?#try?further?merging
????????Hmp1?=?aprioriGen(H?m+1)#create?Hm+1?new?candidates
????????Hmp1?
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2015-10-11 04:20  chinese_text_cluster-master\
     目錄           0  2015-10-11 04:20  chinese_text_cluster-master\Association_Analysis\
     目錄           0  2015-10-11 04:20  chinese_text_cluster-master\Association_Analysis\Apriori\
     文件        6012  2015-10-11 04:20  chinese_text_cluster-master\Association_Analysis\Apriori\apriori.py
     文件        5189  2015-10-11 04:20  chinese_text_cluster-master\Association_Analysis\Apriori\apriori.pyc
     文件       38906  2015-10-11 04:20  chinese_text_cluster-master\Association_Analysis\Apriori\bills20DataSet.txt
     文件      137426  2015-10-11 04:20  chinese_text_cluster-master\Association_Analysis\Apriori\lawAssnRules.txt
     文件        1806  2015-10-11 04:20  chinese_text_cluster-master\Association_Analysis\Apriori\meaning20.txt
     文件      570408  2015-10-11 04:20  chinese_text_cluster-master\Association_Analysis\Apriori\mushroom.dat
     文件        5585  2015-10-11 04:20  chinese_text_cluster-master\Association_Analysis\Apriori\recent100bills.txt
     文件        1050  2015-10-11 04:20  chinese_text_cluster-master\Association_Analysis\Apriori\recent20bills.txt
     目錄           0  2015-10-11 04:20  chinese_text_cluster-master\Association_Analysis\FP-growth\
     文件        6615  2015-10-11 04:20  chinese_text_cluster-master\Association_Analysis\FP-growth\fpGrowth.py
     目錄           0  2015-10-11 04:20  chinese_text_cluster-master\Classification\
     目錄           0  2015-10-11 04:20  chinese_text_cluster-master\Classification\AdaBoost\
     目錄           0  2015-10-11 04:20  chinese_text_cluster-master\Classification\AdaBoost\EXTRAS\
     文件         522  2015-10-11 04:20  chinese_text_cluster-master\Classification\AdaBoost\EXTRAS\README.txt
     文件         784  2015-10-11 04:20  chinese_text_cluster-master\Classification\AdaBoost\EXTRAS\simpleDataPlot.py
     文件        5548  2015-10-11 04:20  chinese_text_cluster-master\Classification\AdaBoost\adaboost.py
     文件        4719  2015-10-11 04:20  chinese_text_cluster-master\Classification\AdaBoost\adaboost.pyc
     文件       13614  2015-10-11 04:20  chinese_text_cluster-master\Classification\AdaBoost\horseColicTest2.txt
     文件       60778  2015-10-11 04:20  chinese_text_cluster-master\Classification\AdaBoost\horseColicTraining2.txt
     文件        3462  2015-10-11 04:20  chinese_text_cluster-master\Classification\AdaBoost\old_adaboost.py
     目錄           0  2015-10-11 04:20  chinese_text_cluster-master\Classification\Bayes\
     目錄           0  2015-10-11 04:20  chinese_text_cluster-master\Classification\Bayes\EXTRAS\
     文件         522  2015-10-11 04:20  chinese_text_cluster-master\Classification\Bayes\EXTRAS\README.txt
     文件         961  2015-10-11 04:20  chinese_text_cluster-master\Classification\Bayes\EXTRAS\create2Normal.py
     文件         456  2015-10-11 04:20  chinese_text_cluster-master\Classification\Bayes\EXTRAS\monoDemo.py
     文件        7247  2015-10-11 04:20  chinese_text_cluster-master\Classification\Bayes\bayes.py
     文件        6957  2015-10-11 04:20  chinese_text_cluster-master\Classification\Bayes\bayes.pyc
     文件       15141  2015-10-11 04:20  chinese_text_cluster-master\Classification\Bayes\email.zip
............此處省略3084個文件信息
評論
共有 條評論