資源簡介
K-means 文本聚類(原標題誤寫為 KNN),下載下來就能用,用於文本的無監督學習,運行時注意內存佔用

代碼片段和文件信息
# coding: utf-8
# In[1]:
import?matplotlib.pyplot?as?plt
from?collections?import?defaultdict
from?collections?import?Counter
from?scipy.sparse?import?csr_matrix?find
import?numpy?as?np
import?random
from?sklearn.utils?import?shuffle
from?sklearn.metrics?import?calinski_harabaz_score
def csr_build(dataIndex, value, nnz, nrows):
    """Assemble a CSR matrix from per-row column indices and values.

    Args:
        dataIndex: Iterable holding one sequence of column indices per row.
        value: Iterable holding one sequence of values per row, aligned
            element by element with ``dataIndex``.
        nnz: Total number of stored entries over all rows.
        nrows: Number of rows of the resulting matrix.

    Returns:
        A ``scipy.sparse.csr_matrix`` of dtype double whose column indices
        are sorted within each row. The column count is the largest stored
        index plus one, or zero when ``nnz`` is zero.

    Raises:
        IndexError: If the rows hold more than ``nnz`` entries in total, if
            a value row is shorter than its index row, or if more than
            ``nrows`` rows are supplied.
    """
    # ``np.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy, so the builtin is used directly.
    ind = np.zeros(nnz, dtype=int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=int)

    filled = 0      # number of entries written so far
    rows_seen = 0   # number of rows consumed from the inputs
    for row, (cols, vals) in enumerate(zip(dataIndex, value)):
        count = len(cols)
        end = filled + count
        if end > nnz:
            raise IndexError("rows contain more entries than nnz=%d" % nnz)
        if len(vals) < count:
            raise IndexError("row %d has fewer values than indices" % row)
        if count:
            # Copy the whole row at once instead of element by element.
            ind[filled:end] = cols
            val[filled:end] = vals[:count]
        ptr[row + 1] = end
        filled = end
        rows_seen = row + 1

    # Rows that were not supplied are empty; keep the row pointer monotonic.
    ptr[rows_seen + 1:] = filled

    # ``max`` of an empty array raises, so an empty matrix gets zero columns.
    ncols = int(ind.max()) + 1 if nnz else 0
    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()

    return mat
# scale matrix and normalize its rows
def csr_idf(mat, copy=False, **kargs):
    """Scale the stored values of a CSR matrix by inverse document frequency.

    The frequency of a column is the number of stored entries carrying that
    column index; its factor is ``log(nrows / frequency)``.
    NOTE(review): this equals the number of rows containing the column only
    if no row stores the same column index twice -- confirm against callers.

    Args:
        mat: CSR matrix exposing ``indices``, ``data`` and ``shape``.
        copy: When exactly ``True`` a copy is scaled and returned; when
            exactly ``False`` ``mat`` is scaled in place.
        **kargs: Accepted for call compatibility and ignored.

    Returns:
        With ``copy=False``: a ``defaultdict`` mapping column index to its
        idf factor. Otherwise: the scaled matrix only (the factors are not
        returned in that case).
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    ind, val = mat.indices, mat.data

    # Document frequency of every column that occurs, then its idf factor.
    cols, counts = np.unique(ind, return_counts=True)
    idf = np.log(nrows / counts.astype(float))

    df = defaultdict(int)
    df.update(zip(cols, idf))

    # ``cols`` is sorted, so searchsorted maps each stored column index to
    # the position of its factor. ``casting="unsafe"`` keeps the in-place
    # update working for non-float data, as the element-wise loop did.
    np.multiply(val, idf[np.searchsorted(cols, ind)], out=val,
                casting="unsafe")

    return df if copy is False else mat
def csr_l2normalize(mat, copy=False, **kargs):
    """Normalize the rows of a CSR matrix by their L2 norm.

    Rows whose sum of squares is zero are left untouched.

    Args:
        mat: CSR matrix exposing ``data``, ``indptr`` and ``shape``.
        copy: When exactly ``True`` a copy is normalized and returned;
            otherwise ``mat`` is normalized in place.
        **kargs: Accepted for call compatibility and ignored.

    Returns:
        The normalized copy when ``copy`` is ``True``, otherwise ``None``.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    val, ptr = mat.data, mat.indptr

    # Only the entries addressed by the row pointer take part.
    active = val[:ptr[-1]]
    # Row number of every stored entry, in storage order.
    row_of = np.repeat(np.arange(nrows), np.diff(ptr))

    sq_norms = np.bincount(row_of, weights=active.astype(float) ** 2,
                           minlength=nrows)

    # A factor of 1.0 leaves zero-norm rows unchanged.
    scale = np.ones(nrows)
    nonzero = sq_norms != 0.0
    scale[nonzero] = 1.0 / np.sqrt(sq_norms[nonzero])

    # ``active`` is a view of ``mat.data``, so this updates the matrix.
    np.multiply(active, scale[row_of], out=active, casting="unsafe")

    if copy is True:
        return mat
def initCentorids(x, k, random_state=0):
    """Pick ``k`` rows of ``x`` as initial centroids.

    The rows are shuffled with ``sklearn.utils.shuffle`` and the first ``k``
    rows of the shuffled result are returned.

    Args:
        x: Row-indexable 2-D data accepted by ``sklearn.utils.shuffle``.
        k: Number of centroids to select.
        random_state: Seed passed to ``shuffle``; the default of 0 keeps the
            selection identical to the previously hard-coded behaviour.

    Returns:
        The first ``k`` rows of the shuffled data.
    """
    x_shuffle = shuffle(x, random_state=random_state)
    return x_shuffle[:k, :]
# In[15]:
def sim(x1, x2):
    """Return the matrix of dot products between rows of ``x1`` and ``x2``.

    Args:
        x1: Matrix-like object providing ``dot``.
        x2: Matrix-like object providing ``T``.

    Returns:
        ``x1.dot(x2.T)``; entry ``(i, j)`` is the dot product of row ``i`` of
        ``x1`` with row ``j`` of ``x2``.
    """
    return x1.dot(x2.T)
# In[16]:
def findCentroids(mat, centroids):
    """Assign every row of ``mat`` to its most similar centroid.

    Similarities come from ``sim(mat, centroids)``; the result must support
    ``getrow`` (i.e. be a scipy sparse matrix).

    Args:
        mat: Matrix with one sample per row.
        centroids: Matrix with one centroid per row.

    Returns:
        A list with one entry per row of ``mat`` holding the 1-based
        position of the centroid with the highest similarity.
    """
    simsMatrix = sim(mat, centroids)
    idx = []
    for i in range(simsMatrix.shape[0]):
        # Densify one row at a time instead of the whole similarity matrix.
        row = simsMatrix.getrow(i).toarray()[0].ravel()
        # argsort()[-1] rather than argmax keeps the tie-breaking unchanged.
        idx.append(row.argsort()[-1] + 1)
    return idx
def?computeMeans(mat?idx?k):
????centroids?=?list()
????for?i?in?range(1k+1):
????????indi?=?[j?for?j?x?in?enumerate(idx)?if?x?==?i]
????????members?=?mat[indi:]
????????if?
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2017-05-09 23:43  Text_Clustering-master\
     目錄           0  2017-05-09 23:43  Text_Clustering-master\.ipynb_checkpoints\
     文件       10433  2017-05-09 23:43  Text_Clustering-master\.ipynb_checkpoints\Text_clustering-checkpoint.ipynb
     文件       47137  2017-05-09 23:43  Text_Clustering-master\1.png
     文件      198806  2017-05-09 23:43  Text_Clustering-master\HW6.pdf
     文件        3302  2017-05-09 23:43  Text_Clustering-master\README.md
     文件       20494  2017-05-09 23:43  Text_Clustering-master\Text_clustering.ipynb
     文件       17159  2017-05-09 23:43  Text_Clustering-master\output.dat.txt
     目錄           0  2017-05-09 23:43  Text_Clustering-master\report\
     文件       94769  2017-05-09 23:43  Text_Clustering-master\report\README.pdf
     目錄           0  2017-05-09 23:43  Text_Clustering-master\src\
     文件        4998  2017-05-09 23:43  Text_Clustering-master\src\Text_clustering.py
     文件     7369300  2017-05-09 23:43  Text_Clustering-master\src\train.dat.txt
     文件     7369300  2017-05-09 23:43  Text_Clustering-master\train.dat.txt
- 上一篇:企業微信開發升級版
- 下一篇:微信小程序源碼帶秒殺
評論
共有 條評論