-
大小: 13KB  文件類型: .zip  金幣: 2  下載: 0 次  發布日期: 2021-05-25
- 語言: Python
- 標簽: k-means  python  數據
資源簡介
k-means的python實現及數據,使用numpy實現了k-means的算法實例

代碼片段和文件信息
import numpy as np
import matplotlib.pyplot as plt
# Return the index of the centroid closest to a sample.
def group_one(sample, centers):
    """Return the row index of the centroid in ``centers`` nearest to ``sample``.

    Squared Euclidean distance is used; the square root is skipped because
    it does not change which centroid is the closest.

    Args:
        sample: One sample, a 1-D array-like with d coordinates.
        centers: Centroids, a 2-D array-like with one centroid per row
            and d columns.

    Returns:
        The index (as returned by ``np.argmin``) of the nearest centroid.
        On ties the lowest index wins.
    """
    offsets = np.asarray(sample) - np.asarray(centers)
    distance_vect = np.sum(offsets ** 2, axis=1)
    return np.argmin(distance_vect)
# Assign every sample to its nearest centroid and return the k groups as a
# nested list: [[samples of group 0], [samples of group 1], ...].
def group_all(data, k, centers):
    """Partition ``data`` into ``k`` groups by nearest centroid.

    Args:
        data: Samples, a 2-D array-like with one sample per row.
        k: Number of groups to create.
        centers: Centroids, a 2-D array-like with one centroid per row and
            the same number of columns as ``data``.

    Returns:
        A list of ``k`` lists. ``result[i]`` holds, as plain Python lists,
        the samples whose nearest centroid (squared Euclidean distance,
        lowest index on ties) is ``centers[i]``. A group may be empty.
    """
    data = np.asarray(data)
    centers = np.asarray(centers)

    # A list of lists is used instead of an ndarray because the size of each
    # group is not known in advance. An ndarray has a fixed shape, and
    # forcing the conversion would yield an array of list objects, which is
    # slow and error-prone.
    groups = [[] for _ in range(k)]

    # Distance from every sample to every centroid in one broadcast
    # operation: rows are samples, columns are centroids.
    offsets = data[:, np.newaxis, :] - centers[np.newaxis, :, :]
    sq_dist = np.sum(offsets ** 2, axis=2)
    nearest = np.argmin(sq_dist, axis=1)

    for sample, index in zip(data, nearest):
        groups[index].append(sample.tolist())
    return groups
# Recompute every centroid from the samples currently assigned to it.
def update_centers(data, k, groups):
    """Return the new centroids for the given grouping.

    Args:
        data: Samples, a 2-D array-like with one sample per row. Used for
            the sample dimension and to re-seed empty groups.
        k: Number of centroids.
        groups: Sequence of ``k`` groups, each a list of samples.

    Returns:
        A float ndarray of shape ``(k, data.shape[1])``. Row ``i`` is the
        mean of ``groups[i]``; if ``groups[i]`` is empty, row ``i`` is a
        randomly chosen row of ``data``.
    """
    data = np.asarray(data)
    centers = np.zeros((k, data.shape[1]))
    for index in range(k):
        members = groups[index]
        if len(members) > 0:
            centers[index] = np.mean(np.array(members), axis=0)
        else:
            # The mean of an empty group is NaN, and np.argmin treats NaN as
            # the minimum, so a NaN centroid would capture every sample on
            # the next assignment pass. Re-seed the cluster from the data.
            centers[index] = data[np.random.randint(data.shape[0])]
    return centers
# Measure how far the centroids moved since the previous iteration.
def iter_diff(old_centers, new_centers):
    """Return the sum of absolute element-wise differences of two centroid sets.

    Args:
        old_centers: Centroids before the update (array-like).
        new_centers: Centroids after the update (array-like, same shape).

    Returns:
        The summed absolute difference over all coordinates; ``0`` means
        no centroid moved.
    """
    delta = np.asarray(old_centers) - np.asarray(new_centers)
    return np.sum(np.abs(delta))
# Generate random initial centroids.
def rand_center(data, k):
    """Return ``k`` random centroids inside the bounding box of ``data``.

    Args:
        data: Samples, a 2-D array-like with one sample per row.
        k: Number of centroids to generate.

    Returns:
        A float ndarray of shape ``(k, data.shape[1])``; a centroid has the
        same number of coordinates as a sample. Each coordinate is drawn
        uniformly between the minimum and maximum of that column of
        ``data``.
    """
    data = np.asarray(data)
    low = np.min(data, axis=0)
    high = np.max(data, axis=0)
    # np.random.rand yields values in [0, 1). Scaling by the column maximum
    # alone would ignore the data's lower bound and misplace centroids for
    # negative or offset data, so map onto [low, high) instead.
    unit = np.random.rand(k, data.shape[1])
    return low + unit * (high - low)
# Main k-means iteration.
def classify(data, k, threshold, max_iter=0):
    """Cluster ``data`` into ``k`` groups with k-means.

    Starting from random centroids, samples are repeatedly assigned to
    their nearest centroid and the centroids recomputed. Progress is
    printed once per iteration.

    Args:
        data: Samples, a 2-D array with one sample per row.
        k: Number of clusters.
        threshold: Iteration stops once the centroid movement reported by
            ``iter_diff`` is no longer greater than this value.
        max_iter: Maximum number of iterations; ``0`` (the default) means
            no limit, so only ``threshold`` ends the loop.

    Returns:
        A tuple ``(centers, groups)``: the final centroids and the grouping
        of ``data`` against those centroids, as returned by ``group_all``.
    """
    centers = rand_center(data, k)
    loss = float("inf")
    iter_count = 0
    # Stop when the loss is no greater than the threshold, or when the
    # iteration cap (if one was given) has been reached.
    while loss > threshold and (max_iter == 0 or iter_count < max_iter):
        groups = group_all(data, k, centers)
        old_centers = centers
        centers = update_centers(data, k, groups)
        loss = iter_diff(old_centers, centers)
        iter_count += 1
        print("iter_%d : loss=%f" % (iter_count, loss))
    # Regroup against the final centroids: the grouping from the loop was
    # computed before the last centroid update, and none exists at all if
    # the loop body never ran.
    groups = group_all(data, k, centers)
    return centers, groups
# Plot the clustering result.
def paint_result(data, centers, k, groups, debug=False):
    """Show a scatter plot of the grouped samples and their centroids.

    Only the first two coordinates of each sample and centroid are drawn.

    Args:
        data: Unused; kept so existing callers keep working.
        centers: Centroids, a 2-D ndarray with one centroid per row.
        k: Number of groups to draw from ``groups``.
        groups: Sequence of at least ``k`` groups, each a list of samples.
        debug: When true, draw all samples in a single colour instead of
            colouring them by group index.
    """
    labels = []
    flatten_group = []
    for index in range(k):
        for item in groups[index]:
            labels.append(index)
            flatten_group.append(item)
    points = np.array(flatten_group)
    # With no samples the array is 1-D and column indexing would fail, so
    # only the centroids are drawn in that case.
    if points.size:
        if debug:
            plt.scatter(points[:, 0], points[:, 1])
        else:
            plt.scatter(points[:, 0], points[:, 1], c=labels)
    plt.scatter(centers[:, 0], centers[:, 1], color="red")
    plt.show()
def main(path="d:/data.csv", k=3):
    """Load 2-D samples from a CSV file, cluster them and plot the result.

    Args:
        path: Comma-separated text file of numeric values.
        k: Number of clusters.

    Raises:
        ValueError: If the number of values in the file is odd, so they
            cannot be arranged as two-column samples.
    """
    data = np.loadtxt(path, delimiter=",")
    # reshape (unlike an in-place resize to a fixed sample count) never pads
    # with zeros or drops values; it fails loudly on an incompatible size.
    data = data.reshape(-1, 2)
    centers, groups = classify(data, k, 0, 0)
    paint_result(data, centers, k, groups)


if __name__ == '__main__':
    main()
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件       25501  2018-03-18 23:56  data.csv
     文件        3160  2018-03-19 22:17  kmeans.py
評論
共有 條評論