資源簡介
使用 Python 語言實現 Kaggle 數據挖掘入門實例「數字識別」(Digit Recognizer),附帶代碼所用的數據及算法運行得到的結果。算法運行時間約為 4 小時,主要是因為 kNN 算法計算量較大。
代碼片段和文件信息
#-*-coding:utf-8-*-
import csv
import operator
import os
import sys

from numpy import *
# Base directory that the loaders below prepend to the CSV file names.
# sys.path[0] is normally the directory of the script being run.
path=sys.path[0]
def loadTrainData():
    """Load the training set from ``path + '/train.csv'``.

    The first CSV row is discarded as a header.  In every remaining row the
    first column is taken as the label and the other columns as pixel values.

    Returns:
        A ``(data, label)`` tuple.  ``data`` is the pixel matrix after
        ``toint`` and ``nomalizing`` (every non-zero pixel becomes 1);
        ``label`` is a ``(rows, 1)`` array produced by ``toint``.
    """
    with open(path + '/train.csv') as handle:
        # The original author's note gives the table size as 839*785.
        rows = list(csv.reader(handle))
    del rows[0]  # drop the header row

    table = array(rows)
    row, col = len(table), len(table[0])
    print('train %d %d' % (row, col))

    # Column 0 holds the labels (still strings here; toint() parses them).
    label = toint(table[:, :1])
    # Remaining columns are pixels; go through float first so that values
    # such as "3.0" are accepted before toint() truncates them.
    data = toint(table[:, 1:].astype(float))
    return nomalizing(data), label
def loadTestdata():
    """Load the test set from ``path + '/test.csv'``.

    The first CSV row is discarded as a header; every remaining cell is
    treated as a pixel value.

    Returns:
        The pixel matrix after ``toint`` and ``nomalizing`` (every non-zero
        pixel becomes 1).
    """
    with open(path + '/test.csv') as handle:
        rows = list(csv.reader(handle))
    del rows[0]  # drop the header row

    row, col = len(rows), len(rows[0])
    print('test %d %d' % (row, col))

    # Parse the string cells as floats, then let toint() truncate them.
    data = toint(array(rows).astype(float))
    return nomalizing(data)
def loadTestResult():
    """Load reference labels from ``path + '/sample_submission.csv'``.

    The first CSV row is discarded as a header and the second column of each
    remaining row is taken as the label.  (The original author's note gives
    the file size as 28001*2.)

    Returns:
        A ``(rows, 1)`` array of labels produced by ``toint``.
    """
    with open(path + '/sample_submission.csv') as handle:
        rows = list(csv.reader(handle))
    del rows[0]  # drop the header row

    row, col = len(rows), len(rows[0])
    print('test %d %d' % (row, col))

    label = array([record[1] for record in rows]).reshape(len(rows), 1)
    return toint(label)
def nomalizing(array):
    """Binarize ``array`` in place: every non-zero entry becomes 1.

    Args:
        array: a NumPy array, or a mutable nested sequence (list of lists).

    Returns:
        The same object that was passed in, after modification.
    """
    if isinstance(array, ndarray):
        # One vectorized masked assignment instead of a Python double loop.
        array[array != 0] = 1
        return array
    # Fallback for plain nested lists, which do not support boolean masks.
    for row in array:
        for j, value in enumerate(row):
            if value != 0:
                row[j] = 1
    return array
def toint(array):
    """Convert every element of a 2-D table to an integer value.

    Each element is passed through ``int()``, so numeric strings are parsed
    and floats are truncated toward zero.  The table's shape is printed as a
    side effect.

    Args:
        array: a 2-D array or nested sequence of values accepted by ``int()``.

    Returns:
        A new array of the same shape created with ``zeros`` (so it keeps
        the default float dtype) holding the converted values.
    """
    m, n = shape(array)
    print('%d %d' % (m, n))
    newdata = zeros((m, n))
    for i in range(m):
        # Convert one whole row at a time instead of indexing cell by cell.
        newdata[i] = [int(value) for value in array[i]]
    return newdata
def classify(inX, dataSet, labels, k):
    """Classify one sample with k-nearest-neighbours majority voting.

    Args:
        inX: the sample to classify, a single feature vector.
        dataSet: training samples, one per row (42000*784 per the original
            author's notes).
        labels: one label per training row; a row vector, a column vector
            or a flat sequence are all accepted.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the k nearest training rows.
    """
    # ndmin=2 turns a flat vector into a single row, so broadcasting below
    # subtracts the query from every training row without tiling it.
    query = array(inX, ndmin=2)
    samples = array(dataSet, ndmin=2)
    flat_labels = array(labels).ravel()

    # Squared Euclidean distance to every training row.  The square root is
    # skipped because it does not change the ordering of the distances.
    squared_dist = ((samples - query) ** 2).sum(axis=1)
    # Indices of the training rows, closest first.
    nearest = squared_dist.argsort()

    votes = {}
    for rank in range(k):
        vote = flat_labels[nearest[rank]]
        votes[vote] = votes.get(vote, 0) + 1
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def saveResult(result):
    """Write predicted labels to ``result3.csv`` in the working directory.

    The file starts with an ``ImageId,Label`` header row, followed by one
    row per prediction with ids counting up from 1.  The list of integer
    labels is also printed.

    Args:
        result: a sequence of predictions, each convertible through
            ``int(float(value))``.
    """
    column = array(result).reshape(len(result), 1)
    labels = [int(float(value)) for value in column.ravel()]
    print(labels)

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        out = open('result3.csv', 'wb')
    else:
        out = open('result3.csv', 'w', newline='')
    with out:
        writer = csv.writer(out)
        writer.writerow(['ImageId', 'Label'])
        for image_id, label in enumerate(labels, 1):
            writer.writerow([str(image_id), label])
#with?open(path+‘/re
評論
共有 條評論