資源簡介
利用十大經典機器學習算法之一的SVM(支持向量機)算法,實現文本分類,用於自然語言處理。
代碼片段和文件信息
from __future__ import print_function

import math
import os
import re
import time

import numpy as np
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# Script-level configuration.
# NOTE(review): trainscp/testscp look like sample-list files and
# outpath/latentpath like directory or file-name prefixes -- confirm against
# the part of the script that consumes them.
trainscp = 'train.txt'
testscp = 'test.txt'
outpath = 'result'
latentpath = 'latent_topic'
# Presumably the open() mode used for the result file ('a' = append) -- verify.
mode = 'a'
# Plain string concatenation (no path separator): yields 'resultRESULT.txt'.
resultfile = outpath + 'RESULT' + '.txt'
# Default for the ``norm`` argument of the file2matrix loaders.
normflag = False
def normalizeModel_1d(a, out=None):
    """Scale a 1-D vector to unit Euclidean (L2) length.

    Args:
        a: 1-D array-like of numbers.
        out: Optional array with the same shape as ``a`` that receives the
            result. A new array is allocated when omitted.

    Returns:
        ``out``, holding ``a`` divided by its L2 norm. When the norm is zero
        or ``a`` has exactly one element, the values of ``a`` are copied
        through unchanged.
    """
    a = np.asarray(a)
    if out is None:
        # An integer buffer cannot hold the fractional result of the division,
        # so non-floating input gets a float64 output buffer.
        dtype = a.dtype if np.issubdtype(a.dtype, np.inexact) else np.float64
        out = np.empty_like(a, dtype=dtype)
    length = math.sqrt(np.sum(a ** 2))
    if length != 0.0 and len(a) != 1:
        np.divide(a, length, out)
    else:
        # np.empty_like leaves memory uninitialised; without this copy the
        # caller would receive arbitrary values for the skipped cases.
        out[...] = a
    return out
def normalizeModel(M, axis=0, out=None):
    """L2-normalize the vectors of an array with ``normalizeModel_1d``.

    Args:
        M: Array-like. A 1-D input is normalized as a single vector.
        axis: ``0`` normalizes each column of ``M``; any other value
            normalizes each row.
        out: Optional array with the same shape as ``M`` that receives the
            result. A new array is allocated when omitted.

    Returns:
        The array holding the normalized vectors (``out`` when it was given).
    """
    M = np.asarray(M)
    if M.ndim == 1:
        return normalizeModel_1d(M, out)
    if out is None:
        # Non-floating input needs a float buffer to hold the scaled values.
        dtype = M.dtype if np.issubdtype(M.dtype, np.inexact) else np.float64
        out = np.empty_like(M, dtype=dtype)
    # Transposed views let the same row-wise loop walk the columns; writes to
    # the view land in ``out`` itself.
    by_column = axis == 0
    sources = M.T if by_column else M
    targets = out.T if by_column else out
    for vector, target in zip(sources, targets):
        normalizeModel_1d(vector, target)
    return out
????????
def file2matrix(filelist, tfidfpath, norm=normflag):
    """Load per-sample feature files into a sample matrix and a label vector.

    Each non-blank line of ``filelist`` is split on whitespace: the first
    field is a feature-file name relative to ``tfidfpath`` and the second is
    the label, converted to an integer after removing the substring ``ENG``.
    Every feature file holds one float per line. The number of lines in the
    first listed file fixes the feature dimension.

    Args:
        filelist: Path of the text file listing the samples.
        tfidfpath: Directory containing the feature files.
        norm: When true, scale every sample row to unit L2 length. Rows whose
            norm is zero are left as they are.

    Returns:
        ``(SampleMat, Label)``: a float array of shape
        ``(n_samples, n_features)`` and an integer array of length
        ``n_samples``. Both are empty when ``filelist`` has no entries.

    Raises:
        IndexError: If a list line has fewer than two fields, or a feature
            file has more lines than the first one.
        ValueError: If a label or a feature value cannot be parsed.
    """
    with open(filelist) as list_file:
        # Blank lines (e.g. a trailing newline) carry no sample; skip them.
        entries = [line.split() for line in list_file if line.strip()]
    if not entries:
        return np.zeros((0, 0), dtype=float), np.zeros(0, dtype=int)

    with open(os.path.join(tfidfpath, entries[0][0]), 'r') as fd:
        fea_dim = sum(1 for _ in fd)

    SampleMat = np.zeros((len(entries), fea_dim), dtype=float)
    # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
    Label = np.zeros(len(entries), dtype=int)
    for row, fields in enumerate(entries):
        Label[row] = int(fields[1].replace('ENG', ''))
        with open(os.path.join(tfidfpath, fields[0]), 'r') as fd:
            for col, value in enumerate(fd):
                SampleMat[row, col] = float(value)
        if norm:
            sample = SampleMat[row]
            length = math.sqrt(float(np.dot(sample, sample)))
            # Dividing an all-zero row by its norm would fill it with NaN.
            if length != 0.0:
                sample /= length
    return SampleMat, Label
def?file2matrix_v2(filestfidfpath?norm=normflag):
????number_of_samples?=?len(files)?
????list_of_line=files[0].split()
????FileTFIDF=os.path.join(tfidfpathfiles[0].split()[0])
????fd=open(FileTFIDF‘r‘)
????fea_dim=len(fd.readlines())
????fd.close()
????SampleMat?=?np.zeros((number_of_samplesfea_dim)dtype=float)
????Label=np.zeros((number_of_samples)dtype=np.int)
????fileindex?=?0
????for?file?in?files:
????????list_of_line=file.split()
????????FileTFIDF=os.path.join(tfidfpathlist_of_line[0])
????????str1=re.subn(‘ENG‘‘‘list_of_line[1])
????????topicid=str1[0]
????????Label[fileindex]=topicid
????????fd=open(FileTFIDF‘r‘)
????????contents=fd.readlines()
????????fea_col=0
????????normValue?=?0.0
????????for?ele?in?contents:
????????????SampleMat[fil
- 上一篇:python圖像處理.rar
- 下一篇:天天酷跑Python.docx
評論
共有 條評論