91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

資源簡介

基於 SVM 的中文文本自動分類系統的語料庫，包含了 17 類，全部都是自己爬的。詳情：http://blog.csdn.net/yinchuandong2/article/details/17717449

資源截圖

代碼片段和文件信息

import?urllib2
import?urllib
import?re
import?chardet
import?sys

class?HTML_Tool:
????BgnCharToNoneRex?=?re.compile(“(\t|\n|?||)“)
????EndCharToNoneRex?=?re.compile(“<.*?>“)
????BgnPartRex?=?re.compile(““)
????CharToNewLineRex?=?re.compile(“(
|

||
|
)“)
????CharToNextTabRex?=?re.compile(““)
????replaceTab?=?[(“<““<“)(“>““>“)(“&““&“)(“&““\““)(“nbsp;““?“)]
????
????def?Replace_Char(selfx):
????????x?=?self.BgnCharToNoneRex.sub(““x)
????????x?=?self.BgnPartRex.sub(“\n????“x)
????????x?=?self.CharToNewLineRex.sub(“\n“x)
????????x?=?self.CharToNextTabRex.sub(“\t“x)
????????x?=?self.EndCharToNoneRex.sub(““x)

????????for?t?in?self.replaceTab:
????????????x?=?x.replace(t[0]t[1])
????????return?x

class?crawler:
????def?__init__(self):
????????self.page?=?11
????????self.myTool?=?HTML_Tool()
????????self.urllist?=?[]

????????self.index?=?1


????def?downloadpage(selfurl):?
????????myResponse??=?urllib2.urlopen(url)
????????myPage?=?myResponse.read()
????????typeEncode?=?sys.getfilesystemencoding()
????????infoencode?=?chardet.detect(myPage).get(‘encoding‘‘utf-8‘)
????????html?=?myPage.decode(infoencode‘ignore‘).encode(typeEncode)
????????links?=?re.findall(‘????????for?link?in?links:
????????????link?=‘http://studa.net‘?+?link
????????????self.download(link)
????????self.index?=+?1
????????url?=?“http://www.studa.net/dilidizhi/index0“?+?str(self.index)+“.html“
????????self.getIndexPage(url)

????def?download(selfurl):
????????print?url
????????url2?=?url.replace(“.html““-2.html“)
????????myResponse1??=?urllib2.urlopen(url)
????????myPage1?=?myResponse1.read()
????????myResponse2??=?urllib2.urlopen(url2)
????????myPage2?=?myResponse2.read()
????????typeEncode?=?sys.getfilesystemencoding()
????????infoencode?=?chardet.detect(myPage1).get(‘encoding‘‘utf-8‘)
????????html1?=?myPage1.decode(infoencode‘ignore‘).encode(typeEncode)
????????html2?=?myPage2.decode(infoencode‘ignore‘).encode(typeEncode)
????????myItems1?=?re.findall(‘(.*?)
‘html1re.S)
????????myItems2?=?re.findall(‘(.*?)
‘html2re.S)????????
????????file_object1?=?open(str(self.page)+‘.txt‘?‘w+‘)
????????file_object1.write(self.myTool.Replace_Char(myItems1[0]))
????????file_object1.close()
????????self.page?+=?1
????????file_object2?=?open(str(self.page)+‘.txt‘?‘w+‘)
????????file_object2.write(self.myTool.Replace_Char(myItems2[0]))
????????file_object2.close()
????????self.page?+=?1

????def?getIndexPage(self?url):
????????print?url
????????if?self.page?==?200:
????????????exit()
????????self.downloadpage(url)






crawler().getIndexPage(“http://www.studa.net/dilidizhi/index.html“)

 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2013-12-30 05:51  article\
     目錄           0  2013-12-30 05:55  article\農林\
     文件        4731  2013-12-30 05:55  article\農林\65.txt
     文件         634  2013-12-30 05:55  article\農林\50.txt
     文件        4482  2013-12-30 05:55  article\農林\20.txt
     文件        3430  2013-12-30 05:55  article\農林\34.txt
     文件        7697  2013-12-30 05:55  article\農林\72.txt
     文件        5572  2013-12-30 05:55  article\農林\52.txt
     文件        4169  2013-12-30 05:55  article\農林\9.txt
     文件        3933  2013-12-30 05:55  article\農林\45.txt
     文件        5998  2013-12-30 05:55  article\農林\10.txt
     文件        2957  2013-12-30 05:55  article\農林\27.txt
     文件        5434  2013-12-30 05:55  article\農林\23.txt
     文件        1594  2013-12-30 05:55  article\農林\75.txt
     文件        4278  2013-12-30 05:55  article\農林\63.txt
     文件        2596  2013-12-30 05:55  article\農林\1.txt
     文件        6722  2013-12-30 05:55  article\農林\74.txt
     文件        3596  2013-12-30 05:55  article\農林\16.txt
     文件        1990  2013-12-30 05:55  article\農林\44.txt
     文件        4474  2013-12-30 05:55  article\農林\78.txt
     文件        2224  2013-12-30 05:55  article\農林\2.txt
     文件        4001  2013-12-30 05:55  article\農林\15.txt
     文件        2907  2013-12-30 05:55  article\農林\67.txt
     文件        6233  2013-12-30 05:55  article\農林\80.txt
     文件        9977  2013-12-30 05:55  article\農林\71.txt
     文件        1636  2013-12-30 05:55  article\農林\13.txt
     文件        2492  2013-12-30 05:55  article\農林\76.txt
     文件        7723  2013-12-30 05:55  article\農林\51.txt
     文件        2667  2013-12-30 05:55  article\農林\32.txt
     文件        2896  2013-12-30 05:55  article\農林\56.txt
     文件        2145  2013-12-30 05:55  article\農林\40.txt
............此處省略3222個文件信息

評論

共有 條評論