資源簡介
主題爬蟲的完整實現,具有文章內容判重、主題相似度計算、URL 去重、通用正文抽取算法、網頁內容分詞、關鍵詞自動抽取等功能。

代碼片段和文件信息
# -*- coding:utf-8 -*-
class?ContentExtract:
????def?__init__(self):
????????pass
????def?extract(self?content):
????????#?塊
????????block?=?[]
????????lines?=?content.split(‘\n‘)
????????count?=?0
????????str_temp?=?‘‘
????????#?塊長度
????????line_len?=?[]
????????for?line?in?lines:
????????????str_temp?=?str_temp?+?line?+?“?“
????????????count?=?count?+?1
????????????if?count?==?3:
????????????????str_temp?=?str_temp.replace(“\n“?““)
????????????????str_temp?=?str_temp.replace(“\t“?““)
????????????????str_temp?=?str_temp.replace(“\r“?““)
????????????????str_temp?=?str_temp.replace(“?“?““)
????????????????#?加入塊
????????????????block.append(str_temp)
????????????????#?加入塊長度
????????????????line_len.append(len(str_temp))
????????????????str_temp?=?‘‘
????????????????count?=?0
????????count_len?=?0
????????leng?=?len(block)
????????#?while?count_len?????????#?????print?count_len?line_len[count_len]?block[count_len]
????????#?????count_len?=?count_len?+?1
????????count_start?=?0
????????start_true?=?0
????????end_true?=?0
????????line_choice_true?=?[]
????????content_true?=?‘‘
????????start_temp?=?0
????????end_temp?=?0
????????line_choice_temp?=?[]
????????content_temp?=?‘‘
????????while?count_start?????????????#?起始行的長度大于50,起始行的前一行要小于30,起始行的后一行要大于50,真正起始行要大于上一個的起始行
????????????if?line_len[count_start]?>?50??and?line_len[count_start-1]?50?and?count_start-1?not?in?line_choice_temp?and?(line_len[count_start+1]?>?50?or?line_len[count_start]?>?300):
????????????????line_choice_temp?=?[]
????????????????content_temp?=?‘‘
????????????????start_temp?=?count_start
????????????????#?print?‘22‘
????????????????line_choice_temp.append(count_start)??#?將起始行加入選中的塊號
????????????????content_temp?=?content_temp?+?block[start_temp]??#?將起始行加入正文的字符串
????????????????#?print?“start:“?line_choice_temp
????????????#?如果該塊不是起始行,該塊的長度小于5,該塊的前一塊在選中的塊中,就將該塊記作結束塊
????????????elif?line_len[count_start]?==?0?and?count_start?-?1?in?line_choice_temp:
????????????????if?line_len[count_start+1]?5:
????????????????????end_temp?=?count_start
????????????????????#?print?“end:“?line_choice_temp
????????????????????#?print?len(content_temp)
????????????????????#?print?len(content_true)
????????????????????#?print?line_len[count_start+1]
????????????????????if?len(content_temp)?>?len(content_true):
????????????????????????#?for?item?in?line_choice_temp:
????????????????????????#?????line_choice_true.append(item)
????????????????????????#?print?len(content_temp)
????????????????????????#?print?len(content_true)
????????????????????????line_choice_true?=?line_choice_temp
????????????????????????start_true?=?start_temp
????????????????????????end_true?=?end_temp
????????????????????????content_true?=?content_temp
????????????????else:
????????????????????line_choice_temp.append(count_start)
????????????????????content_temp?=?content_temp?+?block[count_start]
??????????????????
 屬性        大小     日期        時間   名稱
 ----  ---------  ----------  -----  ----
 目錄          0  2016-12-18  16:05  network_spider\
 文件       3962  2016-12-03  23:52  network_spider\html_contentextract.py
 文件       1748  2016-12-03  23:52  network_spider\html_contentextract.pyc
 文件       1165  2016-12-03  23:52  network_spider\html_downloader.py
 文件       1295  2016-12-03  23:52  network_spider\html_downloader.pyc
 文件        953  2016-12-03  23:52  network_spider\html_manager.py
 文件       1692  2016-12-03  23:52  network_spider\html_manager.pyc
 文件        764  2016-12-03  23:52  network_spider\html_parser.py
 文件       1250  2016-12-03  23:52  network_spider\html_parser.pyc
 文件        124  2016-12-03  23:52  network_spider\keyword.txt
 文件       7271  2016-12-03  23:52  network_spider\main.py
 文件        854  2016-12-18  16:10  network_spider\mysql_manager.py
 文件       1438  2016-12-03  23:52  network_spider\mysql_manager.pyc
 文件        425  2016-12-04  14:41  network_spider\README.txt
 文件       6057  2016-12-03  23:52  network_spider\stopword.txt
 文件       2138  2016-12-03  23:52  network_spider\test.py
 文件       3035  2016-12-03  23:52  network_spider\test_similarity.py
 文件       3258  2016-12-03  23:52  network_spider\test_similarity.pyc
 文件     186458  2016-12-03  23:52  network_spider\test_strstrip.html
 文件          0  2016-12-03  23:52  network_spider\__init__.py
- 上一篇:Catia圓柱直斜齒輪生成插件
- 下一篇:sunlips一代軟件
評論
共有 條評論