91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

資源簡介

主題爬蟲的完整實現，具有文章內容判重、主題相似度計算、URL 去重、通用正文抽取算法、網頁內容分詞、關鍵詞自動抽取等功能。

資源截圖

代碼片段和文件信息

#?-*-?coding:utf-8?-*-


class?ContentExtract:
????def?__init__(self):
????????pass

????def?extract(self?content):
????????#?塊
????????block?=?[]
????????lines?=?content.split(‘\n‘)
????????count?=?0
????????str_temp?=?‘‘
????????#?塊長度
????????line_len?=?[]
????????for?line?in?lines:
????????????str_temp?=?str_temp?+?line?+?“?“
????????????count?=?count?+?1
????????????if?count?==?3:
????????????????str_temp?=?str_temp.replace(“\n“?““)
????????????????str_temp?=?str_temp.replace(“\t“?““)
????????????????str_temp?=?str_temp.replace(“\r“?““)
????????????????str_temp?=?str_temp.replace(“?“?““)
????????????????#?加入塊
????????????????block.append(str_temp)
????????????????#?加入塊長度
????????????????line_len.append(len(str_temp))
????????????????str_temp?=?‘‘
????????????????count?=?0

????????count_len?=?0
????????leng?=?len(block)
????????#?while?count_len?????????#?????print?count_len?line_len[count_len]?block[count_len]
????????#?????count_len?=?count_len?+?1

????????count_start?=?0

????????start_true?=?0
????????end_true?=?0
????????line_choice_true?=?[]
????????content_true?=?‘‘

????????start_temp?=?0
????????end_temp?=?0
????????line_choice_temp?=?[]
????????content_temp?=?‘‘

????????while?count_start?????????????#?起始行的長度大于50,起始行的前一行要小于30,起始行的后一行要大于50,真正起始行要大于上一個的起始行
????????????if?line_len[count_start]?>?50??and?line_len[count_start-1]??50?or?line_len[count_start]?>?300):
????????????????line_choice_temp?=?[]
????????????????content_temp?=?‘‘
????????????????start_temp?=?count_start
????????????????#?print?‘22‘
????????????????line_choice_temp.append(count_start)??#?將起始行加入選中的塊號
????????????????content_temp?=?content_temp?+?block[start_temp]??#?將起始行加入正文的字符串
????????????????#?print?“start:“?line_choice_temp
????????????#?如果該塊不是起始行,該塊的長度小于5,該塊的前一塊在選中的塊中,就將該塊記作結束塊
????????????elif?line_len[count_start]?==?0?and?count_start?-?1?in?line_choice_temp:
????????????????if?line_len[count_start+1]?????????????????????end_temp?=?count_start
????????????????????#?print?“end:“?line_choice_temp
????????????????????#?print?len(content_temp)
????????????????????#?print?len(content_true)
????????????????????#?print?line_len[count_start+1]
????????????????????if?len(content_temp)?>?len(content_true):
????????????????????????#?for?item?in?line_choice_temp:
????????????????????????#?????line_choice_true.append(item)
????????????????????????#?print?len(content_temp)
????????????????????????#?print?len(content_true)
????????????????????????line_choice_true?=?line_choice_temp
????????????????????????start_true?=?start_temp
????????????????????????end_true?=?end_temp
????????????????????????content_true?=?content_temp
????????????????else:
????????????????????line_choice_temp.append(count_start)
????????????????????content_temp?=?content_temp?+?block[count_start]
??????????????????

 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2016-12-18 16:05  network_spider\
     文件        3962  2016-12-03 23:52  network_spider\html_contentextract.py
     文件        1748  2016-12-03 23:52  network_spider\html_contentextract.pyc
     文件        1165  2016-12-03 23:52  network_spider\html_downloader.py
     文件        1295  2016-12-03 23:52  network_spider\html_downloader.pyc
     文件         953  2016-12-03 23:52  network_spider\html_manager.py
     文件        1692  2016-12-03 23:52  network_spider\html_manager.pyc
     文件         764  2016-12-03 23:52  network_spider\html_parser.py
     文件        1250  2016-12-03 23:52  network_spider\html_parser.pyc
     文件         124  2016-12-03 23:52  network_spider\keyword.txt
     文件        7271  2016-12-03 23:52  network_spider\main.py
     文件         854  2016-12-18 16:10  network_spider\mysql_manager.py
     文件        1438  2016-12-03 23:52  network_spider\mysql_manager.pyc
     文件         425  2016-12-04 14:41  network_spider\README.txt
     文件        6057  2016-12-03 23:52  network_spider\stopword.txt
     文件        2138  2016-12-03 23:52  network_spider\test.py
     文件        3035  2016-12-03 23:52  network_spider\test_similarity.py
     文件        3258  2016-12-03 23:52  network_spider\test_similarity.pyc
     文件      186458  2016-12-03 23:52  network_spider\test_strstrip.html
     文件           0  2016-12-03 23:52  network_spider\__init__.py

評論

共有 條評論