91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

  • 大小: 18.23MB
    文件類型: .zip
    金幣: 1
    下載: 0 次
    發布日期: 2023-07-04
  • 語言: Python
  • 標簽:

資源簡介

爬取百度百科中文頁面,抽取三元組信息,構建中文知識圖譜

資源截圖

代碼片段和文件信息

import glob
import os
import pickle
import re
import sys
import threading
from pathlib import Path

from scrapy.selector import Selector

# Collect every downloaded page file; the worker threads consume this list.
print('loading pages')
pages = glob.glob('../webpages/*')
print('loading pages done.')
savepath = './paged.bin'

print(len(pages))
# Show one sample path, but only when the glob matched something: indexing an
# empty list would abort the script before any work starts.
if pages:
    print(pages[0])

# Paths of the pages that were extracted successfully; restored from the state
# file of a previous run when one exists, and saved again on exit.
paged = []
if os.path.exists(savepath):
    # NOTE: unpickling can execute arbitrary code; savepath must only ever
    # point at a state file written by this script itself.
    with open(savepath, 'rb') as state_file:
        paged = pickle.load(state_file)
    print('load state')
# NOTE(review): the restored list is only appended to and re-saved; nothing
# visible here removes already-recorded paths from `pages` -- confirm whether
# resuming is meant to skip them.

# Guards the shared `pages`, `paged` and `fail_file` objects across workers.
lock = threading.Lock()
# Left open on purpose: workers append failed page paths to it and the driver
# code at the bottom of the script closes it.
fail_file = open('./fail_para.txt', 'w')
class MyThread(threading.Thread):
    """Worker thread that drains the shared module-level ``pages`` list.

    Each worker repeatedly takes one page path from ``pages``, extracts its
    paragraph text with :meth:`extract`, and records the path in ``paged`` on
    success or in ``fail_file`` on failure. Access to those shared objects is
    serialized with the module-level ``lock``.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        # Cooperative stop flag checked by run() before taking the next page.
        self._running = True

    def terminate(self):
        """Ask the worker to stop once the page it is handling is finished."""
        self._running = False

    def extract(self, page):
        """Write the paragraph text of one saved HTML page to ``./info-para/``.

        Args:
            page: Path of the HTML file to read.

        The output file is named after the page's ``<h1>`` text with ``/``
        removed, plus ``.txt``. Any I/O or parsing error propagates to the
        caller.
        """
        with open(page, 'r') as source:
            html = source.read()
        # Use XPath to select the <div class="main-content"> element(s).
        line = Selector(text=html).xpath('//div[contains(@class, "main-content")]')
        # NOTE(review): the two nested queries below start with '//' rather
        # than './/', so they are evaluated against the whole document and not
        # only inside the main-content div. Kept as in the original -- confirm.
        title = line.xpath('//h1//text()').extract()
        words = line.xpath('//div[contains(@class, "para")]//text()').extract()
        # Drop single-character text nodes, then strip "[12]"-style markers.
        para = re.sub(r'\[[0-9]+\]', '', ''.join(word for word in words if len(word) > 1))
        # print(para)
        print('process file:' + str(title))
        with open('./info-para/' + ''.join(title).replace('/', '') + '.txt', 'w') as output:
            output.write(para)

    def run(self):
        """Process pages until the list is empty or terminate() was called."""
        while self._running:
            # Check for remaining work and take it under the same lock, so two
            # workers can never race for the last page.
            with lock:
                if not pages:
                    break
                page = pages.pop(0)
            try:
                self.extract(page)
            except Exception as e:
                # A bad page must not end the worker: log it, record the path
                # on its own line, and carry on with the next page.
                print('fail to extract..', str(e))
                with lock:
                    fail_file.write(page + '\n')
            else:
                with lock:
                    paged.append(page)



# Driver: run 12 extraction workers in parallel, then persist progress.
list_thread = []
try:
    print('start...')
    list_thread.extend(MyThread() for _ in range(12))
    # Start every worker before waiting on any of them; joining inside the
    # start loop would run the threads one after another.
    for th in list_thread:
        th.start()
    for th in list_thread:
        th.join()
except BaseException:
    # Reached on any error, including Ctrl-C: ask all workers to stop.
    for th in list_thread:
        th.terminate()
    print('error!', sys.exc_info()[0])
    # Let workers finish the page they are on, so the state saved below is
    # complete and fail_file is not closed while a worker still writes to it.
    for th in list_thread:
        if th.is_alive():
            th.join()
finally:
    print('save state')
    with open(savepath, 'wb') as state_file:
        pickle.dump(paged, state_file)
    fail_file.close()



 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2019-06-10 02:25  WEB_KG-master\
     文件          40  2019-06-10 02:25  WEB_KG-master\.gitignore
     文件        1177  2019-06-10 02:25  WEB_KG-master\README.md
     目錄           0  2019-06-10 02:25  WEB_KG-master\ie\
     文件        1788  2019-06-10 02:25  WEB_KG-master\ie\extract-para.py
     文件        2281  2019-06-10 02:25  WEB_KG-master\ie\extract-table.py
     目錄           0  2019-06-10 02:25  WEB_KG-master\kg\
     文件        1083  2019-06-10 02:25  WEB_KG-master\kg\build-triple-from-table.py
     文件        1175  2019-06-10 02:25  WEB_KG-master\kg\insert_to_neo4j.py
     文件      397289  2019-06-10 02:25  WEB_KG-master\kg\kg.png
     文件    53091044  2019-06-10 02:25  WEB_KG-master\kg\triples.txt
     目錄           0  2019-06-10 02:25  WEB_KG-master\spider\
     文件        1337  2019-06-10 02:25  WEB_KG-master\spider\html_downloader.py
     文件        2366  2019-06-10 02:25  WEB_KG-master\spider\html_parser.py
     文件        2189  2019-06-10 02:25  WEB_KG-master\spider\spider_main.py
     文件         648  2019-06-10 02:25  WEB_KG-master\spider\url_manager.py

評論

共有 條評論