-
大小: 11KB | 文件類型: .py | 金幣: 1 | 下載: 0 次 | 發布日期: 2023-12-20
- 語言: Python
- 標簽:
資源簡介
https://github.com/helloMickey/project_previous/tree/master/judicial-data-analysis
爬取法律判決書的日期、年份、處理法院,并下載相應文書。
代碼簡單修改參數即可爬取不同的案件
代碼片段和文件信息
# coding:utf-8
# Module setup for the judgment-document crawler.
# NOTE: this file targets Python 2 (urllib2, reload(sys), setdefaultencoding).
import socket
socket.setdefaulttimeout(60)  # process-wide default timeout for new sockets
import requests
import urllib2
# import cchardet
import os
import time
from lxml import etree
import threading
import re
import random
import sys
# Python 2 only: reload(sys) re-exposes setdefaultencoding so that implicit
# str <-> unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')
# filenames = os.listdir('.')
# count = 0
# for fname in filenames:
#     if fname.startswith('gid_log'):
#         count += 1
# gid_path = 'gid_log_%d' % (count)
# When steps 1 and 2 are run separately, make sure gid_path is set correctly.
# gid_path = 'gid_log_12'
def get_html(url, timeout=60):
    """Fetch *url* and return the response body decoded as text.

    The request is sent with a fixed set of browser-like headers aimed at
    www.pkulaw.cn, including a hard-coded session cookie.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout unless one is passed explicitly, so without
            this a stalled connection would block the crawl indefinitely.

    Returns:
        The page source as decoded by ``requests`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    # NOTE(review): the Cookie below was captured on 2016-10-15 (see
    # CheckIPDate) and embeds a session id; it presumably needs refreshing
    # before the site accepts it again - confirm before running.
    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Host": "www.pkulaw.cn",
        "Cookie": "bdyh_record=1970324860086081%2C1970324860087844%2C1970324860087837%2C1970324860087907%2C1970324860085114%2C1970324860087657%2C1970324860087697%2C1970324860087631%2C1970324860087701%2C1970324860087851%2C1970324860086614%2C1970324860000764%2C1970324845231811%2C1970324860004991%2C1970324860002384%2C1970324845231794%2C1970324845231624%2C1970324860002207%2C1970324860046814%2C1970324860046704%2C; CheckIPAuto=0; CheckIPDate=2016-10-15 10:03:46; gm3jc5afyl35gm2yt55kc4m1isIPlogin=1; ASP.NET_SessionId=davttbjhikxhqyn1lj5alhsb; Hm_lvt_58c470ff9657d300e66c7f33590e53a8=1476497011,1476498348,1476498528,1476499578; Hm_lpvt_58c470ff9657d300e66c7f33590e53a8=1476499578; Hm_lvt_8266968662c086f34b2a3e2ae9014bf8=1476497011,1476498348,1476498528,1476499578; Hm_lpvt_8266968662c086f34b2a3e2ae9014bf8=1476499578; CookieId=gm3jc5afyl35gm2yt55kc4m1; FWinCookie=1",
        "Upgrade-Insecure-Requests": "1",
        "Proxy-Connection": "keep-alive",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def write2file(content, filename):
    """Save a downloaded document to *filename* as UTF-8 text.

    If *filename* cannot be opened, the text is written instead to a fallback
    path built from the part of the name before the first u'、' plus
    '_error_filename.txt'.

    Args:
        content: Document text (unicode, or UTF-8 encoded bytes).
        filename: Destination path.

    Raises:
        IOError / OSError: if the fallback path cannot be opened either, or
            if writing fails.
    """
    # Local import: io.open accepts an encoding on both Python 2 and 3.
    import io

    if isinstance(content, bytes):
        content = content.decode('utf-8')
    try:
        f = io.open(filename, 'w', encoding='utf-8')
    except (IOError, OSError, ValueError):
        # ValueError also covers names the filesystem encoding cannot
        # represent (UnicodeEncodeError), not just missing paths.
        filename = filename.split(u'、')[0] + '_error_filename.txt'
        f = io.open(filename, 'w', encoding='utf-8')
    # The context manager closes the handle even when the write fails.
    with f:
        f.write(content)
def load_one_wenshu(gid, title):
    """Download the full text of one document and save it as '<title>.txt'.

    The document URL is built by substituting *gid* into the site's full-text
    endpoint. The text content of the page's <body> is extracted, stripped of
    surrounding whitespace and written with write2file().

    Args:
        gid: Document identifier inserted into the URL.
        title: Document title, used as the output file name.

    Raises:
        IndexError: if the parsed page has no <body> element.
    """
    ex_href = 'http://www.pkulaw.cn/case/FullText/_getFulltext?library=pfnl&gid=#gid#&loginSucc=0'
    href = ex_href.replace('#gid#', gid)
    html = get_html(href)
    page = etree.HTML(html)
    # string(.) concatenates every text node below <body>.
    content = page.xpath('body')[0].xpath('string(.)').strip()
    # NOTE(review): `filepath` is a module-level output directory that is
    # presumably defined elsewhere in the file - confirm it is set before
    # this function runs.
    write2file(content, os.path.join(filepath, title + '.txt'))
def?load_one_page_wenshu(gid_list?titles):??#?多線程抓取多個href的文書
????#?threads=[]???#?嘗試多線程加速?失敗?訪問頻繁?出現驗證碼?封ip
????#?for?i?in?range(len(gid_list)):
????#? gidtitle=gid_list[i]titles[i]
????#? threads.append(threading.Thread(target=load_one_wenshuargs=(gidtitle)))
????#?for?t?in?threads:
????#? t.start()
????#?t.join()??#?阻塞
????for?i?in?range(len(gid_list)):??#?順序爬取?時間過長?一個月大概需要20~30h
????????load_one_wenshu(
評論
共有 條評論