資源簡介
對上個爬蟲代碼的補充，主要用於武漢理工大學課表的爬取
代碼片段和文件信息
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#author:universtar
#time:18/4/12
import re
import time
from urllib import parse
from urllib import request

from bs4 import BeautifulSoup
# Request headers sent with every HTTP call; the User-Agent is a desktop
# Chrome browser string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
}
# Target URL: the login endpoint the credentials are posted to.
url = 'http://sso.jwc.whut.edu.cn/Certification//login.do'
# Fetch the HTML returned by the login request.
def get_html(url, userName, password):
    """POST the login form to *url* and return the raw response body.

    Args:
        url: Address the form is submitted to.
        userName: Value sent in the ``userName`` form field.
        password: Value sent in the ``password`` form field.

    Returns:
        The response body as ``bytes``, exactly as read from the server.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails (``HTTPError`` is a subclass).
    """
    # Form fields submitted with the login request.
    data = {
        'systemId': '',
        'xmlmsg': '',
        'userName': userName,
        'password': password,
        'type': 'xs',  # NOTE(review): presumably selects the student login; confirm
    }
    # URL-encode the form and turn it into bytes; passing ``data`` to
    # Request is what makes urllib issue a POST instead of a GET.
    data = parse.urlencode(data).encode('utf-8')
    # Build the request using the module-level ``headers``.
    req = request.Request(url=url, headers=headers, data=data)
    # The context manager closes the connection once the body has been read.
    with request.urlopen(req) as response:
        html = response.read()
    return html
#
def?get_info(htmlresponse):
????#獲得soup對象
????soup?=?BeautifulSoup(htmlresponse?‘html.parser‘?from_encoding=‘utf-8‘)
????#從soup對象中截取到所要的信息
????infos?=?soup.find_all(‘div‘style=“margin-top:?2px;?font-size:?10px“)
評論
共有 條評論