資源簡介
此資源僅供學習用途。當前的 selenium 爬蟲都是基于無頭模式的 Firefox 或者 Chrome 等瀏覽器進行抓取;天眼查的反爬技術算是很不錯的,因此本代碼僅僅用于個人學習,并不可以進行大數據量的爬取。技術:python、selenium、爬蟲、模擬登陸、xpath、css 選擇器。可自己裝 proxy;想添加翻頁功能可以參考里面的代碼模板,也可以加我 QQ 問。必須將 geckodriver 放在與腳本相同的路徑下。記住:自己輸入賬號和密碼!每次輸入之后都要回車!

代碼片段和文件信息
#!/usr/bin/python
# -*- coding:utf-8 -*-
# author: Jola
# datetime:2018/4/20 17:15
# software-version: python 3.5
import?time
from?selenium?import?webdriver
from?selenium.webdriver?import?Firefox
class?GetCompanyInfo(object):
????“““
????爬取天眼查下的企業的信息
????“““
????def?__init__(self):
????????“““
????????初始化爬蟲執行代理,使用firefox訪問
????????“““
????????self.username?=?‘13160676288‘
????????self.password?=?‘panjie19970620‘
????????self.options?=?webdriver.FirefoxOptions()
????????self.options.add_argument(‘-headless‘)??#?無頭參數
????????self.geckodriver?=?r‘geckodriver.exe‘
????????self.driver?=?Firefox(executable_path=self.geckodriver?firefox_options=self.options)
????????self.start_url?=?‘https://www.tianyancha.com‘
????def?test(self):
????????“““
????????調試專用
????????:return:
????????“““
????????start_url?=?‘http://y2.twenteen.cn/Home/Index‘
????????self.driver.get(start_url)
????????cookies?=?{
????????????‘ASP.NET_SessionId‘:?‘v3gnz3zsx0l2vxqmszhzat4w‘
????????????‘Hm_lvt_ddd605dfec122be0f190ebb874331df1‘:?‘1524279814‘
????????????‘Hm_lpvt_ddd605dfec122be0f190ebb874331df1‘:?‘152428022‘
????????}
????????for?k?v?in?cookies.items():
????????????self.driver.add_cookie({
????????????????‘name‘:?k
????????????????‘value‘:?v
????????????})
????????time.sleep(1)
????????print(self.driver.page_source)
????????self.driver.close()
????def?login(self):
????????“““
????????登錄并檢查狀態
????????:return:
????????“““
????????try:
????????????self.driver.get(self.start_url)
????????????print(self.driver.get_cookies())
????????????username?=?self.index_login()
????????????username_pattern?=?username[:3]?+?‘?****?‘?+?username[-4:]
????????????print(username_pattern)
????????????page?=?self.driver.page_source
????????????is_login?=?page.find(username_pattern)
????????????print(is_login)
????????????if?is_login?!=?-1:
????????????????print(‘登錄成功‘)
????????except?Exception?as?e:
????????????print(e)
????def?index_login(self):
????????“““
????????主頁下的登錄模式
????????:return:
????????“““
????????get_login?=?self.driver.find_elements_by_xpath(‘//a[@class=“media_port“]‘)[0]???#?登錄/注冊
????????print(get_login.text)
????????#?url為login的input
????????get_login.click()
????????login_by_pwd?=?self.driver.find_element_by_xpath(‘//div[@class=“bgContent“]/div[2]/div[2]/div‘)?????#?切換到手機登錄
????????print(login_by_pwd.text)
????????login_by_pwd.click()
????????input1?=?self.driver.find_element_by_xpath(‘//div[@class=“bgContent“]/div[2]/div/div[2]/input‘)?????#?手機號碼
????????input2?=?self.driver.find_element_by_xpath(‘//div[@class=“bgContent“]/div[2]/div/div[3]/input‘)?????#?密碼
????????print(input1.get_attribute(‘placeholder‘))
????????print(input2.get_attribute(‘placeholder‘))
????????username?password?=?self._check_user_pass()
????????input1.send_keys(username)
????????input2.send_keys(password)
????????login_button?=?self.driver.find_element_by_xpath(‘//div[@class=“bgContent“]/div[2]/di
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件        9595  2018-04-21 22:42  crawl.py
     文件     9684296  2018-04-08 20:49  geckodriver.exe
評論
共有 條評論