資源簡介
利用selenium+pyquery對拉勾網進行爬取相應招聘信息,并且將爬取到的信息導入數據庫mysql中。
代碼片段和文件信息
# -- Module setup: selenium driver, explicit-wait helper, and scrape config. --
# (Restored from a copy/paste-mangled source: non-breaking spaces, smart
# quotes, and dropped commas made the original unparseable.)
import re
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
# from config import *
import pymysql

# Shared browser session and a 10-second explicit wait used by all helpers.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)

# Keyword typed into Lagou's search box.
key_word = 'python爬蟲'

# MySQL connection settings (password redacted in the published snippet).
host = "localhost"
user = "root"
password = "******"
db = "lagou"
TableName = 'shenzhen'

# Lagou hot-city filter: city name -> 1-based position of its link in the
# "other-hot-city" list; consumed by search() to build an nth-child selector.
sitys = {'beijing': '1', 'shanghai': '2', 'shenzhen': '3', 'guangzhou': '4',
         'hangzhou': '5', 'chengdou': '6', 'nanjing': '7', 'wuhan': '8',
         'xian': '9', 'xiamen': '10'}
key_sity = 'guangzhou'
def search():
    """Search Lagou for ``key_word`` filtered to ``key_sity``.

    Opens the home page, dismisses the city popup, submits the keyword,
    clicks the configured city in the hot-city filter, and returns
    ``(total_page_text, job_num_text)`` read from the results page.
    Retries recursively when an explicit wait times out.
    """
    # Local import: selenium waits raise TimeoutException, not the builtin
    # TimeoutError the original snippet caught (that handler never fired).
    from selenium.common.exceptions import TimeoutException
    try:
        url = 'https://www.lagou.com/'
        browser.get(url)
        # Dismiss the city-selection overlay if it shows up.
        if wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#cboxClose'))):
            close_submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#cboxClose')))
            close_submit.click()
        # Renamed from `input` to avoid shadowing the builtin.
        search_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#search_input')))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#search_button')))
        time.sleep(1)
        search_input.clear()
        search_input.send_keys(key_word)
        submit.click()
        # Pick the target city via its 1-based position in the hot-city links.
        city_select = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,
            '#filterCollapse > div:nth-child(1) > div.choose-detail > li > div.other-hot-city > div > a:nth-child(%s)' %
            sitys[key_sity])))
        city_select.click()
        total_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#s_position_list > div.item_con_pager > div > span:nth-child(5)')))
        job_num = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#tab_pos > span')))
        # BUG FIX: the source had `total_page.textjob_num.text` (lost comma).
        return total_page.text, job_num.text
    except TimeoutException as err:
        # Log the actual exception (the original printed the class object)
        # and retry. NOTE(review): unbounded recursion if the site is down.
        print(err)
        return search()
def get_html():
    """Block until at least one job listing item is present, then return the
    current page source for parsing with pyquery."""
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#s_position_list .item_con_list .con_list_item')))
    html = browser.page_source
    return html
def?next_page():
????counter?=?1
????get_products()
????pattern=re.compile(‘···.*?“pager_not_current“>(.*?)‘?re.S)
????total_page?=?re.findall(pattern?get_html())[0].strip()
????try:
????????f
評論
共有 條評論