資源簡介
利用 Python 編寫的爬蟲代碼,用於爬取房天下的商品房信息;更改鏈接地址後,也可以爬取其他信息。
代碼片段和文件信息
#!/usr/bin/python
#-*-coding:utf-8-*-
#coding:gbk
from lxml import etree
import requests
import re
import numpy as np
import json
import sys
# Python 2 only: make UTF-8 the interpreter-wide default codec used for
# implicit str <-> unicode conversions (the default there is ASCII).
# sys.setdefaultencoding is removed from the sys namespace at interpreter
# startup, so the module has to be reloaded to get the function back.
# Python 3 has no reload() builtin and already reports 'utf-8' as its default
# encoding, so the hack is skipped there instead of raising NameError.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')
def?fangtianxia(url):
????head={‘User-Agent‘:‘Mozilla/5.0?(Windows?NT?6.1;?WOW64)?AppleWebKit/537.36?(KHTML?like?Gecko)?Chrome/45.0.2454.101?Safari/537.36‘}
????html=requests.get(urlheaders=head).content.decode(‘gbk‘)
????selector=etree.HTML(html)
????content_field=selector.xpath(‘//div[@class=“nl_con?clearfix“]/ul‘)[0]
????urlurl_lpurl_hx=[][][]
????file=open(‘fangtianxia.txt‘‘a‘)
????for?each?in?content_field.xpath(‘li‘):
????????website=each.xpath(‘div[1]/div[2]/div[1]/div[1]/a‘)[0].xpath(‘@href‘)[0]
????????url.append(website)
????????loupan=each.xpath(‘div[1]/div[2]/div[1]/div[1]/a/text()‘)[0].strip()
????????try:
????????????region=each.xpath(‘div[1]/div[2]/div[3]/div[1]/a/span/text()‘)[0].replace(“]“““).replace(“[“““).strip()
????????except?Exceptione:
????????????print?e
????????????region=np.nan
????????try:
????????????address?=?each.xpath(‘div[1]/div[2]/div[3]/div[1]/a‘)[0].xpath(‘@title‘)[0]
????????except?Exceptione:
????????????address=np.nan
????????try:
????????????price=each.xpath(‘div[1]/div[2]/div[5]/span/text()‘)[0]+each.xpath(‘div[1]/div[2]/div[5]/em/text()‘)[0]
????????except?Exceptione:
????????????print?e
????????????price=np.nan
????????try:
????????????phone=each.xpath(‘div[1]/div[2]/div[3]/div[2]/p/text()[1]‘)[0]+“轉“?+?each.xpath(‘div[1]/div[2]/div[3]/div[2]/p/text()[2]‘)[0]
????????except?Exception?e:
????????????print?e
????????????phone?=?np.nan
????????print?we
- 上一篇:python爬取雅虎財經股票交易數據
- 下一篇:python實現圖片個性化文字編輯
評論
共有 條評論