-
大小: 5.28MB 文件類型: .zip 金幣: 2 下載: 0 次 發布日期: 2023-10-04
- 語言: Python
- 標籤:
資源簡介
網站圖片爬蟲(已包含:微博,微信公眾號,花瓣網)及免費IP代理 豆瓣電影爬蟲

代碼片段和文件信息
#encoding:utf-8
import json
import os
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree
from scrapy.http import HtmlResponse
from scrapy.selector import Selector
# Python 2-only idiom: reload(sys) followed by sys.setdefaultencoding sets the
# interpreter's default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding(“utf-8“)
# Patterns applied to the extracted markup of the page's info block to pull out
# the languages, countries, alternate names and release dates.
# NOTE(review): each literal below is split across two lines, which is not
# valid inside a single-quoted Python string. Presumably an escape sequence or
# HTML markup between the label and the closing quote was lost when this
# listing was extracted, and the label text may have been script-converted.
# Confirm every pattern against the upstream file before relying on it.
LANGUAGES_RE?=?re.compile(ur“語言:?(.+?)
“)
COUNTRIES_RE?=?re.compile(ur“制片國家/地區(qū):?(.+?)
“)
ALTERNATE_NAME_RE?=?re.compile(ur“又名:?(.+?)
“)
RELEASE_TIME_RE?=?re.compile(ur“上映日期:?(.+?)
“)
# Matches one run of decimal digits.
NUM_RE = re.compile(r"(\d+)")

# Name of the text file the crawl results are saved to.
data_save_file = "douban_donghua_results.txt"

# Browser-like request headers sent with every HTTP request in this module.
# The separators inside the Accept-Encoding, Accept-Language and User-Agent
# values were restored from a listing that had its commas stripped.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection': 'keep-alive',
    'Host': 'movie.douban.com',
    'Referer': 'https://movie.douban.com/explore',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    # NOTE(review): the conventional value is 'XMLHttpRequest'; the casing
    # here is kept exactly as found -- confirm against the upstream file.
    'X-Requested-With': 'xmlHttpRequest',
}
def get_item_list(d_url, d_type, d_tag, d_sort, d_page_limit, d_page_start):
    """Fetch one page of a subject listing and return its ``subjects`` array.

    Sends a GET request to ``d_url`` with the module-level ``headers`` and a
    10 second timeout, then decodes the response body as JSON.

    Args:
        d_url: Endpoint to query.
        d_type: Value sent as the ``type`` query parameter.
        d_tag: Value sent as the ``tag`` query parameter.
        d_sort: Value sent as the ``sort`` query parameter; omitted from the
            request when it is the empty string.
        d_page_limit: Value sent as the ``page_limit`` query parameter.
        d_page_start: Value sent as the ``page_start`` query parameter.

    Returns:
        The value stored under the ``"subjects"`` key of the decoded JSON.

    Raises:
        KeyError: If the decoded JSON has no ``"subjects"`` key.
    """
    params = {"type": d_type, "tag": d_tag}
    if d_sort != "":
        params["sort"] = d_sort
    # NOTE(review): indentation was lost in the listing this was restored
    # from; the paging parameters are assumed to be set unconditionally.
    params["page_limit"] = d_page_limit
    params["page_start"] = d_page_start
    response = requests.get(d_url, headers=headers, params=params, timeout=10)
    json_obj = response.json()
    return json_obj["subjects"]
def get_item_list_from_newsearch(d_url, d_sort, d_range, d_tag, d_page_start):
    """Fetch one page of search results and return its ``data`` array.

    Sends a GET request to ``d_url`` with the module-level ``headers`` and a
    10 second timeout, then decodes the response body as JSON.

    Args:
        d_url: Endpoint to query.
        d_sort: Value sent as the ``sort`` query parameter.
        d_range: Value sent as the ``range`` query parameter.
        d_tag: Value sent as the ``tags`` query parameter.
        d_page_start: Value sent as the ``start`` query parameter.

    Returns:
        The value stored under the ``"data"`` key of the decoded JSON.

    Raises:
        KeyError: If the decoded JSON has no ``"data"`` key.
    """
    params = {
        "sort": d_sort,
        "tags": d_tag,
        "range": d_range,
        "start": d_page_start,
    }
    response = requests.get(d_url, headers=headers, params=params, timeout=10)
    json_obj = response.json()
    return json_obj["data"]
def?get_item_detail(item_detail_url):
result_obj?=?{}
result_obj[“subject_id“]?=?int(item_detail_url.split(“/“)[-2])
celebrities_url?=?“https://movie.douban.com/subject/“+str(result_obj[“subject_id“])+“/celebrities“
(directors_cn_namesdirectors_en_namesactors_cn_namesactors_en_names)=get_directors_and_actors(celebrities_url)
result_obj[“directors_cn_names“]?=?directors_cn_names
result_obj[“directors_en_names“]?=?directors_en_names
result_obj[“actors_cn_names“]?=?actors_cn_names
result_obj[“actors_en_names“]?=?actors_en_names
response?=?requests.get(item_detail_urlheaders?=?headerstimeout?=?10)
selector?=?etree.HTML(response.text)
s_response?=?HtmlResponse(url=item_detail_urlbody?=?response.textencoding=‘utf-8‘)
name?=?s_response.selector.xpath(“//title/text()“).extract()
if?name:?result_obj[“movie_name“]?=?name[0].replace(u“?(豆瓣)“?““).strip()
genres?=?s_response.selector.xpath(“//span[@property=‘v:genre‘]/text()“).extract()
if?genres:?result_obj[“genres“]?=?genres
S?=?““.join(s_response.selector.xpath(“//div[@id=‘info‘]“).extract())
M?=?COUNTRIES_RE.search(S)
if?M?is?not?None:
result_obj[“countries“]?=?[country.strip()?for?country?in?M.group(1).split(“/“)]
L?=?LANGUAGES_RE.search(S)
if?L?is?not?None:
result_obj[“l(fā)anguages“]?=?[?lang.st
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2017-10-09 07:55  image_crawler-master\
     目錄           0  2017-10-09 07:55  image_crawler-master\DoubanMovie\
     文件        6307  2017-10-09 07:55  image_crawler-master\DoubanMovie\movie_crawler.py
     文件        1850  2017-10-09 07:55  image_crawler-master\DoubanMovie\write_to_mysql.py
     目錄           0  2017-10-09 07:55  image_crawler-master\Huaban\
     文件        4257  2017-10-09 07:55  image_crawler-master\Huaban\explain.md
     文件        4719  2017-10-09 07:55  image_crawler-master\Huaban\huaban_crawler.py
     文件     1437852  2017-10-09 07:55  image_crawler-master\Huaban\huaban_travel_places_result.txt
     目錄           0  2017-10-09 07:55  image_crawler-master\IpProxy\
     目錄           0  2017-10-09 07:55  image_crawler-master\IpProxy\Ip181FreeProxy\
     文件        1086  2017-10-09 07:55  image_crawler-master\IpProxy\Ip181FreeProxy\get_ip181.py
     目錄           0  2017-10-09 07:55  image_crawler-master\IpProxy\KuaiFreeProxy\
     文件        1088  2017-10-09 07:55  image_crawler-master\IpProxy\KuaiFreeProxy\get_kuaifreeproxy.py
     目錄           0  2017-10-09 07:55  image_crawler-master\IpProxy\XunFreeProxy\
     文件        1155  2017-10-09 07:55  image_crawler-master\IpProxy\XunFreeProxy\get_xunfreeproxy.py
     文件         714  2017-10-09 07:55  image_crawler-master\README.md
     目錄           0  2017-10-09 07:55  image_crawler-master\SinaWeibo\
     文件    10883820  2017-10-09 07:55  image_crawler-master\SinaWeibo\chromedriver
     文件       30151  2017-10-09 07:55  image_crawler-master\SinaWeibo\image_result.md
     文件        8873  2017-10-09 07:55  image_crawler-master\SinaWeibo\weibo_crawler.py
     文件        5080  2017-10-09 07:55  image_crawler-master\SinaWeibo\weibo_hot_topic_crawler.py
     目錄           0  2017-10-09 07:55  image_crawler-master\WechatOfficialAccounts\
     文件        2333  2017-10-09 07:55  image_crawler-master\WechatOfficialAccounts\spider_wechat_official_accounts.py
評論
共有 條評論