資源簡介
Python爬取情話源碼。

代碼片段和文件信息
import os

# Import the requests library
import requests
# Import the Beautiful Soup library
from bs4 import BeautifulSoup as BS
#定義獲取數據的方法
def Get_data():
    """Download every listing page and hand its HTML to ``parse_text``.

    Pages are requested from ``list_0.html`` through ``list_169.html`` with a
    browser-like set of headers. A page that cannot be fetched (network
    error, timeout, or a non-success HTTP status) is reported and skipped so
    that one bad page does not abort the whole crawl.

    Returns:
        None. Output is produced as a side effect of ``parse_text``.
    """
    # Request headers sent with every page fetch.
    # NOTE(review): the cookie is a hard-coded session captured from a
    # browser; it is presumably stale by now -- confirm whether the site
    # needs it at all.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'cookie': 'acw_tc=6f3e47cc15498749418558688e3d6410ea97be9a552b180841c5079605; PHPSESSID=s2hjdg3slh32omchfsib4himk0; UM_distinctid=168dbbfe8c533e-0e42df11c1c0a7-9393265-100200-168dbbfe8c71088; CNZZDATA1256319371=345851669-1549873534-%7C1549873534; CNZZDATA1257119496=134880590-1549870452-%7C1549870452; Hm_lvt_a48e6ab107a4e68d47e6fdb5d83961e5=1549875015; Hm_lvt_3c8ecbfa472e76b0340d7a701a04197e=1549875021; CNZZDATA1254708131=653924416-1549874976-%7C1549874976; CNZZDATA1275922735=1673340420-1549874690-%7C1549874690; CNZZDATA1257131565=1820590917-1549873827-%7C1549873827; CNZZDATA1257125147=1301871275-1549871347-https%253A%252F%252Fwww.duanwenxue.com%252F%7C1549871347; Hm_lpvt_a48e6ab107a4e68d47e6fdb5d83961e5=1549875906; Hm_lpvt_3c8ecbfa472e76b0340d7a701a04197e=1549875913; ajax_award_timestamp=1549875887; ajax_award_timestamp__ckMd5=706904497e9c8dfd; ajax_award_key=94a17405d22ab8aacbabbbfdb9d4740c; ajax_award_key__ckMd5=3e97bde02ab1998d',
        'referer': 'https://www.duanwenxue.com/huayu/tianyanmiyu/list_69.html',
        'upgrade-insecure-requests': '1',
    }
    # The listing pages differ only in the trailing number of the URL.
    # NOTE(review): the range starts at 0; verify that list_0.html exists.
    for i in range(170):
        url = 'https://www.duanwenxue.com/huayu/tianyanmiyu/list_{}.html'.format(i)
        try:
            # requests never times out by default; bound the wait so a
            # stalled connection cannot hang the crawl.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            print('skipping {}: {}'.format(url, exc))
            continue
        if not response.ok:
            # Do not feed an error page to the parser.
            print('skipping {}: HTTP {}'.format(url, response.status_code))
            continue
        # Extract the quotes from the page and append them to the output file.
        parse_text(response.text)
def parse_text(text):
    """Extract the quote texts from one listing page and append them to disk.

    The page is parsed with the ``lxml`` parser. Inside the first ``div``
    with class ``list-short-article``, every ``a`` tag whose ``target``
    attribute is ``_blank`` contributes its stripped text as one quote.

    Args:
        text: HTML source of a listing page. A falsy value (``None`` or an
            empty string) yields no quotes.

    Returns:
        None. Each quote is appended as one line to ``Love_words.txt``
        (UTF-8); the file is created if it does not exist yet.
    """
    articles = []
    if text:
        soup = BS(text, 'lxml')
        container = soup.find('div', class_='list-short-article')
        # find() returns None when the div is absent (e.g. an error page);
        # treat that as "no quotes" instead of failing on None.find_all.
        if container is not None:
            links = container.find_all('a', {'target': "_blank"})
            articles = [link.text.strip() for link in links]
    # Mode 'a' appends after any existing content and creates the file when
    # it is missing, so successive pages accumulate in one file.
    with open('Love_words.txt', 'a', encoding='utf-8') as f:
        # One quote per line.
        f.writelines(article + '\n' for article in articles)
if __name__ == '__main__':
    # Start each run from an empty output file, because the quotes are
    # written in append mode. On the first run the file does not exist yet,
    # which is fine and must not abort the script.
    try:
        os.remove('Love_words.txt')
    except FileNotFoundError:
        pass
    Get_data()
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件      316294  2019-02-13 14:49  python爬取情話源碼\Love_words.txt
     文件        3016  2019-02-15 10:47  python爬取情話源碼\get_data.py
     目錄           0  2019-02-15 10:35  python爬取情話源碼\
評論
共有 條評論