資源簡介
簡單的知乎爬蟲:輸入關鍵詞,爬取該關鍵詞下的提問、回答、點贊數等,並以 CSV 格式存儲。
代碼片段和文件信息
#
# import requests
# import csv
# from lxml import etree
# from pyquery import PyQuery as pq
#
# user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
#              'Chrome/66.0.3359.181 Safari/537.36 '
# headers = {'User-Agent': user_agent}
# ques = input('請輸入問題:')
# url = 'https://www.zhihu.com/search?type=content&q=' + str(ques)
# r = requests.get(url, headers=headers)
#
# doc = pq(r.text)
# print(doc)
# # b = doc.find('h2').text()
# #
# # file = open('explore.txt', 'w+', encoding='utf-8')
# # file.write('\n'.join([r.text]))
# # # file.write('\n' + '=' * 50 + '\n')
# # file.close()
#
# items = doc('.Search-container .AnswerItem').items()
# # items = doc('.Search-container').items()
# rows = []
# header = ['問題', '作者', '網址']
# with open('zhihu.csv','w+') as f:
#     f_csv = csv.writer(f)
#     f_csv.writerow(header)
# for item in items:
#     # answer=item.find('meta')
#     # a = answer.attr.content
#     # a = a.text()
#     # print(a)
#     # print(answer)
#
#     question = item.find('h2').text()
#     author = item.find('b').text()
#     a = item.find('meta')
#     answer = a.attr.content
#     print('\n問題', question)
#     print('\n作者', author)
#     print('\n網址', answer)
#
#     content = (question, author, answer)
#     rows.append(content)
# with open('zhihu.csv','a') as f:
#     f_csv = csv.writer(f)
#     f_csv.writerows(rows)
#
#     # with open('explore.txt', 'a', encoding='utf-8') as file:
#     #     file.write('\n'.join([question, author, answer]))
#     #     file.write('\n' + '=' * 50 + '\n')
#
# # items = doc('.list')
# # # print(type(items))
# # # print(items)
# # lis = items.find('li')
# # print(type(lis))
# # print(lis)
# #
# # file = open('explore.txt', 'w+', encoding='utf-8')
# # file.write('\n'.join([r]))
# # # file.write('\n' + '=' * 50 + '\n')
# # file.close()
#
#
# if one_info is not None:
#     for data in one_info:
#         new_url = one_info['url']
#         r = requests.get(new_url, headers=self.headers)
#         # print(r.text)
#         doc = pq(r.text)
#         # print(doc)
#
#         items = doc('.Search-container .AnswerItem').items()
#
#         for item in items:
#             question = item.find('h2').text()
#             author = item.find('b').text()
#             a = item.find('meta')
#             answer = a.attr.content
#             print('\n問題', question)
#             print('\n作者', author)
#             print('\n網址', answer)
# Fetch one Zhihu question-log page with a browser-like User-Agent, print the
# parsed document, and select the `.zm-item` elements inside `.zu-main-content`.
from lxml import etree
import requests
from pyquery import PyQuery as pq

# User-Agent header value sent with the request; the two literals are joined
# by the line continuation into a single string.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
             'Chrome/66.0.3359.181 Safari/537.36 '
headers = {'User-Agent': user_agent}
url = 'http://www.zhihu.com/question/23119500/log'
r = requests.get(url, headers=headers)
# Parse the response body so it can be queried with CSS selectors.
doc = pq(r.text)
print(doc)
# Generator over every `.zm-item` element nested under `.zu-main-content`.
items = doc('.zu-main-content .zm-item').items()
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2020-04-13 11:58  知乎爬蟲\
     文件          30  2019-08-11 16:09  知乎爬蟲\.gitattributes
     文件          62  2019-08-11 16:09  知乎爬蟲\.gitignore
     目錄           0  2020-04-13 11:58  知乎爬蟲\.idea\
     文件         185  2019-12-30 14:38  知乎爬蟲\.idea\encodings.xml
     目錄           0  2020-04-13 11:58  知乎爬蟲\.idea\inspectionProfiles\
     文件         174  2019-12-26 19:41  知乎爬蟲\.idea\inspectionProfiles\profiles_settings.xml
     文件         298  2019-12-30 19:27  知乎爬蟲\.idea\misc.xml
     文件         295  2019-12-26 19:41  知乎爬蟲\.idea\modules.xml
     文件        5179  2020-01-06 17:13  知乎爬蟲\.idea\workspace.xml
     文件         478  2019-12-26 19:41  知乎爬蟲\.idea\zhihu-login-master.iml
     文件        3150  2020-01-05 18:02  知乎爬蟲\HtmlDownloader.py
     文件        1064  2019-08-11 16:09  知乎爬蟲\LICENSE
     目錄           0  2020-04-13 11:58  知乎爬蟲\__pycache__\
     文件        6594  2020-01-05 18:58  知乎爬蟲\__pycache__\zhihu_login.cpython-37.pyc
     文件        3603  2019-12-26 20:46  知乎爬蟲\captcha.jpg
     文件         906  2020-01-05 18:11  知乎爬蟲\cookies.txt
     目錄           0  2020-04-13 11:58  知乎爬蟲\docs\
     文件       51393  2019-08-11 16:09  知乎爬蟲\docs\0.jpg
     文件       61527  2019-08-11 16:09  知乎爬蟲\docs\1.jpg
     文件      366922  2019-08-11 16:09  知乎爬蟲\docs\2.jpg
     文件      192250  2019-08-11 16:09  知乎爬蟲\docs\3.jpg
     文件       12253  2019-08-11 16:09  知乎爬蟲\docs\4.jpg
     文件       13618  2019-08-11 16:09  知乎爬蟲\docs\5.jpg
     文件      173591  2019-08-11 16:09  知乎爬蟲\docs\6.jpg
     文件      232144  2019-08-11 16:09  知乎爬蟲\docs\7.jpg
     文件       46482  2019-08-11 16:09  知乎爬蟲\docs\wx.jpg
     文件       10355  2019-08-11 16:09  知乎爬蟲\encrypt.js
     文件      203327  2019-12-30 21:03  知乎爬蟲\explore.txt
     文件          93  2019-12-30 22:25  知乎爬蟲\requirements.txt
     文件        4878  2020-01-06 18:16  知乎爬蟲\spider.py
............此處省略4個文件信息
評論
共有 條評論