資源簡介
通過 requests 抓包方式爬取拉勾網深圳市的數據分析崗位信息,並利用 pandas、pyecharts、jieba、WordCloud 等工具,從多維度進行崗位數據的可視化分析。
代碼片段和文件信息
#?coding:?utf-8
#?In[3]:
#爬蟲部分
import?requests
#?import?pandas?as?pd
import?json
import?time
#數據可視化部分
import?pandas?as?pd
from?collections?import?Counter
from?pyecharts?import?PieBarGridMapFunnelRadar
#詞頻詞云的模塊導入
import?jieba
import?codecs
from?imageio?import?imread
import?os
from?os?import?path
import?matplotlib.pyplot?as?plt
from?PIL?import?Image?ImageDraw?ImageFont
from?wordcloud?import?WordCloud?ImageColorGenerator
#數據分析崗位,拉勾網總共的職位是22頁
def?data_worm_save():
????position_info_all?=?[]
????for?page_num?in?range(124):
????????url?=?“https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false“
????????#my_header?=?{‘User-Agent‘:‘Mozilla/5.0?(Windows?NT?10.0;?WOW64)?AppleWebKit/537.36?(KHTML?like?Gecko)?Chrome/58.0.3029.110?Safari/537.36?SE?2.X?metaSr?1.0‘}
????????#上述偽裝的不夠徹底
????????my_header?=?{
????????????‘Host‘:‘www.lagou.com‘
????????????‘Origin‘:‘https://www.lagou.com‘
????????????‘Referer‘:‘https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city=%E6%B7%B1%E5%9C%B3‘
????????????‘User-Agent‘:‘Mozilla/5.0?(Windows?NT?10.0;?WOW64)?AppleWebKit/537.36?(KHTML?like?Gecko)?Chrome/58.0.3029.110?Safari/537.36?SE?2.X?metaSr?1.0‘
????????????‘X-Anit-Forge-Code‘:‘0‘
????????????‘X-Anit-Forge-Token‘:‘None‘
????????????‘X-Requested-With‘:‘xmlHttpRequest‘}
????????#page_num用于修改頁碼,抓取全部頁碼的信息
????????my_data?=?{‘first‘:‘true‘‘pn‘:page_num‘kd‘:‘數據分析‘}
????????#注意查看post請求還是get請求
????????response?=?requests.post(urlheaders?=?my_headerdata?=my_data??)
????????dict_all?=?json.loads(response.text)
????????dict_position_results?=?dict_all[“content“][“positionResult“][“result“]
????????for?position_item?in?dict_position_results:
????????????position_info_single?=?[]
????????????position_info_single.append(position_item[“companyFullName“])
????????????position_info_single.append(position_item[“companyShortName“])
????????????position_info_single.append(position_item[“companySize“])
????????????position_info_single.append(position_item[“financeStage“])
????????????position_info_single.append(position_item[“district“])
????????????position_info_single.append(position_item[“positionName“])
????????????position_info_single.append(position_item[“workYear“])
????????????position_info_single.append(position_item[“education“])
????????????position_info_single.append(position_item[“salary“])
????????????position_info_single.append(position_item[“jobNature“])
????????????position_info_single.append(position_item[“positionAdvantage“])
????????????position_info_single.append(position_item[“createTime“])
????????????position_info_all.append(position_info_single)
????????print(“第?“+str(page_num)+“?頁數據爬取完畢“)
????????time.sleep(20)
????????#print(position_info_all)?
????df?=?pd.Dataframe(data?=?position_info_allcolumns?=?[‘公司全名‘‘公司簡稱‘‘公司規模‘‘融資階段‘‘區域‘‘職位名稱‘‘工作經驗‘‘學歷要求‘‘工資‘‘工作形式‘‘職位福利‘‘發布時間‘])???
????df.to_csv(r‘C:\Users\yong_\Desktop\La_Gou\data_source\lagou_jobs_page_all.
評論
共有 條評論