91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

資源簡介

搜狗的細胞詞庫是一個開放共享的詞庫,本程序是使用python獲取搜狗詞庫官網上的所有下載鏈接,下載詞庫并進行分類保存。需要注意的是,下載過來的文件格式為:.scel,如果要轉換為.txt,歡迎查看我的另一個下載資源。歡迎下載交流!

資源截圖

代碼片段和文件信息

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
#
# @Version : 1.0
# @Time    : 2018/9/10
# @Author  : 圈圈烴
# @File    : Sougou_Spider
# @Description: 搜狗詞庫爬蟲
#
#
import os
import re
from urllib.parse import unquote, urlparse

import requests
from bs4 import BeautifulSoup


class?SougouSpider:

????headers?=?{
????????“User-Agent“:?“Mozilla/5.0?(Windows?NT?10.0;?WOW64;?rv:60.0)?Gecko/20100101?Firefox/60.0“
????????“Accept“:?“text/htmlapplication/xhtml+xmlapplication/xml;q=0.9*/*;q=0.8“
????????“Accept-Language“:?“zh-CNzh;q=0.8zh-TW;q=0.7zh-HK;q=0.5en-US;q=0.3en;q=0.2“
????????“Accept-Encoding“:?“gzip?deflate“
????????“Connection“:?“keep-alive“
????}
????res?=?‘‘

????def?__init__(self?url):
????????self.url?=?url

????def?get_html(self?open_proxy=False?ip_proxies=None):
????????try:
????????????pattern?=?re.compile(r‘//(.*?)/‘)
????????????host_url?=?pattern.findall(self.url)[0]
????????????SougouSpider.headers[“Host“]?=?host_url
????????????if?open_proxy:??#?判斷是否開啟代理
????????????????proxies?=?{“http“:?“http://“?+?ip_proxies?}??#?設(shè)置代理,例如{“http“:?“http://103.109.58.242:8080“?}
????????????????SougouSpider.res?=?requests.get(self.url?headers=SougouSpider.headers?proxies=proxies?timeout=5)
????????????else:
????????????????SougouSpider.res?=?requests.get(self.url?headers=SougouSpider.headers?timeout=5)
????????????SougouSpider.res.encoding?=?SougouSpider.res.apparent_encoding??#?自動(dòng)確定html編碼
????????????print(“Html頁面獲取成功?“?+?self.url)
????????????return?SougouSpider.res??????#?只返回頁面的源碼
????????except?Exception?as?e:
????????????print(“Html頁面獲取失敗?“?+?self.url)
????????????print(e)

????def?get_cate_1_list(self):
????????#?獲取大分類鏈接
????????dict_cate_1_urls?=?[]
????????soup?=?BeautifulSoup(SougouSpider.res.text?“html.parser“)
????????dict_nav?=?soup.find(“div“?id=“dict_nav_list“)
????????dict_nav_lists?=?dict_nav.find_all(“a“)
????????for?dict_nav_list?in?dict_nav_lists:
????????????dict_nav_url?=?“https://pinyin.sogou.com“?+?dict_nav_list[‘href‘]
????????????dict_cate_1_urls.append(dict_nav_url)
????????return?dict_cate_1_urls

????def?get_cate_2_1_list(self):
????????#?獲取第一種小分類鏈接
????????dict_cate_2_1_dict?=?{}
????????soup?=?BeautifulSoup(SougouSpider.res.text?“html.parser“)
????????dict_td_lists?=?soup.find_all(“div“?class_=“cate_no_child?citylistcate?no_select“)
????????for?dict_td_list?in?dict_td_lists:
????????????dict_td_url?=?“https://pinyin.sogou.com“?+?dict_td_list.a[‘href‘]
????????????dict_cate_2_1_dict[dict_td_list.get_text().replace(“\n“?““)]?=?dict_td_url
????????return?dict_cate_2_1_dict

????def?get_cate_2_2_list(self):
????????#?獲取第二種小分類鏈接
????????dict_cate_2_2_dict?=?{}
????????soup?=?BeautifulSoup(SougouSpider.res.text?“html.parser“)
????????dict_td_lists?=?soup.find_all(“div“?class_=“cate_no_child?no_select“)
????????#?類型1解析
????????for?dict_td_list?in?dict_td_lists:
????????????dict_td_url?=?“https://pinyin.sogou.com“?+?dict_td_list.a

評論

共有 條評論