資源簡介
使用python3.7+scrapy+mongodb框架爬取新浪微博的用戶信息以及微博動態
實現了高匿ip代理池、偽裝請求User-Agent
教程貼請移步:https://blog.csdn.net/mengyanyuan8023/article/details/94017903

代碼片段和文件信息
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.loader.processors import TakeFirst
class SinaUserItem(scrapy.Item):
    """Scraped profile data for a single Sina Weibo user.

    Every active field carries ``output_processor=TakeFirst()`` as field
    metadata. That metadata only takes effect when the item is populated
    through a Scrapy ``ItemLoader`` (the loader code is not part of this
    snippet).
    """

    # Unique identifier of the Weibo user
    user_id = scrapy.Field(output_processor=TakeFirst())
    # User nickname
    username = scrapy.Field(output_processor=TakeFirst())
    # Number of weibo posts (field name spelling kept as-is: it is the
    # key used by whatever code fills and stores this item)
    webo_num = scrapy.Field(output_processor=TakeFirst())
    # Number of accounts this user follows
    follow_num = scrapy.Field(output_processor=TakeFirst())
    # Number of fans (followers)
    fans_num = scrapy.Field(output_processor=TakeFirst())
    # Gender
    gender = scrapy.Field(output_processor=TakeFirst())
    # District / region
    district = scrapy.Field(output_processor=TakeFirst())
    # Province
    province = scrapy.Field(output_processor=TakeFirst())
    # City
    city = scrapy.Field(output_processor=TakeFirst())
    # Birthday
    birthday = scrapy.Field(output_processor=TakeFirst())
    # Brief self-introduction
    brief_intro = scrapy.Field(output_processor=TakeFirst())
    # Account verification / certification info
    identify = scrapy.Field(output_processor=TakeFirst())
    # Desktop-site URL (disabled)
    # internet_url = scrapy.Field(output_processor=TakeFirst())
    # Mobile-site URL (disabled)
    # mobile_url = scrapy.Field(output_processor=TakeFirst())
    # Avatar image URL
    head_img = scrapy.Field(output_processor=TakeFirst())
    # Tags (disabled)
    # tag = scrapy.Field()
    # Time at which the profile was crawled
    crawl_time = scrapy.Field(output_processor=TakeFirst())
class WeiBoContentItem(scrapy.Item):
    """Scraped data for a single weibo post.

    All fields except ``weibo_images`` carry
    ``output_processor=TakeFirst()`` as field metadata, which only takes
    effect when the item is populated through a Scrapy ``ItemLoader``
    (the loader code is not part of this snippet).
    """

    # ID of the user who owns the post
    user_id = scrapy.Field(output_processor=TakeFirst())
    # ID of the weibo post
    weibo_id = scrapy.Field(output_processor=TakeFirst())
    # Visibility / permission of the post (disabled)
    # authority = scrapy.Field(output_processor=TakeFirst())
    # Text content of the post
    weibo_content = scrapy.Field(output_processor=TakeFirst())
    # Images attached to the post. Declared without an output processor;
    # presumably so every collected image value is kept - TODO confirm
    # against the spider/loader.
    weibo_images = scrapy.Field()
    # Local path where the post's images are saved (disabled)
    # images_path = scrapy.Field()
    # Post type (original / repost)
    weibo_type = scrapy.Field(output_processor=TakeFirst())
    # Publication time
    post_time = scrapy.Field(output_processor=TakeFirst())
    # Number of likes
    like_count = scrapy.Field(output_processor=TakeFirst())
    # Number of comments
    comment_count = scrapy.Field(output_processor=TakeFirst())
    # Number of reposts
    retweet_count = scrapy.Field(output_processor=TakeFirst())
    # Client / device the post was published from
    terminal = scrapy.Field(output_processor=TakeFirst())
if __name__ == '__main__':
    # Manual smoke check, run only when this module is executed directly:
    # build an empty item and print the lookup of the never-assigned
    # ``weibo_images`` field (expected to print None).
    item = WeiBoContentItem()
    print(item.get('weibo_images'))
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件         90  2019-06-28 16:24  sina_crawl\.git\COMMIT_EDITMSG
     文件        311  2019-06-19 18:11  sina_crawl\.git\config
     文件         73  2019-06-13 16:42  sina_crawl\.git\desc
     文件        102  2019-06-18 15:32  sina_crawl\.git\FETCH_HEAD
     文件         25  2019-06-19 18:09  sina_crawl\.git\HEAD
     文件        478  2019-06-13 16:42  sina_crawl\.git\hooks\applypatch-msg.sample
     文件        896  2019-06-13 16:42  sina_crawl\.git\hooks\commit-msg.sample
     文件       3327  2019-06-13 16:42  sina_crawl\.git\hooks\fsmonitor-watchman.sample
     文件        189  2019-06-13 16:42  sina_crawl\.git\hooks\post-update.sample
     文件        424  2019-06-13 16:42  sina_crawl\.git\hooks\pre-applypatch.sample
     文件       1638  2019-06-13 16:42  sina_crawl\.git\hooks\pre-commit.sample
     文件       1348  2019-06-13 16:42  sina_crawl\.git\hooks\pre-push.sample
     文件       4898  2019-06-13 16:42  sina_crawl\.git\hooks\pre-reba
     文件        544  2019-06-13 16:42  sina_crawl\.git\hooks\pre-receive.sample
     文件       1492  2019-06-13 16:42  sina_crawl\.git\hooks\prepare-commit-msg.sample
     文件       3610  2019-06-13 16:42  sina_crawl\.git\hooks\update.sample
     文件       3052  2019-06-28 16:24  sina_crawl\.git\index
     文件        240  2019-06-13 16:42  sina_crawl\.git\info\exclude
     文件       3602  2019-06-28 16:24  sina_crawl\.git\logs\HEAD
     文件       2383  2019-06-18 17:51  sina_crawl\.git\logs\refs\heads\master
     文件       1056  2019-06-28 16:24  sina_crawl\.git\logs\refs\heads\V1.0.619
     文件       2186  2019-06-18 17:51  sina_crawl\.git\logs\refs\remotes\origin\master
     文件        564  2019-06-28 16:24  sina_crawl\.git\logs\refs\remotes\origin\V1.0.619
     文件       1353  2019-06-18 17:51  sina_crawl\.git\ob
     文件       4208  2019-06-19 18:11  sina_crawl\.git\ob
     文件       4133  2019-06-18 15:36  sina_crawl\.git\ob
     文件       4091  2019-06-28 16:24  sina_crawl\.git\ob
     文件       1943  2019-06-28 16:22  sina_crawl\.git\ob
     文件        228  2019-06-14 17:52  sina_crawl\.git\ob
     文件       1931  2019-06-28 16:22  sina_crawl\.git\ob
............此處省略464個文件信息
評論
共有 條評論