1. Installation:
pip install scrapy -i https://pypi.tuna.tsinghua.edu.cn/simple/
2. Basic commands
- Create a project:
scrapy startproject <project_name> [project_dir]
- Create a spider:
scrapy genspider <mydomain> mydomain.com
- Interactive shell:
scrapy shell <url>
3. Scrapy project structure:
- scrapy.cfg: project configuration; mainly provides the base settings for the Scrapy command-line tool
- items.py: item definitions (structured data models)
- middlewares.py: middleware, e.g. IP proxies used against anti-scraping measures
- pipelines.py: item processing, e.g. persisting the structured data
- settings.py: project settings
- spiders/: spider directory
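For orientation, the directory that scrapy startproject my_spider generates (the project used in the demo below) looks roughly like this:
my_spider/
├── scrapy.cfg
└── my_spider/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── __init__.py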
4. Script Demo:
- web_url:
https://tieba.baidu.com/f?kw=%BC%C3%D6%DD%B8%AE
- Shell debugging
# Shell debugging
scrapy shell https://tieba.baidu.com/f?kw=%BC%C3%D6%DD%B8%AE
response
response.css(".j_th_tit")
response.css(".j_th_tit").extract()
response.css(".j_th_tit::attr(href)").extract()
- Create the spider (inside a virtualenv)
# Create the spider inside a virtualenv
scrapy startproject my_spider
cd my_spider
scrapy genspider tieba_jzf https://tieba.baidu.com/f?kw=%BC%C3%D6%DD%B8%AE
- Code examples:
- Main entry point: /my_spider/my_spider/main.py
# -*- coding:utf-8 -*-
from scrapy.cmdline import execute
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(['scrapy', 'crawl', 'tieba_jzf'])  # third argument: the spider name (TiebaJzfSpider.name)

# test path
# print(__file__)
# print(os.path.abspath(__file__))
# print(os.path.dirname(os.path.abspath(__file__)))
- Spider: /my_spider/my_spider/spiders/tieba_jzf.py
# -*- coding: utf-8 -*-
from urllib import parse

import scrapy

from my_spider.items import TiebaItem


class TiebaJzfSpider(scrapy.Spider):
    name = 'tieba_jzf'
    # allowed_domains = ['https://tieba.baidu.com/f?kw=%BC%C3%D6%DD%B8%AE']
    # start_urls = ['http://https://tieba.baidu.com/f?kw=%BC%C3%D6%DD%B8%AE/']
    # domain
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/f?kw=%BC%C3%D6%DD%B8%AE']

    def parse(self, response):
        # urls of the posts on the current list page
        url_list = response.css(".j_th_tit::attr(href)").extract()
        # follow each post to its detail page
        for url in url_list:
            print('---- url:', url)
            yield scrapy.Request(url=parse.urljoin(response.url, url), callback=self.parse_detail)
        # next list page (extract_first returns None when there is no next page)
        next_url = response.css(".next.pagination-item::attr(href)").extract_first()
        if next_url is not None:
            yield scrapy.Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    # parse a post's detail page
    def parse_detail(self, response):
        print('---- detail url:', response.url)
        # title: .core_title_txt
        title = response.css(".core_title_txt::text").extract_first()
        if title is not None:
            # replies
            authors = response.css(".p_author_name.j_user_card::text").extract()
            contents = response.css('.d_post_content.j_d_post_content.clearfix').extract()
            texts = response.css('.d_post_content.j_d_post_content.clearfix::text').extract()
            # tail_infos =
            # # j_p_postlist > div:nth-child(5) > div.d_post_content_main >
            # # div.core_reply.j_lzl_wrapper > div.core_reply_tail > ul.p_tail > li:nth-child(2) > span
            print('---- data:')
            print('---- title:', title)
            for i in range(len(authors)):
                tieba_item = TiebaItem()
                tieba_item['title'] = title
                tieba_item['author'] = authors[i]
                tieba_item['content'] = texts[i]
                yield tieba_item
                # print('---- author:{} content:{}'.format(authors[i], texts[i]))
                # if i > 200:
                #     break
        # pagination within the detail page
        detail_next_url_list = response.css(".tP+a::attr(href)").extract()
        if detail_next_url_list:
            yield scrapy.Request(url=parse.urljoin(response.url, detail_next_url_list[0]),
                                 callback=self.parse_detail)
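Besides running main.py from an IDE, the spider can be started directly from the project root on the command line; -o is Scrapy's built-in feed export switch, and the output file name here is just an example:
scrapy crawl tieba_jzf
scrapy crawl tieba_jzf -o tieba.json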
- Settings: /my_spider/my_spider/settings.py
# modified
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'my_spider.pipelines.MySpiderPipeline': 300,
# }
ITEM_PIPELINES = {
    'my_spider.pipelines.MySQLTwistedPipeline': 1,
}

# added
# database connection settings
MYSQL_HOST = "10.2.130.**"
MYSQL_USER = "root"
MYSQL_DB = "baidu_db"
MYSQL_PASSWORD = '******'
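The pipeline below inserts into a tieba table, so that table must already exist in baidu_db. A minimal sketch of the assumed schema (column types and lengths are my assumption, not from the original project):
-- assumed schema for the tieba table; adjust types/lengths as needed
CREATE TABLE tieba (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    author VARCHAR(255),
    content TEXT
) DEFAULT CHARSET=utf8;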
- Item definitions: /my_spider/my_spider/items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class MySpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class TiebaItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = '''
            insert into tieba(title, author, content) values (%s, %s, %s)
        '''
        params = (self['title'], self['author'], self['content'])
        return insert_sql, params
- Item pipeline: /my_spider/my_spider/pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# work around the MySQLdb ImportError on Python 3: register pymysql as MySQLdb
import pymysql
pymysql.install_as_MySQLdb()

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class MySpiderPipeline(object):
    def process_item(self, item, spider):
        return item


# writes items to MySQL asynchronously through a twisted connection pool
class MySQLTwistedPipeline(object):

    def __init__(self, db_pool):
        self.db_pool = db_pool

    @classmethod
    def from_settings(cls, settings):
        # build the connection pool from the database settings in settings.py
        db_params = dict(
            host=settings['MYSQL_HOST'],
            user=settings['MYSQL_USER'],
            db=settings['MYSQL_DB'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True
        )
        db_pool = adbapi.ConnectionPool("MySQLdb", **db_params)
        return cls(db_pool)

    def process_item(self, item, spider):
        # hand the insert to the pool; it runs in a worker thread, off the reactor
        self.db_pool.runInteraction(self.do_insert, item)
        return item

    def do_insert(self, cursor, item):
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
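One caveat with the asynchronous insert: runInteraction returns a Deferred, and database errors are otherwise swallowed silently. A minimal sketch of logging failures (handle_error is a name introduced here, not part of the original pipeline):
    def process_item(self, item, spider):
        query = self.db_pool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # log insert failures
        return item

    def handle_error(self, failure, item, spider):
        # failure is a twisted Failure wrapping the database exception
        print('---- insert error:', failure)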
- IP proxy / anti-scraping: /my_spider/my_spider/middlewares.py
Install: pip install fake-useragent
Docs: https://pypi.org/project/fake-useragent/
# IP proxy and random User-Agent
from fake_useragent import UserAgent


class RandomIpMiddleware(object):
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://182.109.XXX.XXX'
        request.headers.setdefault('User-Agent', self.ua.random)
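The middleware only takes effect after it is registered in settings.py; a sketch, where the priority 543 is an arbitrary choice and the built-in UserAgentMiddleware is disabled so it does not override the random one:
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'my_spider.middlewares.RandomIpMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}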
- App packet capture:
Requests:
Install: pip install requests
Docs: https://pypi.org/project/requests/
import requests
r = requests.get('https://www.baidu.com/', auth=('user', 'pass'))
r.status_code
r.headers['content-type']
r.encoding
r.text
r.json()  # parses the body as JSON (only when the response actually is JSON)
r.content
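When replaying an API captured from an app, the usual pattern is to copy the headers observed in the capture onto a requests session; the endpoint URL and header values below are placeholders, not from an actual capture:
import requests

session = requests.Session()
# headers copied from the packet capture (placeholder values)
session.headers.update({
    'User-Agent': 'okhttp/3.12.0',
    'Content-Type': 'application/json',
})
r = session.get('https://api.example.com/v1/feed', params={'page': 1})
print(r.status_code, r.json())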