Install:
pip install scrapy
Create a project:
scrapy startproject qiubai
Create a new spider file inside the project:
scrapy genspider example example.com
# For example:
scrapy genspider quibaiDemo qiushibaike.com
Then change the following two settings in settings.py:
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
Write the following in the parse method of quibaiDemo:
def parse(self, response):
    div_list = response.xpath('//div[@id="content-left"]/div')
    all_data = []
    for div in div_list:
        # title = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()')[0].extract()
        title = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
        # // *[ @ id = "qiushi_tag_121884912"] / a
        content = div.xpath('./a[1]/div/span/text()').extract_first()
        dic = {
            'title': title,
            'content': content
        }
        all_data.append(dic)
    return all_data
Run the crawl command in the terminal:
# For example:
scrapy crawl quibaiDemo
# Suppress log output
scrapy crawl quibaiDemo --nolog
# Export to a JSON file
scrapy crawl quibaiDemo -o qiubai.json
# Export to a CSV file
scrapy crawl quibaiDemo -o qiubai.csv
Next, use items.py and pipelines.py to scrape the BOSS Zhipin job site.
First edit items.py:
class BossproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    salary = scrapy.Field()
    company = scrapy.Field()
Then edit pipelines.py:
class BossproPipeline(object):
    fp = None

    # Executed only once, when the spider starts
    def open_spider(self, spider):
        print('Spider started!')
        self.fp = open('./job.txt', 'w', encoding='utf-8')

    # Called once each time the spider submits an item
    def process_item(self, item, spider):
        self.fp.write(item['title'] + "\t" + item['salary'] + '\t' + item['company'] + '\n')
        return item

    def close_spider(self, spider):
        print('Spider finished!')
        self.fp.close()
In boss.py, write the parse method (the item class is imported at the top of the file: from bossPro.items import BossproItem):
def parse(self, response):
    li_list = response.xpath('//div[@class="job-list"]/ul/li')
    for li in li_list:
        title = li.xpath('.//div[@class="info-primary"]/h3[@class="name"]/a/div/text()').extract_first()
        salary = li.xpath('.//div[@class="info-primary"]/h3[@class="name"]/a/span/text()').extract_first()
        company = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()
        # Instantiate an item object
        item = BossproItem()
        # Store the parsed values in the item object
        item['title'] = title
        item['salary'] = salary
        item['company'] = company
        # Submit the item to the pipeline for persistence
        yield item
Remember to enable the following in settings.py, otherwise the pipeline has no effect.
The number 300 is the priority; the smaller the number, the higher the priority.
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'bossPro.pipelines.BossproPipeline': 300,  # 300 is the priority; smaller numbers mean higher priority
}
A few commonly used settings:
# Log level
LOG_LEVEL = 'ERROR'
# Disable retries
RETRY_ENABLED = False
# Download timeout (seconds)
DOWNLOAD_TIMEOUT = 3
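A few other settings that are often tuned alongside these (the values below are only illustrative):
# Limit concurrency and add a small delay to be gentler on the target site
CONCURRENT_REQUESTS = 8
DOWNLOAD_DELAY = 0.5
# Disable cookies unless the site needs them
COOKIES_ENABLED = False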
You can also override some settings per spider by defining custom_settings in the spider class:
custom_settings = {
    "COOKIES_ENABLED": True,
    "RETRY_ENABLED": False,
    # continue adding settings here ...
}
A spider that inherits from CrawlSpider can follow pagination links automatically (see the sketch after the command):
scrapy genspider -t crawl crawlDemo www.xxx.com
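A minimal sketch of what such a crawl spider looks like (the URL pattern and callback below are placeholders, not taken from a real project):
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CrawldemoSpider(CrawlSpider):
    name = 'crawlDemo'
    start_urls = ['http://www.xxx.com/']

    # Every link matching the regex is extracted, requested automatically and handed to parse_item
    rules = (
        Rule(LinkExtractor(allow=r'/page/\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # parse one page of results here
        pass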
By default, start_requests sends a GET request for every URL in start_urls.
Override it as follows to send POST requests instead:
def start_requests(self):
    data = {
        'kw': 'dog'
    }
    for url in self.start_urls:
        yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)
A downloader middleware that is often configured (random proxy and random User-Agent per request):
import random


class ProxyproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    # Intercept requests: the request argument of process_request is the intercepted request
    proxy_http = ['http://209.34.29.9:8181', 'http://209.34.29.9:8181', 'http://209.34.29.9:8181']
    proxy_https = ['https://119.59.84.58:8080', 'https://119.59.84.58:8080', 'https://119.59.84.58:8080']
    user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
    def process_request(self, request, spider):
        print('Downloader middleware:', request)
        # Attach a random proxy matching the request scheme
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = random.choice(self.proxy_http)
        else:
            request.meta['proxy'] = random.choice(self.proxy_https)
        # Attach a random User-Agent
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        print(request.headers['User-Agent'])
        return None
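To enable it, register the middleware in settings.py; the module path below assumes the project is named proxyPro (an assumption based on the class name):
DOWNLOADER_MIDDLEWARES = {
    'proxyPro.middlewares.ProxyproDownloaderMiddleware': 543,  # assumed module path; adjust to your project
}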
Distributed crawling with scrapy-redis
Install:
pip install scrapy-redis
As an example, we crawl chouti (dig.chouti.com):
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from choutiPro.items import ChoutiproItem


class ChoutiSpider(RedisCrawlSpider):
    name = 'chouti'
    # allowed_domains = ['www.chouti.com']
    # start_urls = ['http://www.chouti.com/']
    redis_key = 'chouti'

    rules = (
        Rule(LinkExtractor(allow=r'/r/scoff/hot/\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        div_list = response.xpath('//div[@id="content-list"]/div')
        for div in div_list:
            item = ChoutiproItem()
            item['title'] = div.xpath('.//div[@class="part1"]/a/text()').extract_first()
            item['author'] = div.xpath('.//div[@class="part2"]/a[4]/b/text()').extract_first()
            yield item
Set the following in settings.py:
# Use a Redis-backed dupefilter: request fingerprints are stored in a Redis set, so deduplication persists across runs
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler provided by scrapy-redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the scheduler persists: if True, the request queue and fingerprint set in Redis are kept when the crawl ends; if False, they are cleared
SCHEDULER_PERSIST = True
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
Connect to Redis (e.g. with redis-cli) and push the start URL:
lpush chouti https://dig.chouti.com/r/scoff/hot/1
You can also fetch a page straight from the terminal and analyze it there with the Scrapy shell:
scrapy shell https://dig.chouti.com/r/scoff/hot/1
# Add a User-Agent to the request:
scrapy shell -s User-Agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36" https://dig.chouti.com/r/scoff/hot/1
Extract only the first match:
response.css(" .part1 a::text").extract_first()
Extract all matches when there are several:
response.css(" .part1 a::text").extract()
Extract an attribute value:
response.css(" .part2::attr(share-title)").extract_first()
The User-Agent is a request header, a string that identifies the client, a bit like a browser's ID card.
Rotating it frequently while crawling helps avoid triggering anti-crawling mechanisms.
fake-useragent GitHub page
Link for the fake-useragent 0.1.11 release
# This one has more GitHub stars.
pip install fake-useragent
# Another user-agent package
pip install scrapy-fake-useragent
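A minimal sketch of using fake-useragent on its own (UserAgent and its random/chrome attributes are part of the library's public API):
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)  # a random User-Agent string
print(ua.chrome)  # a random Chrome User-Agent string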
Pick one random proxy IP from the database:
SELECT ip, port FROM proxy_ip ORDER BY RAND() LIMIT 1
scrapy-proxies processes Scrapy requests through random proxies from a list, to avoid IP bans and improve crawl speed. scrapy-proxies GitHub page
pip install scrapy_proxies
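Its settings (as described in the project's README; verify the names against the version you install) look roughly like this:
# Retry more often, since free proxies fail frequently
RETRY_TIMES = 10
RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy_proxies.RandomProxy': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}

# A text file with one proxy per line
PROXY_LIST = '/path/to/proxy/list.txt'
# 0 = pick a random proxy for every request
PROXY_MODE = 0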
scrapy-crawlera makes it easy to use Crawlera with Scrapy.
Documentation is available online and in the docs directory of the repository.
pip install scrapy-crawlera
To keep Chrome from opening a visible window (a virtual display, typically on a Linux server):
pip install pyvirtualdisplay
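A minimal sketch of wrapping a Selenium session in a virtual display (requires Xvfb to be installed; visible=0 hides the display):
from pyvirtualdisplay import Display
from selenium import webdriver

display = Display(visible=0, size=(800, 600))
display.start()

browser = webdriver.Chrome()
browser.get('http://blog.jobbole.com/all-posts/')
browser.quit()

display.stop()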
Advantages of distributed crawling:
1. Make full use of the bandwidth of multiple machines to speed up crawling
2. Make full use of the IPs of multiple machines to speed up crawling
Deploy the finished Scrapy project with scrapyd (GitHub)
scrapyd official documentation
Install:
pip install scrapyd
pip install scrapyd-client
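A rough outline of deploying and scheduling a spider (the target name local and the reuse of the qiubai project from above are only illustrative):
# scrapy.cfg in the project root
[deploy:local]
url = http://localhost:6800/
project = qiubai

# start the scrapyd server, deploy the project, then schedule a spider
scrapyd
scrapyd-deploy local -p qiubai
curl http://localhost:6800/schedule.json -d project=qiubai -d spider=quibaiDemo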
A custom User-Agent middleware based on fake-useragent; it has to be registered in settings.py (shown after the class):
from fake_useragent import UserAgent


class RandomUserAgentMiddlware(object):
    # Switch to a random user-agent for every request
    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            # e.g. ua.random, ua.chrome, ... depending on RANDOM_UA_TYPE
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())
Give it a higher priority number than the defaults in the configuration and set the built-in UserAgentMiddleware to None, so the default user-agent is overridden:
DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.JSPageMiddleware': 1,
    'ArticleSpider.middlewares.RandomUserAgentMiddlware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
First, crawl free proxy IPs from Xici (xicidaili.com):
import requests
from scrapy.selector import Selector
import MySQLdb

conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="root", db="article_spider", charset="utf8")
cursor = conn.cursor()


def crawl_ips():
    # Crawl the free proxy list from xicidaili
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    for i in range(1568):
        re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)
        selector = Selector(text=re.text)
        all_trs = selector.css("#ip_list tr")

        ip_list = []
        for tr in all_trs[1:]:
            speed = 0.0  # default in case the speed cell is missing
            speed_str = tr.css(".bar::attr(title)").extract()[0]
            if speed_str:
                speed = float(speed_str.split("秒")[0])
            all_texts = tr.css("td::text").extract()
            ip = all_texts[0]
            port = all_texts[1]
            proxy_type = all_texts[5]
            ip_list.append((ip, port, proxy_type, speed))

        for ip_info in ip_list:
            cursor.execute(
                "insert proxy_ip(ip, port, speed, proxy_type) VALUES('{0}', '{1}', {2}, 'HTTP')".format(
                    ip_info[0], ip_info[1], ip_info[3]
                )
            )
            conn.commit()
class GetIP(object):
    def delete_ip(self, ip):
        # Delete an invalid ip from the database
        delete_sql = """
            delete from proxy_ip where ip='{0}'
        """.format(ip)
        cursor.execute(delete_sql)
        conn.commit()
        return True

    def judge_ip(self, ip, port):
        # Check whether the ip is usable
        http_url = "http://www.baidu.com"
        proxy_url = "http://{0}:{1}".format(ip, port)
        try:
            proxy_dict = {
                "http": proxy_url,
            }
            response = requests.get(http_url, proxies=proxy_dict)
        except Exception as e:
            print("invalid ip and port")
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if code >= 200 and code < 300:
                print("effective ip")
                return True
            else:
                print("invalid ip and port")
                self.delete_ip(ip)
                return False

    def get_random_ip(self):
        # Fetch one random usable ip from the database
        random_sql = """
            SELECT ip, port FROM proxy_ip
            ORDER BY RAND()
            LIMIT 1
        """
        result = cursor.execute(random_sql)
        for ip_info in cursor.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            judge_re = self.judge_ip(ip, port)
            if judge_re:
                return "http://{0}:{1}".format(ip, port)
            else:
                return self.get_random_ip()
A middleware that sets a dynamic proxy IP for each request, RandomProxyMiddleware:
class RandomProxyMiddleware(object):
    # Set a proxy ip dynamically for each request (GetIP is the helper class defined above)
    def process_request(self, request, spider):
        get_ip = GetIP()
        request.meta["proxy"] = get_ip.get_random_ip()
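Like the other middlewares, it only takes effect after being registered in DOWNLOADER_MIDDLEWARES; the module path below assumes it lives in ArticleSpider/middlewares.py next to the middlewares shown earlier:
DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.RandomProxyMiddleware': 544,  # priority value is only illustrative
}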
Integrating Selenium with Scrapy
First, initialize Selenium in the spider file:
import scrapy
from selenium import webdriver
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals


class JobboleSpider(scrapy.Spider):
    name = "jobbole"
    allowed_domains = ["blog.jobbole.com"]
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def __init__(self):
        self.browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe")
        super(JobboleSpider, self).__init__()
        # Use Scrapy signals: when the spider-closed signal fires, shut down Selenium
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # Close Chrome when the spider exits
        print("spider closed")
        self.browser.quit()
Define a custom JSPageMiddleware in middlewares.py.
Remember to register 'ArticleSpider.middlewares.JSPageMiddleware': 1 in settings.py; a small priority number means it is called early in the download chain.
from scrapy.http import HtmlResponse


class JSPageMiddleware(object):
    # Render dynamic pages through Chrome
    def process_request(self, request, spider):
        if spider.name == "jobbole":
            # browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe")
            spider.browser.get(request.url)
            import time
            time.sleep(3)
            print("Visiting: {0}".format(request.url))
            # Returning an HtmlResponse here skips the default downloader
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request)