Crawl listing data from the so-called "campus beauty" site 校花网 (521609.com).
Create a project:
scrapy startproject tutorial
cd into the tutorial project directory and create a spider:
scrapy genspider belle 521609.com
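genspider creates tutorial/spiders/belle.py with roughly the following skeleton (the exact template output varies by Scrapy version, so treat this as a sketch):
# -*- coding: utf-8 -*-
import scrapy


class BelleSpider(scrapy.Spider):
    name = 'belle'
    allowed_domains = ['521609.com']
    start_urls = ['http://521609.com/']

    def parse(self, response):
        pass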
1. Storing to a file
Do the basic configuration in settings.py:
# UA spoofing
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
# Set the log level
LOG_LEVEL = 'ERROR'
# Do not obey robots.txt
ROBOTSTXT_OBEY = False
In items.py, rename the item class to BellelItem so it can be told apart from the item classes of other spiders in the same project, and define the fields to store:
import scrapy


class BellelItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    img_url = scrapy.Field()
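A scrapy.Item behaves much like a dict, so a quick sanity check could look like this (the values below are made-up placeholders):
from tutorial.items import BellelItem

# Items support dict-style access; the values here are placeholders only.
item = BellelItem()
item['title'] = 'example title'
item['img_url'] = 'http://www.521609.com/uploads/example.jpg'
print(dict(item))  # a plain dict with the two fields defined above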
Parse the site: in belle.py, extract the data with XPath.
# -*- coding: utf-8 -*-
import scrapy
from tutorial.items import BellelItem


class BelleSpider(scrapy.Spider):
    name = 'belle'
    allowed_domains = ['521609.com']
    start_urls = ['http://www.521609.com/xiaoyuanmeinv/']

    def parse(self, response):
        ph_list = response.xpath('//div[@class="index_img list_center"]/ul/li')
        for p in ph_list:
            title = p.xpath('./a[2]/text()').extract_first()
            # title = p.xpath('//a[@class="title"]/text()').extract_first()
            img_url = 'http://www.521609.com' + p.xpath('./a[1]/img/@src').extract_first()
            item = BellelItem()
            item['title'] = title
            item['img_url'] = img_url
            # hand the item over to the pipelines
            yield item
In pipelines.py: process_item() runs once for every item the spider yields, so if the file were opened inside process_item() it would be reopened on every item. Instead, override open_spider() and close_spider() so the file is opened and closed only once.
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class TutorialPipeline(object):
    f = None

    def open_spider(self, spider):
        self.f = open("./belle.txt", 'w', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['title']
        img_url = item['img_url']
        self.f.write(title + ":" + img_url + "\n")
        return item

    def close_spider(self, spider):
        self.f.close()
In settings.py, uncomment ITEM_PIPELINES to enable the pipeline; the smaller the number, the higher the priority.
# 300 is the priority; the smaller the number, the higher the priority
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
}
Run the spider:
scrapy crawl belle
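After the run, belle.txt should hold one title:img_url pair per line. A minimal sketch to inspect it, assuming the default path used above:
# Read back the file written by TutorialPipeline; each line is "title:img_url".
with open("./belle.txt", encoding="utf-8") as f:
    for line in f:
        # split on the first colon only, since the URL itself contains "://"
        title, img_url = line.rstrip("\n").split(":", 1)
        print(title, "->", img_url)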
2. Storing to a database
Modify pipelines.py as follows, importing the pymysql module:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class TutorialPipeline(object):
    f = None

    def open_spider(self, spider):
        self.f = open("./belle.txt", 'w', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['title']
        img_url = item['img_url']
        self.f.write(title + ":" + img_url + "\n")
        # pass the item on to the next pipeline class
        return item

    def close_spider(self, spider):
        self.f.close()


class MySQLPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='123', db='belle', charset='utf8')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        title = item['title']
        img_url = item['img_url']
        # parameterized query, so quotes inside a title cannot break the SQL
        insert_sql = "insert into belle values(%s, %s)"
        try:
            self.cursor.execute(insert_sql, (title, img_url))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
Add the new pipeline to the configuration in settings.py:
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
    # the new MySQLPipeline class, with a lower priority than the previous one
    'tutorial.pipelines.MySQLPipeline': 301,
}
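The insert statement above assumes a belle database with a two-column belle table. The original post does not show the schema, so the following one-off setup script is only a sketch of what it might look like:
# Assumed schema for MySQLPipeline (not shown in the original post):
# a `belle` database containing a `belle` table with two varchar columns.
import pymysql

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', charset='utf8')
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS belle DEFAULT CHARACTER SET utf8")
cursor.execute(
    "CREATE TABLE IF NOT EXISTS belle.belle ("
    "title VARCHAR(255), "
    "img_url VARCHAR(255))"
)
conn.commit()
cursor.close()
conn.close()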
3. Storing to Redis
In pipelines.py, import redis and add a RedisPipeline class.
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json

import pymysql
import redis


class TutorialPipeline(object):
    f = None

    def open_spider(self, spider):
        self.f = open("./belle.txt", 'w', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['title']
        img_url = item['img_url']
        self.f.write(title + ":" + img_url + "\n")
        # pass the item on to the next pipeline class
        return item

    def close_spider(self, spider):
        self.f.close()


class MySQLPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='123', db='belle', charset='utf8')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        title = item['title']
        img_url = item['img_url']
        # parameterized query, so quotes inside a title cannot break the SQL
        insert_sql = "insert into belle values(%s, %s)"
        try:
            self.cursor.execute(insert_sql, (title, img_url))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()


class RedisPipeline(object):
    conn = None

    def open_spider(self, spider):
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        title = item['title']
        img_url = item['img_url']
        item_dic = {
            "title": title,
            "img": img_url,
        }
        # serialize the item to JSON and push it onto the "belle" list
        self.conn.lpush("belle", json.dumps(item_dic))
        return item

    def close_spider(self, spider):
        print("spider finished")
Enable the pipelines in settings.py and set their priorities.
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
    # the new MySQLPipeline class, with a lower priority than the previous one
    'tutorial.pipelines.MySQLPipeline': 301,
    'tutorial.pipelines.RedisPipeline': 302,
}
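To check what RedisPipeline pushed, the list can be read back and the JSON decoded. A minimal sketch, assuming the same local Redis instance as above:
# Read back the "belle" list pushed by RedisPipeline and decode each JSON record.
import json

import redis

conn = redis.Redis(host='127.0.0.1', port=6379)
for raw in conn.lrange("belle", 0, -1):
    record = json.loads(raw)
    print(record["title"], record["img"])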