# spider 程序
import scrapy# from scrapy.http.response.html import HtmlResponse
from ..items import GswwItem
# settings.py 中 ITEM_PIPELINES 的数值越小,优先级越高,对应 pipeline 越先执行
class GswwSpiderSpider(scrapy.Spider):
    """Spider for gushiwen.cn list pages.

    Yields one :class:`GswwItem` (title, dynasty, author, content)
    per poem found on the page.
    """

    name = 'gsww_spider'
    allowed_domains = ['gushiwen.cn']
    start_urls = ['https://www.gushiwen.cn/default.aspx?page=2']

    def myprint(self, value1, *args):
        """Debug helper: print the given values followed by a separator line."""
        print(value1, *args)
        print("=" * 30)

    def parse(self, response):
        """Parse one list page and yield a GswwItem per complete poem entry.

        ``response.xpath`` returns Selector objects (tag objects) that
        support further ``.xpath()``/``.css()`` calls; ``.get()`` returns
        the first matched value, ``.getall()`` returns all of them.
        """
        gsw_divs = response.xpath("//div[@class='left']/div[@class='sons']")
        for gsw_div in gsw_divs:
            titles = gsw_div.xpath('.//b/text()').getall()
            # Expected layout is [dynasty, author]; malformed entries may
            # have fewer <a> tags, so length is checked below.
            source = gsw_div.xpath(".//p[@class='source']/a/text()").getall()
            # //text() collects every descendant text node under contson.
            content_list = gsw_div.xpath(".//div[@class='contson']//text()").getall()
            # Bug fix: the original only tested `source` for truthiness but
            # then indexed source[1], raising IndexError when exactly one
            # <a> tag was present. Require both entries explicitly.
            if not (titles and len(source) >= 2 and content_list):
                continue
            yield GswwItem(
                title=titles[0],
                dynasty=source[0],
                author=source[1],
                content="".join(content_list).strip(),
            )
爬取一页古诗文网
怎么创建一个scrapy项目?
mac
在终端 cd 进入想要创建项目的文件夹,再输入 scrapy startproject [项目名称]
再cd [项目名称]
再scrapy genspider [爬虫名称] [爬虫作用的域名]
在 spider 里面提取数据,然后在 pipelines 里面存储数据。
在 settings.py 里面设置请求头,并启用(取消注释)
ITEM_PIPELINES

网友评论