1. Create a new Scrapy project
Run the following on the command line:
gavin@gavin:~/PycharmProjects/scrapy$ mkdir tutorial
gavin@gavin:~/PycharmProjects/scrapy$ scrapy startproject tutorial
gavin@gavin:~/PycharmProjects/scrapy$ tree
The generated directory structure is as follows (a typical layout is sketched after the file list below).
The main files are:
items.py: defines the fields to be scraped.
pipelines.py: processes the scraped items, including filtering and output.
settings.py: project configuration.
dmoz_spider.py: the main spider code, which extracts the page content.
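For reference, scrapy startproject tutorial typically produces a layout like the one below (the original post showed this as a screenshot, scrapy.png; the exact files vary slightly between Scrapy versions, and dmoz_spider.py is the spider we add under spiders/ in step 4):
tutorial/
    scrapy.cfg
    tutorial/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            dmoz_spider.py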
2. Define the item
Items are defined in items.py. In this example we scrape the title, link, description, and price:
import scrapy

class TutorialItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class DmozItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()
    price = scrapy.Field()
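Scrapy items behave like dictionaries, so fields can be set and read with the usual dict syntax; a quick interactive check (the values here are made up) looks like this:
item = DmozItem(title='example', link='http://example.com')
item['price'] = '9.99'
print(dict(item))  # {'title': 'example', 'link': 'http://example.com', 'price': '9.99'}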
3. Define the pipelines
To write the data out, first register the pipelines in settings.py (the values are priorities; pipelines with lower numbers run first):
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
    'tutorial.pipelines.MysqlPipeline': 400,
}
In pipelines.py we write the scraped items both to a file and to a MySQL database; the two pipelines are shown separately below.
First, the pipeline that writes to a file:
import json

class TutorialPipeline(object):
    # open the output file when the pipeline is created
    def __init__(self):
        self.file = open('item', 'w', encoding='utf-8')

    # serialize each item to JSON and write it as one line
    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(line + "\r\n")
        return item

    # close the file when the spider finishes
    def close_spider(self, spider):
        self.file.close()
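Each call to process_item appends one JSON line to the file named item, so the output ends up looking roughly like this (the values below are purely illustrative):
{"title": "Example product", "link": "http://www.ushsh.com/example-product", "price": "$9.99", "desc": "A short product description"}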
Next, the pipeline that writes to MySQL:
import pymysql

class MysqlPipeline(object):
    # open the database connection when the pipeline is created
    def __init__(self):
        config = {
            'host': '192.168.1.200',
            'port': 3308,
            'user': 'root',
            'password': '1qaz@WSX',
            'db': 'data_department',
            'charset': 'utf8',
        }
        self.connect = pymysql.connect(**config)
        self.cursor = self.connect.cursor()

    # pull the fields out of the item and insert them into the database
    def process_item(self, item, spider):
        title = item['title']
        link = item['link']
        des = item['desc']
        price = item['price']
        insert_sql = """
            insert into zk_cylw(title, link, des, price) values (%s, %s, %s, %s);
        """
        para = (title, link, des, price)
        self.cursor.execute(insert_sql, para)
        self.connect.commit()
        return item

    # close the database connection when the spider finishes
    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
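The insert assumes a zk_cylw table with these four columns already exists in the data_department database. A minimal sketch of a matching table definition (the column types and the id column are assumptions; adjust them to your data) might be:
CREATE TABLE zk_cylw (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    link VARCHAR(512),
    des TEXT,
    price VARCHAR(64)
) DEFAULT CHARSET=utf8;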
4. The spider
The main spider code (dmoz_spider.py):
import sys
sys.path.append('/home/gavin/PycharmProjects/scrapy/tutorial')
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy import http
from tutorial.items import DmozItem

class DmozSpider(Spider):
    name = "dmoz"
    start_urls = [
        "http://www.ushsh.com/index.php?route=product/productall&page=1",
    ]

    def parse(self, response):
        sel = Selector(response)
        # sites = sel.xpath('//div[@class="name"]/a')
        sites = sel.css('div.product-grid > div')
        items = []
        for site in sites:
            item = DmozItem()
            title = site.css('div.name > a::text').extract()[0]
            link = site.css('div.name > a::attr("href")').extract()[0]
            des = site.css('div.description::text').extract()[0]
            price = site.css('div.price::text').extract()[0].replace(' ', '').replace('\n', '').replace('\r', '')
            item['title'] = title
            item['link'] = link
            # item['desc'] = des
            item['price'] = price
            items.append(item)
            # crawl the detail (second-level) page with a callback; the item travels along in meta
            yield http.Request(url=item["link"], meta={'item': item}, callback=self.parseDetail, dont_filter=True)
            # yield item
        # follow the link to the next page
        nextPage = sel.xpath('//div[@class="links"]/a/@href').extract()[-2]
        if nextPage:
            yield http.Request(nextPage, callback=self.parse)

    # process the detail page: fill in the description and emit the finished item
    def parseDetail(self, response):
        item = response.meta['item']
        selector = Selector(response)
        des = selector.xpath('//meta[@name="description"]/@content').extract()[-1]
        item['desc'] = des
        yield item
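The CSS and XPath selectors used above are easiest to verify interactively with scrapy shell before running the full crawl (assuming the site is still reachable); for example:
gavin@gavin:~/PycharmProjects/scrapy/tutorial$ scrapy shell "http://www.ushsh.com/index.php?route=product/productall&page=1"
>>> response.css('div.product-grid > div div.name > a::text').extract()[:3]
>>> response.xpath('//div[@class="links"]/a/@href').extract()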
5. Write the launcher script scrapy_start.py
Put scrapy_start.py in the same directory as items.py:
from scrapy.cmdline import execute
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# 'dmoz' is the value of name defined in the spider
def main():
    execute(['scrapy', 'crawl', 'dmoz'])

if __name__ == '__main__':
    main()
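Alternatively, the crawl can be started straight from the project root on the command line; the optional -o flag uses Scrapy's feed export to also dump the yielded items to a file:
gavin@gavin:~/PycharmProjects/scrapy/tutorial$ scrapy crawl dmoz
gavin@gavin:~/PycharmProjects/scrapy/tutorial$ scrapy crawl dmoz -o items.json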