美文网首页
crawlspider示例和登录

crawlspider示例和登录

作者: sixkery | 来源:发表于2018-09-24 15:37 被阅读30次
    • 创建crawlspider
    scrapy startproject wxapp
    cd wxapp
    scrapy genspider -t crawl wxappspider www.wxapp-union.com
    

    wxappspider.py文件中:

    # -*- coding: utf-8 -*-
    
    '''微信小程序教程页面全部爬取'''
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from wxapp.items import WxappItem
    
    class WxappSpiderSpider(CrawlSpider):
        """Crawl WeChat mini-program tutorial articles on wxapp-union.com.

        Follows paginated list pages and extracts the title from each
        article detail page.
        """
        name = 'wxapp_spider'
        allowed_domains = ['wxapp-union.com']
        start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

        rules = (
            # List (pagination) pages: just follow them, no callback —
            # there is nothing to extract on the listing itself.
            Rule(LinkExtractor(allow=r'.*?page=\d'), follow=True),
            # Article detail pages: extract fields, do not follow further.
            Rule(LinkExtractor(allow=r'.*?article-.*?\.html'),
                 callback='parse_item', follow=False)
        )

        def parse_item(self, response):
            """Extract the article title from a detail page and yield an item.

            FIX: the original stored the list returned by extract();
            extract_first() yields the title string itself (None if the
            <h1 class="ph"> node is missing).
            """
            item = WxappItem()
            item['title'] = response.xpath('//h1[@class="ph"]/text()').extract_first()
            yield item
    
    

    pipelines.py文件中:

    
    from scrapy.exporters import JsonLinesItemExporter
    
    # 将 item 逐行保存到 json 文件中
    class WxappPipeline(object):
        """Persist scraped items to wxapp.json, one JSON object per line."""

        def __init__(self):
            # Binary mode: JsonLinesItemExporter encodes and writes bytes itself.
            self.file = open('wxapp.json', 'wb')
            self.exporter = JsonLinesItemExporter(
                self.file, ensure_ascii=False, encoding='utf-8')
            # Exporter contract: signal the start of the export process.
            self.exporter.start_exporting()

        def process_item(self, item, spider):
            """Export one item and pass it on down the pipeline."""
            self.exporter.export_item(item)
            return item

        def close_spider(self, spider):
            # BUG FIX: Scrapy calls close_spider(spider); the original's
            # extra `item` parameter caused a TypeError at shutdown.
            self.exporter.finish_exporting()
            self.file.close()
    
    

    settings.py 更改的地方（下面是 Scrapy 设置项，不是 item.py 的内容）:

    # NOTE(review): these are Scrapy project settings — they belong in
    # settings.py (the article heading mislabels the file).
    # Do not honor robots.txt for this crawl.
    ROBOTSTXT_OBEY = False
    # Throttle: wait one second between requests.
    DOWNLOAD_DELAY = 1
    # Default headers sent with every request (browser-like UA + Referer).
    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Referer': 'http://www.wxapp-union.com/article-4527-1.html'
    }
    # Enable the JSON-lines pipeline defined in pipelines.py (priority 300).
    ITEM_PIPELINES = {
       'wxapp.pipelines.WxappPipeline': 300,
    }
    

    登录

    
    '''登陆人人网
    重写start_requests方法来实现登录'''
    
    import scrapy
    
    
    class RenrenSpider(scrapy.Spider):
        """Log in to renren.com by overriding start_requests with a form POST."""
        # NOTE(review): the spider name says 'zhihu' but the spider targets
        # renren — looks like a copy-paste leftover; confirm before renaming
        # (the name is what `scrapy crawl <name>` uses).
        name = 'zhihu'
        allowed_domains = ['www.renren.com']
        start_urls = ['http://www.renren.com/']

        def start_requests(self):
            # POST the credentials to the login endpoint instead of fetching
            # start_urls with a plain GET.
            login_url = 'http://www.renren.com/PLogin.do'
            credentials = {
                'email': '',
                'password': '',
            }
            yield scrapy.FormRequest(
                login_url,
                formdata=credentials,
                callback=self.parse,
            )

        def parse(self, response):
            # Placeholder: post-login handling goes here.
            pass
    

    相关文章

      网友评论

          本文标题:crawlspider示例和登录

          本文链接:https://www.haomeiwen.com/subject/ioucoftx.html