Scrapy: Crawling After Login

Author: 童蒙vlog | Published 2017-07-20 22:30

    The idea is simple: log in once with a requests.Session() inside parse_start_url, copy the session cookies into a dict, and attach them to the Scrapy requests that follow. Straight to a code example:

    import scrapy
    import requests
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    from scrapy.http import Request, FormRequest, HtmlResponse
    from spiderman.items import SpidermanItem
    
    class ItjuziWebSpider(CrawlSpider):
        name = "itjuziweb"
        allowed_domains = ["itjuzi.com"]
    
        start_urls =["https://www.itjuzi.com/user/login"]
    
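        # Rule 1 follows the paginated company listing; Rule 2 sends each
        # company detail page to parse_item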
        rules = (
            Rule(LinkExtractor(allow=(r'http://www.itjuzi.com/company\?page=\d+', ))),
    
            Rule(LinkExtractor(allow=(r'http://www.itjuzi.com/company/\d+', )), callback='parse_item'),
        )
    
        def __init__(self, *a, **kw):
            super(ItjuziWebSpider, self).__init__(*a, **kw)
    
            self.cookie = ''
            self.resp = requests.Session()
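            # A lightweight header set for crawling and a fuller, browser-like
            # set for the login POST below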
            self.request_headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
                "Referer": "https://www.itjuzi.com",
            }
    
            self.post_headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Referer": "https://www.itjuzi.com/",
            }
    
    
    
        def parse_start_url(self, response):
            # Log in with requests.Session(); once the cookies are captured, seed the crawl
            print('Preparing login')
            url = 'https://www.itjuzi.com/user/login'
            post_data = {
                'identity': 'your_account@example.com',  # placeholder: use your own IT桔子 account
                'password': 'your_password',             # placeholder
                'remember': '1',
                'page': '',
                'url': '',
            }
            r = self.resp.post(url, headers=self.post_headers, data=post_data)
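            # Copy the session cookies into a plain dict so they can be passed
            # to scrapy.Request via its cookies= argument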
            cookie_dict = {}
            for x in r.cookies:
                cookie_dict[x.name] = x.value
            self.cookie = cookie_dict
    
            # Seed the crawl with the logged-in cookies; the CrawlSpider rules take over from here
            yield Request('http://www.itjuzi.com/company?page=1', headers=self.post_headers, cookies=self.cookie)
    
    
        def parse_item(self, response):
            self.log('Hi, this is an item page! %s' % response.url)
    
            item = SpidermanItem()
            item['item_id'] = response.xpath("//*[@id='modal_myinc']/div/div/div[2]/div/form/input/@value").extract()[0]
            item['item_name'] = response.xpath("//input[contains(@name,'com_name')]/@value").extract()[0]
            item_logo = response.xpath("//div[contains(@class,'rowhead')]/div[contains(@class,'pic')]/img/@src").extract()
            # .extract() returns a list (never None), so test for emptiness
            if not item_logo:
                item['item_logo'] = ''
            else:
                item['item_logo'] = ','.join(item_logo)
            item['item_brief'] = response.xpath("//meta[@name='Description']/@content").extract()[0]
            item_area = response.xpath("//div[contains(@class,'tagset dbi c-gray-aset')]/a/span/text()").extract()
            if not item_area:
                item['item_area'] = ''
            else:
                item['item_area'] = ','.join(item_area)
            item_CEO = response.xpath("//a[contains(@class,'title')]//span[contains(@class,'c')][1]/text()").extract()
            if not item_CEO:
                item['item_CEO'] = ''
            else:
                item['item_CEO'] = ','.join(item_CEO)
            item_round = response.xpath("//span[contains(@class,'t-small c-green')]/text()").extract()
            if not item_round:
                item['item_round'] = ''
            else:
                item['item_round'] = ','.join(item_round).strip(',\n\t()')
    
            item_website = response.xpath("//input[contains(@name,'com_url')]/@value").extract()
            if not item_website:
                item['item_website'] = ''
            else:
                item['item_website'] = ','.join(item_website).strip(',')
    
            item['item_from'] = 'IT桔子'
            item['item_phone'] = ''
            item['item_email'] = ''
            item_weixin = response.xpath("//li[@class='wx-text']/a/text()").extract()
            if not item_weixin:
                item['item_weixin'] = ''
            else:
                item['item_weixin'] = ','.join(item_weixin).strip(',').strip(' ')
    
            item_weibo = response.xpath("//div[@class='link-line']/a[1]/@href").extract()
            if not item_weibo:
                item['item_weibo'] = ''
            else:
                item['item_weibo'] = ','.join(item_weibo).strip(',').strip(' ')
    
            item['item_from_website'] = response.url
            item_address = response.xpath("//span[contains(@class,'loca c-gray-aset')]/a/text()").extract()
            if not item_address:
                item['item_address'] = ''
            else:
                item['item_address'] = ','.join(item_address).strip(',').strip(' ')
            return item
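
    The spider imports SpidermanItem from spiderman.items, but that file is never shown. Here is a minimal sketch of what it would need to contain, reconstructed purely from the fields populated in parse_item above (the class body is an assumption, not the author's code):

    # spiderman/items.py -- hypothetical reconstruction from the fields used above
    import scrapy

    class SpidermanItem(scrapy.Item):
        item_id = scrapy.Field()
        item_name = scrapy.Field()
        item_logo = scrapy.Field()
        item_brief = scrapy.Field()
        item_area = scrapy.Field()
        item_CEO = scrapy.Field()
        item_round = scrapy.Field()
        item_website = scrapy.Field()
        item_from = scrapy.Field()
        item_phone = scrapy.Field()
        item_email = scrapy.Field()
        item_weixin = scrapy.Field()
        item_weibo = scrapy.Field()
        item_from_website = scrapy.Field()
        item_address = scrapy.Field()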
    

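    Note that the spider imports FormRequest but never uses it. Mixing requests with Scrapy works, but Scrapy can perform the login itself with FormRequest.from_response, which keeps the session in Scrapy's own cookie middleware and avoids a second HTTP stack entirely. A minimal sketch of that approach, reusing the form field names from the POST above (the spider name and credentials are placeholders):

    import scrapy
    from scrapy.http import FormRequest

    class ItjuziLoginSpider(scrapy.Spider):
        name = "itjuzi_login_demo"  # hypothetical name for this sketch
        start_urls = ["https://www.itjuzi.com/user/login"]

        def parse(self, response):
            # from_response pre-fills the form's hidden inputs; Scrapy's cookie
            # middleware keeps the session for every request that follows
            return FormRequest.from_response(
                response,
                formdata={
                    'identity': 'your_account@example.com',  # placeholder
                    'password': 'your_password',             # placeholder
                    'remember': '1',
                },
                callback=self.after_login,
            )

        def after_login(self, response):
            # Logged in -- follow the listing page with the session attached
            yield scrapy.Request('http://www.itjuzi.com/company?page=1',
                                 callback=self.parse_list)

        def parse_list(self, response):
            self.log('Logged-in page: %s' % response.url)

    Either spider is run the usual way from the project root, e.g. scrapy crawl itjuziweb.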