Scraping Douban with Scrapy and saving the data to a database

By 逢笔生辉 | Published 2023-02-17 15:21

Spider code

    import scrapy
    from itemloaders.processors import MapCompose

    from douban.items import DoubanItem


    class DoubanmovieSpider(scrapy.Spider):
        name = "doubanmovie"
        allowed_domains = ["douban.com"]
        # One Top250 page (25 movies) per URL; widen the range to crawl more pages.
        start_urls = [f'https://movie.douban.com/top250?start={i * 25}&filter=' for i in range(3, 4)]

        def parse(self, response):
            item = DoubanItem()
            hd_links = response.xpath('//div[@class="hd"]/a')
            # Crude slice of the "start=NN" offset out of the URL.
            item['page'] = response.url[-11:-8]

            # Each div.bd > p holds two text nodes per movie: a director/actor
            # line and a "year / country / genre" line. Strip whitespace, then
            # sort the lines by whether they start with a digit (the year).
            info_lines = MapCompose(str.strip)(response.xpath('//div[@class="bd"]/p/text()').getall())
            detail_lines = []
            credit_lines = []
            for line in info_lines:
                if line == '':
                    continue
                if line[0].isdigit():
                    detail_lines.append(line)
                else:
                    credit_lines.append(line)

            # "导演: X\xa0\xa0\xa0主演: Y": the director sits between the first and
            # last colon (minus the trailing "主演"), the actors after the last colon.
            directors = [line.split(':')[1][:-2].strip('\xa0') for line in credit_lines]
            actors = [line.split(':')[-1].strip('\xa0') for line in credit_lines]
            quotes = response.xpath('//p[@class="quote"]/span/text()').getall()
            item['dirctor'] = directors
            item['actor'] = actors
            item['release_date'] = [line.split('\xa0/\xa0')[0] for line in detail_lines]
            item['release_country'] = [line.split('\xa0/\xa0')[1] for line in detail_lines]
            item['movietype'] = [line.split('\xa0/\xa0')[2] for line in detail_lines]
            item['movie_quote'] = quotes

            strip_nbsp = MapCompose(lambda s: s.strip('\xa0'))
            other_title = MapCompose(lambda s: s.strip('\xa0/\xa0'), lambda s: s.replace('/', ','))
            links, names, other_names = [], [], []
            for a in hd_links:
                links.append(strip_nbsp(a.xpath('@href').getall()))
                names.append(strip_nbsp(a.xpath('./span/text()').get()))
                other_names.append(other_title(a.xpath('./span/text()').getall()[1:]))
            item['othermovie'] = other_names
            item['wangyeliangjie'] = links
            item['moviename'] = names

            # div.star mixes the numeric score and the "NNN人评价" rating count.
            star_texts = response.xpath("//div[@class='star']//text()").getall()
            evaluators = []
            scores = []
            for text in star_texts:
                if not text or not text[0].isdigit():
                    continue
                if text[-1] == '价':   # e.g. "2840024人评价"
                    evaluators.append(text)
                else:                  # e.g. "9.7"
                    scores.append(text)
            item['score'] = scores
            item['evaluator'] = evaluators

            # Poster names and URLs come from the img attributes; the last five
            # img tags on the page are site chrome rather than posters.
            img_attrs = response.xpath("//img/@*").getall()[:-5]
            pic_names = []
            pic_links = []
            for attr in img_attrs:
                if attr == '' or attr[0].isdigit():
                    continue
                if attr[0] == 'h':     # http(s) URLs are the poster links
                    pic_links.append(attr)
                else:
                    pic_names.append(attr)
            # item['movie_picname'] = pic_names
            # item['moviepicture'] = pic_links

            # Follow each movie's detail page, then emit the page item.
            # (parse is a generator, so "return item" would silently drop it.)
            for link in links:
                yield scrapy.Request(url=link[0], callback=self.shuchu)
            yield item

        def shuchu(self, response):
            # Placeholder callback for the detail pages. Note that response here
            # is an HtmlResponse object (response.url, response.status and
            # response.meta are all available).
            print('*' * 10)

        # An earlier attempt returned a bare FormRequest from start_requests and
        # failed with "'FormRequest' object is not iterable": start_requests
        # must return an iterable of Requests.
        # def start_requests(self):
        #     return [scrapy.FormRequest('https://movie.douban.com/top250?start=25&filter=')]

    """moviename response.xpath('//div[@class="hd"]/a/span[1]/text()').getall()

    othermoviename response.xpath('//div[@class="hd"]/a/span[2]/text()').getall()

      response.xpath('//div[@class="hd"]/a').getall()[1]

    在进行分类导演 response.xpath('//div[@class="bd"]/p/text()').get(0)

    '                          导演: 弗兰克·德拉邦特 Frank Darabont主演: 蒂姆·罗宾斯 Tim Robbins /...'

    一栏

    quote

    response.xpath('//p[@class="quote"]/span/text()').getall()

    图片和连接//img/@*[2]

    """


    """start_requests

    该方法必须返回一个可迭代对象(iterable)。该对象包含了spider用于抓取的第一个Request。

    当spider起订抓取并且未指定url时,该方法被调用。当指定了url时,make_requests_from_url()将被调用来创建request对象。该方法仅仅会被scrapy调用一次,因此您可以将其实现为生成器。

    该方法的默认实现是使用start_urls的url生成request。

    如果您想要修改最初抓取某个网站的request对象,您可以重写(override)该方法。例如,如果您需要在启动时以POST登录某个网站,你可以这么写:


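    A minimal sketch of that POST-login override, following the pattern from the Scrapy documentation (the URL, form fields, and callback name are placeholders):

    import scrapy

    class LoginSpider(scrapy.Spider):
        name = 'login_example'

        def start_requests(self):
            # Return a list (an iterable!), not a bare FormRequest.
            return [scrapy.FormRequest('http://www.example.com/login',
                                       formdata={'user': 'john', 'pass': 'secret'},
                                       callback=self.logged_in)]

        def logged_in(self, response):
            # From here, extract links and yield further Requests.
            pass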
    Item code

    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    from scrapy.item import Item, Field


    class DoubanItem(Item):
        page = Field()             # page marker sliced from the URL
        moviepicture = Field()     # poster image URLs
        movie_picname = Field()    # poster image names
        moviename = Field()        # primary titles
        wangyeliangjie = Field()   # detail-page links
        othermovie = Field()       # alternative titles
        dirctor = Field()          # directors
        actor = Field()            # leading actors
        score = Field()            # numeric ratings
        release_date = Field()     # release years
        release_country = Field()  # countries/regions
        movietype = Field()        # genres
        evaluator = Field()        # "NNN人评价" rating counts
        movie_quote = Field()      # one-line quotes
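    The spider imports ItemLoader and MapCompose but ends up populating the item by hand; the same stripping rules could instead be attached declaratively through a loader. A minimal sketch, assuming the project layout above (the DoubanLoader name is illustrative):

    from itemloaders.processors import MapCompose
    from scrapy.loader import ItemLoader

    from douban.items import DoubanItem


    class DoubanLoader(ItemLoader):
        default_item_class = DoubanItem
        # Strip whitespace and non-breaking spaces from every extracted value.
        default_input_processor = MapCompose(str.strip, lambda s: s.strip('\xa0'))

    # Inside parse():
    #     loader = DoubanLoader(response=response)
    #     loader.add_xpath('moviename', '//div[@class="hd"]/a/span[1]/text()')
    #     yield loader.load_item()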

    Pipeline code

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    import json

    import pymysql
    import scrapy
    from scrapy.exceptions import DropItem
    from scrapy.pipelines.images import ImagesPipeline  # used by the disabled ImgsPipLine below


    class DoubanPipeline:

        def __init__(self):
            # Open the connection; charset='utf8' is needed to store Chinese text.
            self.conn = pymysql.connect(host='localhost', user='root', password='123qwe',
                                        database='bjpowernode', charset='utf8')
            # Create the cursor.
            self.cursor = self.conn.cursor()

        def process_item(self, item, spider):
            # One table per Top250 page. The spider's URL slicing can leave a
            # stray '=' in item['page'], and a bare number is not a valid table
            # name, so build one explicitly.
            table = 'page_' + item['page'].strip('=')
            try:
                self.cursor.execute('DROP TABLE IF EXISTS `%s`' % table)
                create_sql = '''CREATE TABLE `%s`(
                    moviename TEXT, wangyeliangjie TEXT, othermovie TEXT, dirctor TEXT,
                    score TEXT, release_date TEXT, release_country TEXT, movietype TEXT,
                    evaluator TEXT, movie_quote TEXT)''' % table
                self.cursor.execute(create_sql)
                print("Table created.")
            except pymysql.Error as error:
                print("Failed to create table: " + str(error))
                self.conn.rollback()
            insert_sql = '''INSERT INTO `%s` (moviename, wangyeliangjie, othermovie,
                dirctor, score, release_date, release_country, movietype, evaluator,
                movie_quote) VALUES (%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)''' % table
            # Every field is a list, which pymysql cannot serialize directly,
            # so dump each one to a JSON string first.
            value = tuple(json.dumps(item[key], ensure_ascii=False) for key in
                          ('moviename', 'wangyeliangjie', 'othermovie', 'dirctor',
                           'score', 'release_date', 'release_country', 'movietype',
                           'evaluator', 'movie_quote'))
            self.cursor.execute(insert_sql, value)
            # Commit, or nothing is actually saved to the database.
            self.conn.commit()
            return item

        def close_spider(self, spider):
            # Close the cursor and the connection.
            self.cursor.close()
            self.conn.close()

        # Alternative: write each page to CSV with pandas instead of MySQL.
        # def process_item(self, item, spider):
        #     import pandas as pd
        #     d = {'moviename': item['moviename'],
        #          'wangyeliangjie': item['wangyeliangjie'],
        #          'othermovie': item['othermovie'],
        #          'dirctor': item['dirctor'],
        #          'score': item['score'],
        #          'release_date': item['release_date'],
        #          'release_country': item['release_country'],
        #          'movietype': item['movietype'],
        #          'evaluator': item['evaluator'],
        #          'movie_quote': item['movie_quote']}
        #     # The dict keys become the CSV column names; all the lists must
        #     # have the same length or the DataFrame constructor raises.
        #     dataframe = pd.DataFrame(pd.DataFrame.from_dict(d, orient='index').values.T,
        #                              columns=list(d.keys()))
        #     dataframe.to_csv(r"C:\Users\29258\Desktop\demon1\%s.csv" % item['page'],
        #                      sep=',', index=None)
        #     return item
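    To spot-check what DoubanPipeline wrote, the page table can be queried back directly (a sketch; `page_100` is a hypothetical table name produced by the scheme above):

    import json

    import pymysql

    conn = pymysql.connect(host='localhost', user='root', password='123qwe',
                           database='bjpowernode', charset='utf8')
    with conn.cursor() as cursor:
        cursor.execute('SELECT moviename, score FROM `page_100`')
        for moviename, score in cursor.fetchall():
            # Each column holds a JSON-encoded list (see process_item above).
            print(json.loads(moviename)[:3], json.loads(score)[:3])
    conn.close()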

    """class ImgsPipLine(ImagesPipeline):

        def get_media_requests(self, item, info):

            # print(item['moviepicture'])

            # print((item['moviename'][1][0]))

            x=0

            for url in item['moviepicture']:

                yield scrapy.Request(url=url,meta={'item':item["moviename"][x]})

                x+=1

        #

        # # # 返回图片名称即可

        def file_path(self, request, response=None, info=None, *, item=None):

            itemd=request.meta['item'][0]

            return 'full//%s.jpg' % (itemd)

        #

        def item_completed(self, results, item, info):

            image_paths = [x['path'] for ok, x in results if ok]

            return image_paths

        # def item_completed(self, results, item, info):

        #    return results

        #    pass

        """
