美文网首页
scrapy爬取豆瓣并到数据库

scrapy爬取豆瓣并到数据库

作者: 逢笔生辉 | 来源:发表于2023-02-17 15:21 被阅读0次

spider代码

import logging

import re

import scrapy

from itemloaders.processors import Compose,MapCompose

from numpy.core.defchararray import isdigit

from scrapy import Request

from douban.items import DoubanItem

from scrapy.loader import ItemLoader

from itemloaders.processors import MapCompose,ChainMap

class DoubanmovieSpider(scrapy.Spider):
    """Scrape one listing page of the Douban Top 250 movie chart.

    ``parse`` collects every movie on the listing page into a single
    ``DoubanItem`` whose fields are parallel lists (one entry per movie), and
    schedules a request for each movie's detail page, handled by ``shuchu``.
    """

    name = "doubanmovie"

    allowed_domains = ["douban.com"]

    # Only the listing page with start=75 is requested; widen the range to
    # crawl more pages.
    start_urls = [
        f'https://movie.douban.com/top250?start={i*25}&filter='
        for i in range(3, 4)
    ]

    @staticmethod
    def _split_credits(line):
        """Split a credits line into ``(director, actor)`` text.

        Assumes the layout ``<label>: <director><nbsp...><label>: <actors>``,
        i.e. the two halves are separated by non-breaking spaces. A half that
        is missing (for example because the site truncated the line) yields an
        empty string instead of raising ``IndexError``.
        """
        director_part, _, actor_part = line.partition('\xa0')
        director = director_part.partition(':')[2].strip('\xa0')
        actor = actor_part.partition(':')[2].strip('\xa0')
        return director, actor

    @staticmethod
    def _split_release(line):
        """Split ``date / country / genres`` into exactly three strings.

        Missing trailing parts are returned as empty strings so that a short
        line cannot raise ``IndexError``.
        """
        parts = line.split('\xa0/\xa0')
        parts += [''] * (3 - len(parts))
        return parts[0], parts[1], parts[2]

    def parse(self, response):
        """Parse a Top 250 listing page.

        Yields:
            One ``scrapy.Request`` per movie detail link (callback
            ``shuchu``), followed by the populated ``DoubanItem``.
        """
        item = DoubanItem()

        # Take the page offset from the ``start`` query parameter; fall back
        # to the original fixed slice when the parameter is absent.
        offset = re.search(r'[?&]start=(\d+)', response.url)
        item['page'] = offset.group(1) if offset else response.url[-11:-8]

        info_lines = MapCompose(str.strip)(
            response.xpath('//div[@class="bd"]/p/text()').getall()
        )

        # Lines starting with a digit are treated as the release line
        # (year / country / genres); every other non-empty line is treated as
        # the credits line.
        release_lines = []
        credit_lines = []
        for line in info_lines:
            if not line:
                continue
            if line[0].isdigit():
                release_lines.append(line)
            else:
                credit_lines.append(line)

        people = [self._split_credits(line) for line in credit_lines]
        releases = [self._split_release(line) for line in release_lines]

        item['dirctor'] = [director for director, _ in people]
        item['actor'] = [actor for _, actor in people]
        item['release_date'] = [release[0] for release in releases]
        item['release_country'] = [release[1] for release in releases]
        item['movietype'] = [release[2] for release in releases]
        item['movie_quote'] = response.xpath(
            '//p[@class="quote"]/span/text()'
        ).getall()

        strip_nbsp = MapCompose(lambda text: text.strip('\xa0'))
        tidy_alias = MapCompose(
            lambda text: text.strip('\xa0/\xa0'),
            lambda text: text.replace('/', ','),
        )

        links = []
        names = []
        aliases = []
        for anchor in response.xpath('//div[@class="hd"]/a'):
            titles = anchor.xpath('./span/text()')
            links.append(strip_nbsp(anchor.xpath('@href').getall()))
            # The first <span> is the main title, the remaining ones are
            # alternative titles.
            names.append(strip_nbsp(titles.get()))
            aliases.append(tidy_alias(titles.getall()[1:]))

        item['othermovie'] = aliases
        item['wangyeliangjie'] = links
        item['moviename'] = names

        # Text nodes inside the rating block: a digit-leading string ending in
        # '价' is the vote count, any other digit-leading string is the score.
        scores = []
        votes = []
        for text in response.xpath("//div[@class='star']//text()").getall():
            if not text or not text[0].isdigit():
                continue
            if text[-1] == '价':
                votes.append(text)
            else:
                scores.append(text)

        item['score'] = scores
        item['evaluator'] = votes

        for link in links:
            if link:
                self.logger.debug('scheduling detail page %s', link[0])
                yield scrapy.Request(url=link[0], callback=self.shuchu)

        # ``parse`` is a generator, so the item must be yielded; a ``return``
        # value would be discarded and never reach the item pipelines.
        yield item

    def shuchu(self, response):
        """Placeholder callback for detail pages; only prints a marker line."""
        print('*'*10)

        # print(tuxiang)

        # i.add_xpath('moviename','//div[@class="hd"]/a')

        # print('-=====================')

        # print(i.load_item()['moviename'][0])

        # return i.load_item()

    # def start_requests(self):

    #    print(self.start_urls)

    #    return [scrapy.FormRequest('https://movie.douban.com/top250?start=25&filter=')]

        # return [scrapy.FormRequest('https://movie.douban.com/top250?start=25&filter=',callback=self.pon)]

#'FormRequest' object is not iterable

"""moviename response.xpath('//div[@class="hd"]/a/span[1]/text()').getall()

othermoviename response.xpath('//div[@class="hd"]/a/span[2]/text()').getall()

  response.xpath('//div[@class="hd"]/a').getall()[1]

在进行分类导演 response.xpath('//div[@class="bd"]/p/text()').get(0)

'                          导演: 弗兰克·德拉邦特 Frank Darabont主演: 蒂姆·罗宾斯 Tim Robbins /...'

一栏

quote

response.xpath('//p[@class="quote"]/span/text()').getall()

图片和连接//img/@*[2]

"""

        #注意这个response是个htmlresponse对象

        # print(response.meta)

        # print(response.url)

        # print(response.headers.encoding('utf-8'))

        # print(response.status)

"""start_requests

该方法必须返回一个可迭代对象(iterable)。该对象包含了spider用于抓取的第一个Request。

当spider启动抓取并且未指定url时,该方法被调用。当指定了url时,make_requests_from_url()将被调用来创建request对象。该方法仅仅会被scrapy调用一次,因此您可以将其实现为生成器。

该方法的默认实现是使用start_urls的url生成request。

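如果您想要修改最初抓取某个网站的request对象,您可以重写(override)该方法。例如,如果您需要在启动时以POST登录某个网站,你可以这么写:

    def start_requests(self):
        return [scrapy.FormRequest("http://www.example.com/login",
                                   formdata={'user': 'john', 'pass': 'secret'},
                                   callback=self.logged_in)]

    def logged_in(self, response):
        # 登录成功后,在这里继续处理返回的页面
        pass
"""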


————————————————

item

# Define here the models for your scraped items

#

# See documentation in:

# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

from scrapy.item import Item,Field

class DoubanItem(scrapy.Item):
    """Container for the data scraped from one Douban Top 250 listing page.

    The field names are used verbatim as keys by the spider and the pipeline,
    so their spelling (including ``dirctor``) must not be changed here alone.
    """

    # Identifier of the listing page the item was scraped from.
    page = scrapy.Field()

    # Poster image links and poster names.
    moviepicture = scrapy.Field()
    movie_picname = scrapy.Field()

    # Main title, detail-page link and alternative titles of each movie.
    moviename = scrapy.Field()
    wangyeliangjie = scrapy.Field()
    othermovie = scrapy.Field()

    # Credits.
    dirctor = scrapy.Field()
    actor = scrapy.Field()

    # Rating score.
    score = scrapy.Field()

    # Release information and genres.
    release_date = scrapy.Field()
    release_country = scrapy.Field()
    movietype = scrapy.Field()

    # Number of people who rated the movie, and the one-line quote.
    evaluator = scrapy.Field()
    movie_quote = scrapy.Field()

pipeline

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface

import functools

import json

import operator

from itemadapter import ItemAdapter

from scrapy.exceptions import DropItem

from scrapy.pipelines.images import ImagesPipeline

import scrapy

import pandas as pd

#a和b的长度必须保持一致,否则报错

# a = [x for x in range(5)]

# b = [x for x in range(5,10)]

#字典中的key值即为csv中列名

import  pymysql

#将DataFrame存储为csv,index表示是否显示行名,default=True

class DoubanPipeline:
    """Store scraped items in MySQL, one table per listing page.

    Every item field handled here is expected to be a list with one entry per
    movie. The lists are combined position by position into one table row per
    movie; lists that are shorter than the longest one are padded with NULL.
    """

    # (item field, table column) pairs, in column order. The column spellings
    # ``release_county`` and ``movietyupe`` are the ones the original SQL used
    # and are kept so the table schema stays the same.
    _COLUMNS = (
        ('moviename', 'moviename'),
        ('wangyeliangjie', 'wangyeliangjie'),
        ('othermovie', 'othermovie'),
        ('dirctor', 'dirctor'),
        ('score', 'score'),
        ('release_date', 'release_date'),
        ('release_country', 'release_county'),
        ('movietype', 'movietyupe'),
        ('evaluator', 'evaluator'),
        ('movie_quote', 'movie_quote'),
    )

    def __init__(self, host='localhost', user='root', password='123qwe',
                 database='bjpowernode', charset='utf8'):
        """Open the MySQL connection and create a cursor.

        The defaults reproduce the previously hard-coded connection settings.
        NOTE(review): credentials should come from the project settings or the
        environment rather than from source code.
        """
        # A charset is required so that Chinese text is stored correctly.
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=database, charset=charset)
        self.cursor = self.conn.cursor()

    @staticmethod
    def _table_name(page):
        """Return a table name derived from *page* that is safe to quote.

        Identifiers cannot be bound as query parameters, so everything except
        letters, digits and underscores is dropped before the name is put
        into the SQL text.
        """
        cleaned = ''.join(ch for ch in str(page) if ch.isalnum() or ch == '_')
        return cleaned or 'douban'

    @staticmethod
    def _cell(value):
        """Convert one scraped value into something storable in a TEXT column."""
        if value is None:
            return None
        if isinstance(value, (list, tuple)):
            return ','.join(str(part) for part in value)
        return str(value)

    @classmethod
    def _rows(cls, item):
        """Build one tuple per movie from the item's parallel lists."""
        from itertools import zip_longest

        columns = []
        for field, _ in cls._COLUMNS:
            values = item.get(field)
            if values is None:
                values = []
            elif not isinstance(values, (list, tuple)):
                values = [values]
            columns.append(values)
        return [
            tuple(cls._cell(value) for value in row)
            for row in zip_longest(*columns)
        ]

    def process_item(self, item, spider):
        """Create the page table if needed and insert one row per movie.

        Returns:
            The unchanged item, so that later pipelines still receive it.

        Raises:
            pymysql.Error: if creating the table or inserting the rows fails;
                the transaction is rolled back first.
        """
        table = self._table_name(item.get('page'))
        column_names = ['`%s`' % column for _, column in self._COLUMNS]

        create_sql = 'CREATE TABLE IF NOT EXISTS `%s` (%s)' % (
            table,
            ', '.join('%s TEXT' % column for column in column_names),
        )
        try:
            self.cursor.execute(create_sql)
            print("数据表创建完成!")
        except pymysql.Error as error:
            print("数据表创建失败:" + str(error))
            self.conn.rollback()
            raise

        # Only the identifiers are formatted into the statement; the values
        # are passed separately so the driver escapes them.
        insert_sql = 'INSERT INTO `%s` (%s) VALUES (%s)' % (
            table,
            ', '.join(column_names),
            ', '.join(['%s'] * len(column_names)),
        )
        rows = self._rows(item)
        if rows:
            try:
                self.cursor.executemany(insert_sql, rows)
                # Without a commit nothing is saved to the database.
                self.conn.commit()
            except pymysql.Error:
                self.conn.rollback()
                raise
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()

    # def __init__(self):

    #    self.wb = Workbook()  # 类实例化

    #    self.ws = self.wb.active  # 激活工作表

    #    self.ws.append([ 'wangyeliangjie'])#'dirctor', 'actor','score' '', 'author', 'release_date',

    #    #                'release_country','score','movietype','evaluator','movie_quote'])  # 添加表头

    # 将数据以行的形式添加到工作表中

      # 保存

    #

    # def process_item(self, item, spider):

    #    d={'moviename':item['moviename'],'wangyeliangjie': item['wangyeliangjie'],

    #    'othermovie':item['othermovie'],

    #    'dirctor':item['dirctor'],

    #    'score':item['score'],

    #    'release_date':item['release_date'],

    #    'release_county':item['release_country'],

    #    'movietyupe':item['movietype'],

    #    'evaluator':item['evaluator'],

    #    'movie_quote':item['movie_quote']}

    #    dataframe = pd.DataFrame (pd.DataFrame.from_dict(d, orient='index').values.T, columns=list(d.keys()))

    #    dataframe.to_csv(r"C:\Users\29258\Desktop\demon1\%s.csv"%item['page'],sep=',',index=None)

    #    return item

"""class ImgsPipLine(ImagesPipeline):

    def get_media_requests(self, item, info):

        # print(item['moviepicture'])

        # print((item['moviename'][1][0]))

        x=0

        for url in item['moviepicture']:

            yield scrapy.Request(url=url,meta={'item':item["moviename"][x]})

            x+=1

    #

    # # # 返回图片名称即可

    def file_path(self, request, response=None, info=None, *, item=None):

        itemd=request.meta['item'][0]

        return 'full//%s.jpg' % (itemd)

    #

    def item_completed(self, results, item, info):

        image_paths = [x['path'] for ok, x in results if ok]

        return image_paths

    # def item_completed(self, results, item, info):

    #    return results

    #    pass

    """

相关文章

网友评论

      本文标题:scrapy爬取豆瓣并到数据库

      本文链接:https://www.haomeiwen.com/subject/plamkdtx.html