Spider code
import scrapy
from itemloaders.processors import MapCompose
from douban.items import DoubanItem
class DoubanmovieSpider(scrapy.Spider):
    name = "doubanmovie"
    allowed_domains = ["douban.com"]
    # One request per Top-250 page; range(3, 4) fetches only start=75 here.
    start_urls = [f'https://movie.douban.com/top250?start={i * 25}&filter=' for i in range(3, 4)]
    def parse(self, response):
        item = DoubanItem()
        item['page'] = response.url.split('start=')[1].split('&')[0]  # page offset from the URL
        # Each entry's info <p> yields a credits line ('导演: ... 主演: ...') and
        # an info line ('1994 / 美国 / 犯罪 剧情'); sort them by first character.
        lines = MapCompose(str.strip)(response.xpath('//div[@class="bd"]/p/text()').getall())
        info_lines = []    # start with a digit: year / country / genre
        credit_lines = []  # director / cast credits
        for line in lines:
            if line == '':
                continue
            if line[0].isdigit():
                info_lines.append(line)
            else:
                credit_lines.append(line)
        # split(':')[1] ends with the literal '主演' marker, hence the [:-2]
        directors = [line.split(':')[1][:-2].strip('\xa0') for line in credit_lines]
        actors = [line.split(':')[-1].strip('\xa0') for line in credit_lines]
        item['dirctor'] = directors
        item['actor'] = actors
        item['release_date'] = [line.split('\xa0/\xa0')[0] for line in info_lines]
        item['release_country'] = [line.split('\xa0/\xa0')[1] for line in info_lines]
        item['movietype'] = [line.split('\xa0/\xa0')[2] for line in info_lines]
        item['movie_quote'] = response.xpath('//p[@class="quote"]/span/text()').getall()
        # Title links: the first <span> is the main title, the rest are alternates.
        strip_nbsp = MapCompose(lambda s: s.strip('\xa0'))
        clean_other = MapCompose(lambda s: s.strip('\xa0/\xa0'), lambda s: s.replace('/', ','))
        links, names, other_names = [], [], []
        for a in response.xpath('//div[@class="hd"]/a'):
            links.append(strip_nbsp(a.xpath('@href').getall()))
            names.append(strip_nbsp(a.xpath('./span/text()').get()))
            other_names.append(clean_other(a.xpath('./span/text()').getall()[1:]))
        item['othermovie'] = other_names
        item['wangyeliangjie'] = links
        item['moviename'] = names
        # The star block mixes numeric scores with 'NNNN人评价' counters; both
        # start with a digit, but only the counter ends with '价'.
        evaluators = []
        scores = []
        for text in response.xpath("//div[@class='star']//text()").getall():
            if text and text[0].isdigit():
                if text[-1] == '价':
                    evaluators.append(text)
                else:
                    scores.append(text)
        item['score'] = scores
        item['evaluator'] = evaluators
        # All <img> attributes on the page; the trailing 5 belong to site icons.
        pic_names = []
        pic_links = []
        for attr in response.xpath("//img/@*").getall()[:-5]:
            if attr == '' or attr[0].isdigit():
                continue
            elif attr.startswith('h'):  # http(s) URLs are the poster links
                pic_links.append(attr)
            else:
                pic_names.append(attr)
        # item['movie_picname'] = pic_names
        # item['moviepicture'] = pic_links
        for link in links:
            yield scrapy.Request(url=link[0], callback=self.shuchu)
        # parse() is a generator once it yields, so the item must be yielded
        # too: a bare 'return item' here would silently discard it.
        yield item
    def shuchu(self, response):
        # Detail-page callback; for now it only marks that the page loaded.
        print('*' * 10)
"""moviename response.xpath('//div[@class="hd"]/a/span[1]/text()').getall()
othermoviename response.xpath('//div[@class="hd"]/a/span[2]/text()').getall()
response.xpath('//div[@class="hd"]/a').getall()[1]
在进行分类导演 response.xpath('//div[@class="bd"]/p/text()').get(0)
' 导演: 弗兰克·德拉邦特 Frank Darabont主演: 蒂姆·罗宾斯 Tim Robbins /...'
一栏
quote
response.xpath('//p[@class="quote"]/span/text()').getall()
图片和连接//img/@*[2]
"""
    # Note: the response passed to callbacks is an HtmlResponse object
    # (response.url, response.status, response.meta, response.headers ...).
"""start_requests
该方法必须返回一个可迭代对象(iterable)。该对象包含了spider用于抓取的第一个Request。
当spider起订抓取并且未指定url时,该方法被调用。当指定了url时,make_requests_from_url()将被调用来创建request对象。该方法仅仅会被scrapy调用一次,因此您可以将其实现为生成器。
该方法的默认实现是使用start_urls的url生成request。
如果您想要修改最初抓取某个网站的request对象,您可以重写(override)该方法。例如,如果您需要在启动时以POST登录某个网站,你可以这么写:
————————————————
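A sketch of such an override, modelled on the example in the Scrapy docs (the URL, form fields, and credentials are placeholders, not part of this project):

import scrapy

class LoginSpider(scrapy.Spider):
    name = 'login_example'  # hypothetical spider for illustration only

    def start_requests(self):
        # POST the login form before crawling anything else.
        return [scrapy.FormRequest('http://www.example.com/login',
                                   formdata={'user': 'john', 'pass': 'secret'},
                                   callback=self.logged_in)]

    def logged_in(self, response):
        # From here, extract links and yield further Requests as usual.
        pass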
Item code
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.item import Item, Field
class DoubanItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    page = Field()             # Top-250 page offset from the URL
    moviepicture = Field()     # poster image URLs
    movie_picname = Field()    # poster image names
    moviename = Field()
    wangyeliangjie = Field()   # detail-page links
    othermovie = Field()       # alternative titles
    dirctor = Field()          # (sic) spelling kept; the spider and pipeline both use it
    actor = Field()
    score = Field()
    release_date = Field()
    release_country = Field()
    movietype = Field()
    evaluator = Field()        # 'NNNN人评价' rating counts
    movie_quote = Field()
Pipeline code
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
# import pandas as pd  # only needed by the commented CSV pipeline below
# import scrapy        # only needed by the disabled ImgsPipLine sketch below
# from scrapy.pipelines.images import ImagesPipeline
class DoubanPipeline:
    def __init__(self):
        # Open the MySQL connection; charset='utf8' is needed for Chinese text.
        self.conn = pymysql.connect(host='localhost', user='root', password='123qwe',
                                    database='bjpowernode', charset='utf8')
        # Create the cursor.
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # One table per Top-250 page; backticks because the offset is numeric.
        table = 'page_%s' % item['page']
        try:
            self.cursor.execute('DROP TABLE IF EXISTS `%s`' % table)
            create_sql = '''CREATE TABLE `%s`(moviename TEXT, wangyeliangjie TEXT,
                othermovie TEXT, dirctor TEXT, score TEXT, release_date TEXT,
                release_country TEXT, movietype TEXT, evaluator TEXT,
                movie_quote TEXT)''' % table
            self.cursor.execute(create_sql)
            print("Table created.")
        except pymysql.Error as error:
            print("Table creation failed: " + str(error))
            self.conn.rollback()
        insert_sql = ('INSERT INTO `%s` (moviename, wangyeliangjie, othermovie,'
                      ' dirctor, score, release_date, release_country, movietype,'
                      ' evaluator, movie_quote)'
                      ' VALUES (%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s)') % table
        # The item fields are lists; stringify them to fit the TEXT columns.
        values = tuple(str(item[k]) for k in (
            'moviename', 'wangyeliangjie', 'othermovie', 'dirctor', 'score',
            'release_date', 'release_country', 'movietype', 'evaluator', 'movie_quote'))
        self.cursor.execute(insert_sql, values)
        self.conn.commit()  # nothing is persisted without an explicit commit
        return item

    def close_spider(self, spider):
        # Release the cursor and connection when the spider closes.
        self.cursor.close()
        self.conn.close()
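# To activate this pipeline it must be registered in settings.py; a minimal
# sketch (the priority 300 is arbitrary; the dotted path assumes the default
# douban project layout):
#   ITEM_PIPELINES = {
#       'douban.pipelines.DoubanPipeline': 300,
#   }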
# Abandoned openpyxl experiment (write rows to an Excel worksheet):
# def __init__(self):
#     self.wb = Workbook()      # instantiate a workbook
#     self.ws = self.wb.active  # activate the worksheet
#     self.ws.append(['wangyeliangjie'])  # header row
#
# Alternative: dump each page to a CSV file with pandas. The dict keys become
# the CSV column names, every column must have the same length, and
# index=None suppresses the row index (the default True writes it).
# def process_item(self, item, spider):
#     d = {'moviename': item['moviename'],
#          'wangyeliangjie': item['wangyeliangjie'],
#          'othermovie': item['othermovie'],
#          'dirctor': item['dirctor'],
#          'score': item['score'],
#          'release_date': item['release_date'],
#          'release_country': item['release_country'],
#          'movietype': item['movietype'],
#          'evaluator': item['evaluator'],
#          'movie_quote': item['movie_quote']}
#     dataframe = pd.DataFrame(pd.DataFrame.from_dict(d, orient='index').values.T,
#                              columns=list(d.keys()))
#     dataframe.to_csv(r"C:\Users\29258\Desktop\demon1\%s.csv" % item['page'],
#                      sep=',', index=None)
#     return item
"""class ImgsPipLine(ImagesPipeline):
def get_media_requests(self, item, info):
# print(item['moviepicture'])
# print((item['moviename'][1][0]))
x=0
for url in item['moviepicture']:
yield scrapy.Request(url=url,meta={'item':item["moviename"][x]})
x+=1
#
# # # 返回图片名称即可
def file_path(self, request, response=None, info=None, *, item=None):
itemd=request.meta['item'][0]
return 'full//%s.jpg' % (itemd)
#
def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
return image_paths
# def item_completed(self, results, item, info):
# return results
# pass
"""