import scrapy


class ThomeSpider(scrapy.Spider):
    name = 'thome'
    allowed_domains = ['ouchn.cn']
    # Formative assessment task 1
    # start_urls = ['http://hebei.ouchn.cn/mod/quiz/report.php?id=49058&mode=overview']
    # Formative assessment task 2
    # start_urls = ['http://hebei.ouchn.cn/mod/quiz/report.php?id=49059&mode=overview']
    # Formative assessment task 3
    start_urls = ['http://hebei.ouchn.cn/mod/quiz/report.php?id=49060&mode=overview']

    # The quiz report page requires a logged-in session, so cookies are attached to the request
    def start_requests(self):
        # Raw Cookie header copied from a logged-in browser session
        cookies = 'CheckCode=bAg0XLruL2w=; MoodleSession=7hmruvl32410n80b4n4ntfte9f; username=qhdzhengwei; UserName=qhdzhengwei'
        # Scrapy expects a dict; split each pair on the first '=' only so values containing '=' are kept intact
        cookies = dict(pair.split('=', 1) for pair in cookies.split('; '))
        yield scrapy.Request(
            self.start_urls[0],
            callback=self.parse,
            cookies=cookies
        )

    def parse(self, response):
        # Collect the links titled "回顾试答" ("review attempt") from the quiz report table
        urls = response.xpath('//tbody//td/a[@title="回顾试答"]/@href').getall()
        for url in urls:
            # urljoin() handles both absolute and relative hrefs
            yield scrapy.Request(url=response.urljoin(url), callback=self.get_data)

    def get_data(self, response):
        # Question texts sit in <div class="qtext"> blocks on the review page
        exams = response.xpath('//div[@class="qtext"]/p')
        for exam in exams:
            exam = ''.join(exam.xpath('.//text()').getall())  # merge the question text into a single line
            # Skip section headings (single-choice / multiple-choice / true-or-false)
            if exam in ['一、单选题', '二、多选题', '三、判断题']:
                continue
            print(exam)
            with open('exams.txt', 'a+', encoding='utf-8') as f:
                f.write(exam + '\n')
        # The correct answers sit in <div class="rightanswer"> blocks
        answers = response.xpath('//div[@class="rightanswer"]')
        for answer in answers:
            answer = ''.join(answer.xpath('.//text()').getall())
            print(answer)
            with open('answers.txt', 'a+', encoding='utf-8') as f:
                f.write(answer + '\n')
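
# A minimal runner sketch so the spider can also be executed as a plain script
# instead of via `scrapy crawl thome` inside a Scrapy project.
# Assumption: the spider is used standalone, so only the settings passed below apply.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        'ROBOTSTXT_OBEY': False,  # the report page sits behind a login, so robots.txt is not relevant here
        'LOG_LEVEL': 'INFO',
    })
    process.crawl(ThomeSpider)
    process.start()  # blocks until the crawl finishes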