# Scrape the Maoyan film board and store the results in MySQL.
from urllib import request
import re
import time
import random
import pymysql
class MaoyanSpider(object):
    """Scrape the Maoyan board pages and store each film in MySQL.

    The spider fetches ``https://maoyan.com/board/4`` for a fixed series of
    ``offset`` values, extracts (title, star line, release date) for every
    film with a regular expression and inserts the rows into the ``filmset``
    table of the ``maoyandb`` database.
    """

    # One match per film entry: (title, star line, release-time line).
    # re.S lets '.' cross the newlines that separate the tags in the page.
    _FILM_PATTERN = re.compile(
        r'<a href.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        """Set up request headers, the page counter and the MySQL connection."""
        self.base_url = 'https://maoyan.com/board/4?offset={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
        }
        # 1-based number of the page currently being scraped (progress output).
        self.page = 1
        # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
        # host/user/password/database positionally.
        # NOTE(review): credentials are hard-coded; consider loading them
        # from configuration or the environment.
        self.db = pymysql.connect(
            host='localhost',
            user='root',
            password='123456',
            database='maoyandb',
            charset='utf8',
        )
        self.cursor = self.db.cursor()

    def get_pages(self, url):
        """Download *url*, decode it as UTF-8 and hand the HTML to parse_page."""
        req = request.Request(url, headers=self.headers)
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        self.parse_page(html)

    @classmethod
    def _extract_films(cls, html):
        """Return the raw (title, star, release time) tuples found in *html*."""
        return cls._FILM_PATTERN.findall(html)

    @staticmethod
    def _build_rows(results):
        """Turn raw regex tuples into cleaned rows ready for insertion.

        Whitespace is stripped from every field; the release-time field keeps
        only characters 5..14 of the stripped text, i.e. it drops a
        five-character label and keeps at most ten characters after it.
        """
        return [
            (title.strip(), star.strip(), released.strip()[5:15])
            for title, star, released in results
        ]

    def parse_page(self, html):
        """Extract all films from *html* and store them via write_sql."""
        self.write_sql(self._extract_films(html))

    def write_sql(self, results):
        """Insert the films in *results* into the ``filmset`` table.

        *results* is an iterable of (title, star, release time) string
        triples. The batch is committed as one transaction; on failure the
        transaction is rolled back and the exception is re-raised.
        """
        rows = self._build_rows(results)
        if not rows:
            # Nothing matched on this page, so there is nothing to insert.
            return
        ins = 'insert into filmset values(%s,%s,%s)'
        try:
            self.cursor.executemany(ins, rows)
            self.db.commit()
        except Exception:
            # Do not leave a half-applied batch pending on the connection.
            self.db.rollback()
            raise

    def main(self):
        """Scrape offsets 0, 10, 20, 30 and 40, then close the DB handles."""
        try:
            # range() yields the successive values of the offset query parameter.
            for offset in range(0, 41, 10):
                url = self.base_url.format(offset)
                self.get_pages(url)
                print('第%d页爬取成功' % self.page)
                self.page += 1
                # Random pause between requests.
                time.sleep(random.randint(1, 2))
        finally:
            # Release the cursor and connection even when a page fails.
            self.cursor.close()
            self.db.close()
if __name__ == '__main__':
    # Script entry point: build the spider and run the full scrape.
    MaoyanSpider().main()
# User comments