美文网首页
爬取猫眼电影存入mysql

爬取猫眼电影存入mysql

作者: writ | 来源:发表于2019-07-13 18:13 被阅读0次

    使用 urllib 和正则表达式爬取猫眼电影榜单（片名、主演、上映时间），并通过 pymysql 存入 MySQL 的 filmset 表：

    from urllib import request
    import re
    import time
    import random
    import pymysql
    
    
    class MaoyanSpider(object):
        """Crawl the Maoyan board pages and store each film in MySQL.

        For every board page the spider downloads the HTML, extracts
        ``(title, stars, release time)`` triples with a regular expression and
        inserts them into the ``filmset`` table.

        Attributes:
            base_url: URL template; ``{}`` receives the ``offset`` query value.
            headers: HTTP headers sent with every request (browser User-Agent).
            page: 1-based counter of the page currently being crawled.
            db: Open pymysql connection.
            cursor: Cursor created from ``db``.
        """

        # Compiled once for the whole class instead of on every parse_page call.
        # re.S lets ``.`` span newlines because one film entry covers many lines.
        _FILM_PATTERN = re.compile(
            r'<a href.*?title="(.*?)".*?<p class="star">(.*?)</p>'
            r'.*?<p class="releasetime">(.*?)</p>',
            re.S,
        )
        _INSERT_SQL = 'insert into filmset values(%s,%s,%s)'
        # Seconds to wait on the network before giving up instead of hanging.
        _REQUEST_TIMEOUT = 10

        def __init__(self, host='localhost', user='root', password='123456',
                     database='maoyandb'):
            """Set up request settings and open the database connection.

            Args:
                host: MySQL server host.
                user: MySQL user name.
                password: MySQL password.
                database: Name of the database that holds ``filmset``.

            The defaults are the values the script originally hard-coded, so
            ``MaoyanSpider()`` behaves as before.
            """
            self.base_url = 'https://maoyan.com/board/4?offset={}'
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
            }
            self.page = 1
            # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
            # positional connection parameters.
            self.db = pymysql.connect(
                host=host,
                user=user,
                password=password,
                database=database,
                charset='utf8',
            )
            self.cursor = self.db.cursor()

        @staticmethod
        def _to_row(film):
            """Turn one regex match into the row inserted into ``filmset``.

            Whitespace is stripped from every field. The release-time field is
            sliced to characters 5..14, which drops a 5-character label prefix
            and keeps at most 10 characters (e.g. ``1993-01-01``).
            """
            title, stars, release = film
            return [title.strip(), stars.strip(), release.strip()[5:15]]

        def get_pages(self, url):
            """Download ``url`` and hand the decoded HTML to ``parse_page``.

            Raises:
                urllib.error.URLError: If the request fails or times out.
            """
            req = request.Request(url, headers=self.headers)
            # The context manager closes the HTTP response even if read() fails.
            with request.urlopen(req, timeout=self._REQUEST_TIMEOUT) as res:
                html = res.read().decode('utf-8')
            self.parse_page(html)

        def parse_page(self, html):
            """Extract every film triple from ``html`` and store them."""
            self.write_sql(self._FILM_PATTERN.findall(html))

        def write_sql(self, results):
            """Insert the films in ``results`` into the ``filmset`` table.

            Args:
                results: Iterable of ``(title, stars, release_time)`` string
                    triples as produced by the page regex.

            Nothing is sent to the database when ``results`` is empty. If the
            insert fails the transaction is rolled back and the error re-raised.
            """
            data_list = [self._to_row(film) for film in results]
            if not data_list:
                return
            try:
                self.cursor.executemany(self._INSERT_SQL, data_list)
                self.db.commit()
            except Exception:
                # Do not leave a half-written batch pending on the connection.
                self.db.rollback()
                raise

        def main(self):
            """Crawl offsets 0, 10, ..., 40 and always release the DB handles."""
            try:
                # range() generates the successive values of the ``offset``
                # query parameter, one per board page.
                for offset in range(0, 41, 10):
                    url = self.base_url.format(str(offset))
                    self.get_pages(url)
                    print('第%d页爬取成功' % self.page)
                    self.page += 1
                    # Random 1-2 second pause between requests.
                    time.sleep(random.randint(1, 2))
            finally:
                # Close the cursor and connection even when a page fails.
                try:
                    self.cursor.close()
                finally:
                    self.db.close()
    
    
    if __name__ == '__main__':
        # Script entry point: build the spider and run the whole crawl.
        MaoyanSpider().main()
    

    相关文章

      网友评论

          本文标题:爬取猫眼电影存入mysql

          本文链接:https://www.haomeiwen.com/subject/llvmkctx.html