美文网首页
Python 爬虫 - 爬取数据存储到MySQL

Python 爬虫 - 爬取数据存储到MySQL

作者: 莫名ypc | 来源:发表于2019-01-07 10:22 被阅读0次
    import pymysql
    
    
    # 获取数据库连接
    def get_db_con():
        """Open and return a PyMySQL connection to the local ``spider`` database.

        Returns:
            An open connection object; the caller is responsible for closing it
            (see ``close_connection``).
        """
        # PyMySQL 1.0+ removed positional connect() arguments, so pass every
        # setting by keyword.  'utf8mb4' is used instead of 'utf8' because
        # MySQL's 'utf8' is a 3-byte subset that cannot store all of Unicode.
        con = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='123456',
            database='spider',
            charset='utf8mb4',
        )
        return con
    
    
    # 获取游标
    def get_cursor(con):
        """Return a fresh cursor created from the open connection *con*."""
        cursor = con.cursor()
        return cursor
    
    
    # 关闭连接
    def close_connection(con):
        """Close the database connection *con*, releasing its resources."""
        con.close()
    
    
    # 执行插入语句
    # Execute the insert statement for one movie record.
    def insert_movie(one_movie_dict, con, cursor):
        """Insert a single movie row into the ``maoyan_movie`` table.

        Args:
            one_movie_dict: dict with 'title', 'actor' and 'release_time' keys.
            con: open database connection; the transaction is committed here.
            cursor: cursor obtained from *con*.
        """
        # Use driver-side %s placeholders instead of Python string formatting:
        # the old "'%s'" interpolation broke on values containing quotes and
        # was vulnerable to SQL injection.
        sql = "insert into maoyan_movie (title, actor, release_time) values (%s, %s, %s)"
        params = (one_movie_dict['title'],
                  one_movie_dict['actor'],
                  one_movie_dict['release_time'])
        print(sql)
        cursor.execute(sql, params)
        con.commit()
    
    
    def main():
        """Demo entry point: insert one hard-coded movie row, then clean up."""
        con = get_db_con()
        cursor = get_cursor(con)
        m_dict = {
            'title': '霸王别姬',
            'actor': '张国荣',
            'release_time': '2019-01-03',
        }
        try:
            insert_movie(m_dict, con, cursor)
        finally:
            # Close the cursor as well as the connection; the original leaked
            # the cursor by only closing the connection.
            cursor.close()
            close_connection(con)
    
    
    if __name__ == '__main__':
        main()
    
    
    import json
    from time import sleep
    
    import requests
    from lxml import etree
    
    
    def get_all_page(pages=306, delay=2):
        """Fetch each page of the Douban group explore listing and parse it.

        Args:
            pages: number of 30-item pages to fetch (default 306, the count
                the original hard-coded).
            delay: seconds to sleep before each request, to throttle politely.
        """
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
        }
        base_url = 'https://www.douban.com/group/explore?start='
        for i in range(pages):
            sleep(delay)  # be polite: pause between requests
            url = base_url + str(i * 30)  # each listing page holds 30 entries
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                text = response.content.decode('utf-8')
                print(f'第{i + 1}页')
                parse_html(text)
    
    
    def strips(l):
        """Return a new list with surrounding whitespace stripped from each item of *l*."""
        # A comprehension replaces the original append loop (same behaviour).
        return [item.strip() for item in l]
    
    
    # 保存json数据
    # Persist scraped records as JSON.
    def save_json(result_list):
        """Serialize *result_list* to JSON and append it to ``douban.json``.

        NOTE(review): the file is opened in append mode, so successive calls
        write multiple whole JSON documents into one file — confirm that is
        the intended output format.
        """
        serialized = json.dumps(result_list, ensure_ascii=False)
        with open('douban.json', 'a', encoding='utf-8') as out:
            out.write(serialized)
    
    
    def parse_html(html):
        """Parse one Douban explore listing page into a list of item dicts.

        Args:
            html: decoded HTML text of the listing page.

        Returns:
            list[dict]: one dict per channel item with keys title, url, likes,
            come_from, pubtime, content and image ('' when a field is absent).
        """
        def _first_or_empty(nodes):
            # xpath() returns a list; take the first match or '' when absent.
            return nodes[0] if nodes else ''

        # BUG FIX: the original indexed xpath(...)[0] at extraction time (an
        # IndexError when the node list is empty) and then indexed the result
        # again in its `if` branch, keeping only the FIRST CHARACTER of title
        # and content; its empty-content branch also cleared `image` instead
        # of `content`.  All fields now go through _first_or_empty.
        result_list = []
        etree_html = etree.HTML(html)
        channel_result = etree_html.xpath('//div[@class="channel-item"]')
        for channel in channel_result:
            item = {
                'title': _first_or_empty(channel.xpath('./div[@class="bd"]/h3/a/text()')),
                'url': _first_or_empty(channel.xpath('./div[@class="bd"]/h3/a/@href')),
                'likes': _first_or_empty(channel.xpath('./div[@class="likes"]/text()')),
                'come_from': _first_or_empty(channel.xpath('./div[@class="bd"]/div[@class="source"]/span[@class="from"]/a/text()')),
                'pubtime': _first_or_empty(channel.xpath('./div[@class="bd"]/div[@class="source"]/span[@class="pubtime"]/text()')),
                'content': _first_or_empty(channel.xpath('./div[@class="bd"]/div[@class="block"]/p/text()')),
                'image': _first_or_empty(channel.xpath('./div[@class="bd"]/div[@class="block"]/div[@class="pic"]/div[@class="pic-wrap"]/img/@src')),
            }

            # 插入数据库 (insert into the database) -- left as a TODO upstream.

            result_list.append(item)
        # save_json(result_list)
        return result_list
    
    
    def main():
        """Entry point for the Douban explore crawler."""
        get_all_page()
    
    
    if __name__ == '__main__':
        main()
    

    相关文章

      网友评论

          本文标题:Python 爬虫 - 爬取数据存储到MySQL

          本文链接:https://www.haomeiwen.com/subject/iwbyrqtx.html