美文网首页
糗事百科简单爬虫

糗事百科简单爬虫

作者: 咸鱼佬 | 来源:发表于2017-11-17 15:29 被阅读4次

    菜鸡局


    用到的库:

    1. BeautifulSoup(解析 HTML 页面)
    2. urllib.request(发送 HTTP 请求)
    3. pymysql(把数据写入 MySQL)

    数据库

    mysql
    表结构

    -- Single-column table; the crawler inserts one scraped joke per row.
    CREATE TABLE `joke` (
        content VARCHAR(2048)  -- joke text, at most 2048 characters
    );
    

    连接数据库(保存为 Python01.py,下面的爬虫脚本通过 import Python01 调用其中的 insert_db)

    import pymysql.cursors
    
    
    def insert_db(data):
        """Insert scraped jokes into the ``joke`` table of the ``qsbk`` database.

        Opens a fresh connection to the local MySQL server, bulk-inserts the
        rows with a parameterized statement, commits, and closes the
        connection again.

        Args:
            data: Sequence of parameter sets for ``cursor.executemany``; each
                item supplies the value for the single ``content`` placeholder.
                An empty sequence is a no-op.

        Returns:
            None.
        """
        # Nothing to store: do not open a connection at all.
        if not data:
            return

        connect = pymysql.Connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='',
            db='qsbk',
            charset='utf8')
        try:
            # The cursor is closed by the ``with`` block even if the insert fails.
            with connect.cursor() as cursor:
                sql = "insert into joke(content) VALUES (%s)"
                cursor.executemany(sql, data)
            connect.commit()
        finally:
            # Always release the connection, including on errors.
            connect.close()
    

    获取数据

    from bs4 import BeautifulSoup
    import urllib.request
    import Python01
    import threading
    import time
    
    
    # path = os.getcwd() + '/' + 'data.txt'
    
    
    # def save_date(str):
    #     str = str + '\n\r'
    #     f = open(r'C:\python\HelloWorld\data.txt', 'ab')
    #     f.write(str.encode('utf-8'))
    #     f.close()
    
    class GetDataThread(threading.Thread):
        """Worker thread that scrapes one "hot" page and stores its jokes.

        Args:
            page: Page number appended to the hot-list URL.
        """

        def __init__(self, page):
            super().__init__()
            self.page = page

        def run(self):
            """Thread entry point: scrape the page given at construction."""
            self.get_data()

        def get_data(self):
            """Download the page, extract the joke texts and save them.

            Every ``<div class="content">`` element is read; texts of 20
            characters or fewer, and texts containing the "read full text"
            marker, are skipped. The remaining texts are printed and passed
            to ``Python01.insert_db`` in one call.

            If the server answers with an HTTP error, the page number is
            printed and the method returns without parsing or storing anything.
            """
            url = 'https://www.qiushibaike.com/hot/page/' + str(self.page)

            headers = {
                'User-Agent': r'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3163.100 Safari/537.30'}

            request = urllib.request.Request(url, headers=headers)

            try:
                # ``with`` closes the HTTP response once the body has been read.
                with urllib.request.urlopen(request) as response:
                    html = response.read()
            except urllib.request.HTTPError:
                print(self.page)
                # There is no response to parse; stop here instead of
                # falling through to code that needs the page body.
                return

            bsObje = BeautifulSoup(html, 'html.parser')

            store = bsObje.find_all('div', {'class': 'content'})

            data_list = []

            for tem in store:
                data = tem.get_text().strip()
                if 20 < len(data) and '查看全文' not in data:
                    print(data)
                    data_list.append(data)

            Python01.insert_db(data_list)
    
    
    # Start the workers for pages 1-7 first, then wait for all of them.
    # Joining right after each start would make the downloads run one after
    # another, so the threads would give no overlap.
    threads = [GetDataThread(i) for i in range(1, 8)]
    for my_thread in threads:
        my_thread.start()
    for my_thread in threads:
        my_thread.join()
    
    image

    相关文章

      网友评论

          本文标题:糗事百科简单爬虫

          本文链接:https://www.haomeiwen.com/subject/hmdjvxtx.html