美文网首页
爬取百度百科词条写入数据库

爬取百度百科词条写入数据库

作者: Wei_Lai | 来源:发表于2018-12-20 17:35 被阅读0次
    from bs4 import BeautifulSoup
    from urllib.request import urlopen
    import re
    import random
    
    import pymysql.cursors   # 数据库
    
    
    # Random-walk crawler: starting from one Baidu Baike entry, repeatedly
    # follow a randomly chosen entry link and store each visited page's
    # title and URL in the `urls` table of the `baikeurl` MySQL database.
    base_url = "https://baike.baidu.com"
    # Stack of visited entry paths; the last element is the page to fetch next.
    his = ["/item/%E5%8F%B2%E8%AE%B0"]

    # Only follow links whose entry name consists solely of percent-encoded
    # bytes. Compiled once here instead of on every iteration.
    item_link = re.compile(r"/item/(%.{2})+$")

    # A single connection is reused for the whole crawl rather than
    # reconnecting for every page.
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='password',
                                 db='baikeurl',
                                 charset='utf8mb4')
    try:
        for i in range(1000):
            if not his:
                # Every page back to the starting entry was a dead end, so
                # there is nothing left to visit (his[-1] would raise).
                break

            url = base_url + his[-1]

            # Close the HTTP response as soon as the body has been read.
            with urlopen(url) as response:
                html = response.read().decode('utf-8')
            soup = BeautifulSoup(html, features='lxml')

            heading = soup.find('h1')
            if heading is None:
                # Without an <h1> there is no title to store; step back to
                # the previous page instead of failing on None.get_text().
                print(i, 'no <h1> found, stepping back', '    url: ', url)
                his.pop()
                continue

            title = heading.get_text()
            print(i, title, '    url: ', url)

            # Find valid entry links on the current page.
            sub_urls = soup.find_all("a", {"target": "_blank", "href": item_link})

            if sub_urls:
                his.append(random.choice(sub_urls)['href'])
            else:
                # No valid sub link found: backtrack to the previous page.
                his.pop()

            try:
                with connection.cursor() as cursor:
                    # Parameterized insert; the driver escapes both values.
                    sql = 'insert into `urls`(`urlname`,`urlhref`)values(%s,%s)'
                    cursor.execute(sql, (title, url))
                connection.commit()
            except pymysql.MySQLError as err:
                # Storing is best-effort: report the failure and keep
                # crawling, but do not hide unrelated errors or Ctrl-C.
                print('failed to store', url, ':', err)
    finally:
        connection.close()
    

    读取数据库

    import pymysql.cursors
    
    # Run a select against the `urls` table of the `baikeurl` database and
    # print the number that execute() returns for it.
    query = 'select `urlname` , `urlhref` from `urls` where `id` is not null'

    db_settings = {
        'host': 'localhost',
        'user': 'root',
        'password': 'password',
        'db': 'baikeurl',
        'charset': 'utf8mb4',
    }
    connection = pymysql.connect(**db_settings)
    try:
        # The cursor is released when the with-block exits.
        with connection.cursor() as cur:
            # execute() hands back a row count, which is all this script prints.
            row_count = cur.execute(query)
            print(row_count)
            # The rows themselves are not fetched here; cur.fetchall()
            # would return them if they are needed.
    finally:
        # Always release the connection, even if the query failed.
        connection.close()
    
    

    相关文章

      网友评论

          本文标题:爬取百度百科词条写入数据库

          本文链接:https://www.haomeiwen.com/subject/vjcukqtx.html