
Using BeautifulSoup: A Small Crawler Example

Author: 小飞船1号 | Published 2020-05-07 15:41
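The script below fetches a list page of Beijing municipal government procurement announcements, follows each item's link to its detail page, extracts the title, publication time, and announcement body with BeautifulSoup, and writes the records into a MySQL table.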
    from mysql import connector
    from datetime import datetime
    from dateutil import parser
    import requests
    # BeautifulSoup lives in the bs4 package
    from bs4 import BeautifulSoup
    
    def load_data(url):
        """
        Send the request and return the page source of a list or detail page.
        """
        response = requests.get(url)
        # Implicitly returns None on any non-200 response
        if response.status_code == 200:
            return response.text
    
    def detail_data(html):
        """
        Extract the announcement body from a detail page.
        """
        # Create a BeautifulSoup object for the detail page
        d_bs = BeautifulSoup(html, "html.parser")
        # Grab the content div with a CSS attribute selector;
        # select_one returns the first match or None
        content = d_bs.select_one('div[style="width: 1105px;margin:0 auto"]')
        # Return the tag as an HTML string so it can be stored in a text column
        return str(content)
    
    # Fetch the tender announcement list
    def json_data(url):
        """
        Parse the page source of one list page into a list of records.
        """
        html = load_data(url)
        html_bs = BeautifulSoup(html, "html.parser")
        # Find every list item (avoid shadowing the built-in list)
        items = html_bs.find_all('li')
        newsinfo = []
        for l in items:
            newinfo = {}
            # Title
            newinfo["title"] = l.find('a').get_text()
            # Publication time of the announcement
            newinfo["ctime"] = parser.parse(l.find('span').get_text()).strftime("%Y-%m-%d %H:%M:%S")
            # Time the record was scraped
            newinfo["gtime"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            # The href is relative ("./..."), so rebuild it against the section root
            detail_url = 'http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/' + str(l.find('a').attrs['href']).replace("./", "")
            newinfo["url"] = detail_url
            # Page source of the detail page
            html = load_data(detail_url)
            # Announcement body
            newinfo["content"] = detail_data(html)
            newsinfo.append(newinfo)
        return newsinfo
    
    
    # Check whether a table already exists
    def tableExists(mycursor, name):
        # Parameterized LIKE pattern; the driver handles the quoting
        mycursor.execute("SHOW TABLES LIKE %s", (name,))
        return mycursor.fetchone()
    
    def mysql_data(url):
        conn = connector.connect(user='root', password='111111', database='book', use_unicode=True)
        cursor = conn.cursor()
        if tableExists(cursor, 'newinfo'):
            print("Table already exists; skipping creation")
        else:
            print("Creating table")
            create_sql = ("create table newinfo(id INT AUTO_INCREMENT PRIMARY KEY, "
                          "url varchar(255), title varchar(255), ctime datetime, "
                          "gtime datetime, content text)")
            cursor.execute(create_sql)
            print("Table created")

        # Scrape the list page and insert each record
        news = json_data(url)
        insert_sql = ("insert into newinfo (url, title, ctime, gtime, content) "
                      "values (%s, %s, %s, %s, %s)")
        for item in news:
            # Parameterized insert: the driver escapes quotes in the HTML
            # content, which would break a string-formatted statement
            cursor.execute(insert_sql, (item["url"], item["title"], item["ctime"],
                                        item["gtime"], item["content"]))
        # Commit the transaction
        conn.commit()
        cursor.close()
        conn.close()

    url = "http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/index_2.html"
    mysql_data(url)
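
For reference, here is a minimal standalone sketch of the CSS attribute selector used in detail_data, run against an inline HTML snippet (the snippet is made up for illustration; only the selector itself comes from the script above):

    from bs4 import BeautifulSoup

    sample = '<div style="width: 1105px;margin:0 auto"><p>Announcement body</p></div>'
    soup = BeautifulSoup(sample, "html.parser")
    # div[style="..."] matches on the exact attribute value, so the selector
    # must match the page's inline style character for character
    div = soup.select_one('div[style="width: 1105px;margin:0 auto"]')
    print(div.get_text())  # -> Announcement body

Because the match is exact, any change to the site's inline style (even extra whitespace) silently breaks the selector; matching on a class or id, when the page offers one, is more robust.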
    
    
    
    
