Using BeautifulSoup: A Little Crawler Example

Author: 小飞船1号 | Published 2020-05-07 15:41

from mysql import connector
from datetime import datetime
from dateutil import parser
import json
import requests
# To use BeautifulSoup, import it from the bs4 package
from bs4 import BeautifulSoup
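# Third-party dependencies implied by the imports above (install via pip;
# exact versions are not specified in the original post):
# requests, beautifulsoup4, python-dateutil, mysql-connector-python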

def load_data(url):
    """
    Send a request and return the page source of the listing page.
    Returns None if the response status is not HTTP 200.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
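# Minimal usage sketch (assumes the listing page below is reachable):
# html = load_data("http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/index_2.html")
# print(html[:200] if html else "request failed")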

def detail_data(html):
    # Create a BeautifulSoup object from the detail-page source
    d_bs = BeautifulSoup(html, "html.parser")
    # Use a CSS attribute selector to pull out the content container div;
    # select_one returns None instead of raising if the div is missing
    content = d_bs.select_one('div[style="width: 1105px;margin:0 auto"]')
    return content
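# Quick check of the selector (the detail URL here is hypothetical; real
# detail links are built in json_data below):
# html = load_data("http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/some_detail.html")
# print(detail_data(html).get_text(strip=True)[:100])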

# Scrape the procurement announcements
def json_data(url):
    """
    Parse the page source of one listing page into a list of dicts.
    """
    html = load_data(url)
    html_bs = BeautifulSoup(html, "html.parser")
    # Find every <li> entry in the announcement list
    items = html_bs.find_all('li')
    newsinfo = []
    for l in items:
        newinfo = {}
        # Title
        newinfo["title"] = l.find('a').get_text()
        # Publication time, normalized to a MySQL-friendly format
        newinfo["ctime"] = parser.parse(l.find('span').get_text()).strftime("%Y-%m-%d %H:%M:%S")
        # Time the record was scraped
        newinfo["gtime"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Absolute URL of the detail page (hrefs on the site are relative, "./...")
        detail_url = 'http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/' + str(l.find('a').attrs['href']).replace("./", "")
        newinfo["url"] = detail_url
        # Fetch the detail page source
        html = load_data(detail_url)
        # Extract the detail content, stored as an HTML string
        newinfo["content"] = str(detail_data(html))
        newsinfo.append(newinfo)
    return newsinfo
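# The json import above is handy for eyeballing the scraped records before
# they go into MySQL, e.g.:
# news = json_data("http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/index_2.html")
# print(json.dumps(news, ensure_ascii=False, indent=2))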


# Check whether a table already exists in the current database
def tableExists(mycursor, name):
    # Parameterized query: the driver quotes the table name safely
    mycursor.execute('SHOW TABLES LIKE %s', (name,))
    return mycursor.fetchone()
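# fetchone() returns a row if the table exists and None otherwise, so the
# result can be used directly as a truth value:
# if tableExists(cursor, 'newinfo'):
#     print("table found")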

def mysql_data(url):
    conn = connector.connect(user='root', password='111111', database='book', use_unicode=True)
    cursor = conn.cursor()
    if tableExists(cursor, 'newinfo'):
        print("Table already exists, skipping creation")
    else:
        print("Creating table")
        create_sql = ("create table newinfo(id INT AUTO_INCREMENT PRIMARY KEY,"
                      "url varchar(255), title varchar(255),"
                      "ctime datetime, gtime datetime, content text)")
        cursor.execute(create_sql)
        print("Table created")

    # Scrape the listing page and insert every record
    news = json_data(url)
    insert_sql = ("insert into newinfo (url,title,ctime,gtime,content) "
                  "values (%s,%s,%s,%s,%s)")
    for item in news:
        # Parameterized insert: the driver escapes any quotes in the scraped content
        cursor.execute(insert_sql, (item["url"], item["title"], item["ctime"],
                                    item["gtime"], item["content"]))
    # Commit the transaction, then clean up
    conn.commit()
    cursor.close()
    conn.close()

url="http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/index_2.html"
print(mysql_data(url))


