Scraping 综合新闻 (General News) Entries from the Bengbu University (蚌埠学院) Official Website

Author: 我的袜子都是洞 | Published 2018-10-16 13:32 | Read 24 times

The script below crawls the 综合新闻 (General News) listing pages on the Bengbu University site, http://www.bbc.edu.cn, page by page and extracts the link, title, and publication time of every news entry.


    import requests
    from bs4 import BeautifulSoup
    from requests.exceptions import RequestException
    import json
    
    def get_one_page(url):
        # Download a page and return its HTML text; return None on a request error or non-200 status
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
            }
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None
    
    def page_parser(html):
        soup = BeautifulSoup(html, 'lxml')
        # Each news entry is rendered inside a <td height="24"> cell
        for td in soup.find_all(name='td', attrs={'height': 24}):
            # Article link (the href is relative, so prefix the site root)
            href = 'http://www.bbc.edu.cn' + td.find(name='td').a.attrs['href']
            # Article title
            title = td.find(name='td').a.font.string
            # Publication time
            postTime = td.find(class_='postTime').string
            yield {
                'href': href,
                'title': title,
                'postTime': postTime
            }
    
    def get_pages(url):
        html = get_one_page(url)
        soup = BeautifulSoup(html, 'lxml')
        # The '进入尾页' (go to last page) link carries the total page count in its href
        pages = soup.find(name='a', attrs={'title': '进入尾页'}).attrs['href']
        # The href follows the same .../i/<page>/list.htm pattern used in main(),
        # so index 8 of the split path is the total page count
        pages = pages.split('/')[8]
        if pages:
            return pages
        return None
    
    def write_to_file(content):
        # Append each item as one JSON line (JSON Lines format), keeping Chinese characters readable
        with open('result.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    
    def main(num=0):
        # Read the total page count from the first listing page
        pages = get_pages('http://www.bbc.edu.cn/s/21/t/267/p/22/i/1/list.htm')
        pages = int(pages)
        # A non-zero num caps how many pages are crawled
        if num:
            pages = num
        for page in range(1, pages + 1):
            url = 'http://www.bbc.edu.cn/s/21/t/267/p/22/i/' + str(page) + '/list.htm'
            html = get_one_page(url)
            for item in page_parser(html):
                print(item)
                write_to_file(item)
        print('Scraped ' + str(pages) + ' pages of news items')
    
    if __name__ == '__main__':
        main(20)
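
As a quick check of the output format, here is a minimal sketch that reads the JSON Lines file produced by write_to_file back into Python dictionaries. The load_results helper name and the result.txt path are illustrative assumptions, not part of the original script:

    import json
    
    def load_results(path='result.txt'):
        # write_to_file() appends one JSON object per line, so parse the file line by line
        items = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    items.append(json.loads(line))
        return items
    
    if __name__ == '__main__':
        results = load_results()
        print('Loaded ' + str(len(results)) + ' news items')
        for item in results[:5]:
            print(item['postTime'], item['title'], item['href'])

Each recovered item carries the same 'href', 'title', and 'postTime' keys that page_parser yields.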
    
