Python新闻爬虫

作者: 薛落花随泪绽放 | 来源:发表于2017-10-29 16:17 被阅读0次

    新闻爬虫编写

    # 爬取腾讯新闻首页所有新闻内容
    
    1、爬取新闻首页
    2、得到各新闻链接
    3、爬取新闻链接
    4、寻找有没有frame
    5、若有,抓取frame下对应网页内容
    6、若没有,直接抓取当前页面
    # Fetch one Tencent news article page, sending a browser User-Agent
    # so the request is not rejected as a bot.
    import urllib.request

    url = "https://news.qq.com/a/20171028/001835.htm"
    # Header format: a ("User-Agent", <concrete agent string>) tuple.
    ua_header = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
    page_opener = urllib.request.build_opener()
    page_opener.addheaders = [ua_header]
    # Download and decode the page body, skipping undecodable bytes.
    data = page_opener.open(url).read().decode("utf-8", "ignore")
    # data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
    # NOTE(review): bare expression — notebook residue; shows the page size
    # interactively but has no effect when run as a script.
    len(data)
    
    
    # Crawl the 163.com news front page: collect all article links, then save
    # each article locally, following a <frame> redirect when one is present.
    import urllib.request
    import re

    url = "http://news.163.com/"
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # FIX: the opener was built but never used — install it globally so every
    # urlopen()/urlretrieve() below actually sends the User-Agent header.
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode("UTF-8", "ignore")
    pat1 = '<a target="_blank" href="(.*?)"'
    alllink = re.compile(pat1).findall(data)
    # Hoisted out of the loop: compile the frame pattern once.
    pat2 = "<frame src=(.*?)>"
    framepat = re.compile(pat2)
    for i in range(0, len(alllink)):
        thislink = alllink[i]
        try:
            # FIX: decode as "gbk" — a strict superset of "gb2312", so pages
            # declaring either charset decode correctly.
            thispage = urllib.request.urlopen(thislink).read().decode("gbk", "ignore")
            isframe = framepat.findall(thispage)
            if len(isframe) == 0:
                # No frame: save the page itself.
                print(i)
                urllib.request.urlretrieve(thislink, "E:/python/python爬虫/data/" + str(i) + ".html")
            else:
                # Frame found: fetch the framed URL instead.
                # BUG FIX: the original referenced an undefined name `flink`
                # (NameError); extract it from the regex match, stripping any
                # surrounding quotes the src attribute may carry.
                flink = isframe[0].strip('"\'')
                urllib.request.urlretrieve(flink, "E:/python/python爬虫/data/" + str(i) + ".html")
        except Exception as e:
            # Robustness: one dead or malformed link must not abort the crawl.
            print("link " + str(i) + " failed: " + str(e))
    
    # CSDN blog-post crawler: scrape the CSDN front page for article links
    # and download each article to a local .html file.
    import urllib.request
    import re

    url = "http://blog.csdn.net/"
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install globally so urlopen() AND urlretrieve() both send the UA header.
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    # NOTE(review): this pattern is tied to CSDN's 2017 front-page markup and
    # silently yields no links if the site layout changes.
    pat = '<h3  class="csdn-tracking-statistics" data-mod="popu_430" data-poputype="feed"  data-feed-show="false"  data-dsm="post"><a href="(.*?)"'
    alllink = re.compile(pat).findall(data)
    # print(alllink)
    for i in range(0, len(alllink)):
        localpath = "E:\\python\\python爬虫\\rst\\" + str(i) + ".html"
        thislink = alllink[i]
        try:
            urllib.request.urlretrieve(thislink, filename=localpath)
            print("当前文章(第" + str(i) + "篇) 爬取成功了!")
        except Exception as e:
            # Robustness fix: one failed download no longer aborts the whole
            # loop; report it and move on to the next article.
            print("第" + str(i) + "篇下载失败: " + str(e))
    

    相关文章

      网友评论

        本文标题:Python新闻爬虫

        本文链接:https://www.haomeiwen.com/subject/ddlypxtx.html