
Python Web Crawlers

Author: ZHQIAN | Published 2017-11-09 18:35

    1. News crawler in practice (crawl every article linked from the Sina News homepage)
    Approach:
    1. Crawl the news homepage
    2. Extract the individual article links
    3. Fetch each article link
    4. Check whether the page uses a frame
    5. If it does, fetch the page the frame points to
    6. If it does not, save the current page directly
    Example program:

    import urllib.request
    import re

    url = "http://news.sina.com.cn/"
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    # Collect every outbound article link on the homepage
    pat1 = '<a target="_blank" href="(http://.*?)"'
    alllink = re.compile(pat1).findall(data)
    for i in range(0, len(alllink)):
        try:
            thislink = alllink[i]
            thispage = urllib.request.urlopen(thislink).read().decode("utf-8", "ignore")
            # Does the article page embed its content in a frame?
            pat2 = '<frame src="(.*?)"'
            isframe = re.compile(pat2).findall(thispage)
            if len(isframe) == 0:
                # No frame: save the page itself
                urllib.request.urlretrieve(thislink, "D:/test/" + str(i) + ".html")
            else:
                # Frame found: save the page the frame points to
                flink = isframe[0]
                urllib.request.urlretrieve(flink, "D:/test/" + str(i) + ".html")
        except Exception as err:
            pass
    

    2. Crawling jokes from Qiushibaike (糗事百科):

    import urllib.request
    import re
    # Set a browser User-Agent so the request looks like it comes from a regular browser
    headers = ("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    fh = open("D:/test/123.txt","w")
    for i in range(0,13):
        thisurl = "https://www.qiushibaike.com/8hr/page/"+str(i+1)+"/"
        data = urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
        pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
        rst = re.compile(pat,re.S).findall(data)
        print(len(rst))
        for j in range(0,len(rst)):
            print(rst[j])
            print("-----------------------------------")
            fh.write(rst[j]+"\n")
    fh.close()
    

    3. Building a user-agent pool (crawling Qiushibaike while rotating browser user agents):

    import urllib.request
    import re
    import random   # random module, used to pick a user agent at random
    uapools = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
        ]
    def ua(uapools):
        thisua = random.choice(uapools)
        print(thisua)
        headers = ("User-Agent",thisua)
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
    for i in range(0,13):
        ua(uapools)
        thisurl = "https://www.qiushibaike.com/8hr/page/"+str(i+1)+"/"
        data = urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
        pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
        rst = re.compile(pat,re.S).findall(data)
        print(len(rst))
        for j in range(0,len(rst)):
            print(rst[j])
            print("-----------------------------------")
    

    4. IP proxies
    Overview: an IP proxy lets the crawler fetch the target site through a proxy IP instead of its own address.
    Building an IP proxy connection in practice:

    import urllib.request
    ip = "200.122.209.10:8080"
    # Route http requests through the proxy
    proxy = urllib.request.ProxyHandler({"http": ip})
    opener = urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    url = "http://www.baidu.com"
    data1 = urllib.request.urlopen(url).read()
    data = data1.decode("utf-8","ignore")
    print(len(data))
    fh = open("D:/test/ip_baidu.html","wb")
    fh.write(data1)
    fh.close()
    

    First way to build an IP proxy pool (suitable when the proxy IPs are stable)

    import random
    import urllib.request
    ippools = [
        "141.196.142.8:8080",
        "119.40.106.69:8081",
        "40.132.242.226:3128",
    ]
    def ip(ippools):
        thisip = random.choice(ippools)
        print(thisip)
        proxy = urllib.request.ProxyHandler({"http": thisip})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
    for i in range(0,5):
        try:
            ip(ippools)
            url = "http://www.baidu.com"
            data1 = urllib.request.urlopen(url).read()
            data = data1.decode("utf-8", "ignore")
            print(len(data))
            fh = open("D:/test/ip_baidu_"+str(i)+".html", "wb")
            fh.write(data1)
            fh.close()
        except Exception as err:
            print(err)
    

    Second way to build an IP proxy pool (fetching a fresh proxy IP from an API on each request, suitable when the proxy IPs are unstable)
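
    A minimal sketch of the interface-call approach, assuming a proxy vendor whose extraction API returns one "ip:port" string in plain text per request; the API URL below is only a placeholder and must be replaced with the vendor's real interface.

    import urllib.request

    # Placeholder API: assumed to return one "ip:port" string per request
    apiurl = "http://api.your-proxy-vendor.example/getip?num=1"

    def api():
        # Fetch a fresh proxy through a clean opener (bypassing any proxy
        # installed earlier), then install it as the global opener
        thisip = urllib.request.build_opener().open(apiurl).read().decode("utf-8", "ignore").strip()
        print(thisip)
        proxy = urllib.request.ProxyHandler({"http": thisip})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)

    for i in range(0, 5):
        try:
            api()
            url = "http://www.baidu.com"
            data1 = urllib.request.urlopen(url).read()
            print(len(data1))
            fh = open("D:/test/ip_baidu_api_" + str(i) + ".html", "wb")
            fh.write(data1)
            fh.close()
        except Exception as err:
            print(err)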

    
    
    
    
    

    Taobao product image crawler in practice

    import urllib.request
    import re
    import random

    keyname = "连衣裙"                      # search keyword (Chinese for "dress")
    key = urllib.request.quote(keyname)     # URL-encode the keyword
    uapools = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
        ]
    def ua(uapools):
        thisua = random.choice(uapools)
        print(thisua)
        headers = ("User-Agent", thisua)
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
    for i in range(1, 101):
        # Each result page advances the "s" offset by 44 items
        url = "https://s.taobao.com/search?q=" + key + "&s=" + str((i - 1) * 44)
        ua(uapools)
        data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
        pat = '"pic_url":"//(.*?)"'
        imglist = re.compile(pat).findall(data)
        for j in range(0, len(imglist)):
            thisimg = imglist[j]
            thisimgurl = "http://" + thisimg
            # Separate page and image indices so filenames cannot collide
            localfile = "D:/test/test1/" + str(i) + "_" + str(j) + ".jpg"
            urllib.request.urlretrieve(thisimgurl, filename=localfile)
    
    

    Using a user-agent pool and an IP proxy at the same time

    # Use an IP proxy and a user-agent pool at the same time
    import urllib.request
    import random

    # User-agent pool and IP proxy pool
    uapools = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ;  QIHU 360EE)"]
    ippools = ["61.135.217.7", "118.114.77.47", "111.224.104.161"]
    url = 'http://www.baidu.com'
    # Build a request carrying a random user agent and install a random proxy
    def ua(uapools, ippools):
        req = urllib.request.Request(url)
        req.add_header("User-Agent", random.choice(uapools))
        proxy = urllib.request.ProxyHandler({"http": random.choice(ippools)})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # Install the opener globally
        urllib.request.install_opener(opener)
        return req
    if __name__ == '__main__':
        for i in range(20):
            req = ua(uapools, ippools)
            try:
                data = urllib.request.urlopen(req)
                print(len(data.read()))
                print(data.getcode())
            except Exception as err:
                print(err)
    

    WeChat article crawler in practice
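
    A minimal sketch, assuming Sogou's WeChat search (weixin.sogou.com) is used as the entry point for finding article links; the query parameters and the link regex below are assumptions about that site's page markup around 2017 and will likely need updating.

    import urllib.request
    import re

    key = urllib.request.quote("物联网")     # search keyword, URL-encoded
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36")]
    urllib.request.install_opener(opener)
    for page in range(1, 3):
        # Assumed URL format of Sogou's WeChat article search
        listurl = "http://weixin.sogou.com/weixin?type=2&query=" + key + "&page=" + str(page)
        data = urllib.request.urlopen(listurl).read().decode("utf-8", "ignore")
        # Hypothetical pattern for article links on the result page
        pat = 'href="(http://mp.weixin.qq.com/s.*?)"'
        links = re.compile(pat, re.S).findall(data)
        for link in links:
            thisurl = link.replace("amp;", "")   # turn HTML-escaped "&amp;" back into "&"
            try:
                article = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
                print(len(article))
            except Exception as err:
                print(err)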

    
    
    

    Tencent Video comment crawler in practice

    import urllib.request
    import re
    vid = "1453179977"             # comment-thread (article) id of the video
    cid = "6310753745936743097"    # comment id to start from (paging cursor)
    num = "20"                     # number of comments per request
    headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
             "Content-Type":"application/javascript"
    }
    opener = urllib.request.build_opener()
    headall = []
    for key, value in headers.items():
        item = (key, value)
        headall.append(item)
    opener.addheaders = headall
    urllib.request.install_opener(opener)
    for j in range(0, 2):
        print("page " + str(j))
        thisurl = "https://coral.qq.com/article/" + vid + "/comment?commentid=" + cid + "&reqnum=" + num
        data = urllib.request.urlopen(thisurl).read().decode("utf-8")
        contentpat = '"content":"(.*?)"'
        contentall = re.compile(contentpat, re.S).findall(data)
        # "last" is the cursor used to request the next page of comments
        lastpat = '"last":"(.*?)"'
        cid = re.compile(lastpat, re.S).findall(data)[0]
        for i in range(0, len(contentall)):
            try:
                # The API returns \uXXXX escapes; eval turns them back into Chinese text
                print("content:" + eval('u"' + contentall[i] + '"'))
                print("----------")
            except Exception as err:
                print(err)
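
    A side note on the eval() trick above: it executes scraped text as Python source, which is fragile and unsafe if a comment contains quotes. The same \uXXXX escapes can be decoded with the unicode_escape codec instead; a small drop-in replacement for that print line might look like this:

    # Equivalent to eval('u"' + contentall[i] + '"'), without executing scraped text
    text = contentall[i].encode("ascii", "ignore").decode("unicode_escape")
    print("content:" + text)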
    

    Crawling movie download links from 电影天堂 (dytt8.net)

    import urllib.request
    import re
    import random
    uapools = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
        ]
    def ua(uapools):
        thisua = random.choice(uapools)
        headers = ("User-Agent",thisua)
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
    fh = open("D:/test1/test.txt","w")
    for i in range(1, 15):   # list pages on the site are numbered from 1
        ua(uapools)
        url = "http://www.dytt8.net/html/tv/oumeitv/list_9_"+str(i)+".html"
        data = urllib.request.urlopen(url).read().decode("gb2312", "ignore")
        # Links to the individual movie detail pages
        pat = '<b>.*?<a.*?href="(.*?)".*?class="ulink">'
        rst = re.compile(pat, re.S).findall(data)
        for j in range(0, len(rst)):
            try:
                thisurl = "http://www.dytt8.net"+rst[j]
                thisdata = urllib.request.urlopen(thisurl).read().decode("gb2312", "ignore")
                # Download links sit in the table cell with background color #fdfddf
                thispat = '"#fdfddf"><a href="(.*?)">'
                thisrst = re.compile(thispat, re.S).findall(thisdata)
                for z in range(0,len(thisrst)):
                    fh.write(str(thisrst[z])+"\n--------------------------------------------------------\n")
                #print(thisrst)
            except Exception as err:
                print(err)
    fh.close()
    

    电影天堂 (all categories)

    import urllib.request
    import re
    import random
    uapools = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
        ]
    def ua(uapools):
        thisua = random.choice(uapools)
        headers = ("User-Agent",thisua)
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
    fh = open("D:/test1/dianying.txt","w")
    # Category paths and their list ids; the first six categories live on
    # ygdy8.net, the last two on dytt8.net
    cats = ["gndy/dyzz","gndy/jddy","tv/rihantv","zongyi2013","2009zongyi","dongman","tv/hytv","gndy/rihan"]
    catids = ["23","63","8","99","89","16","71","6"]
    for this in range(0, 8):
        for i in range(1, 100):
            ua(uapools)
            if this < 6:
                url = "http://www.ygdy8.net/html/" + cats[this] + "/list_" + catids[this] + "_" + str(i) + ".html"
            else:
                url = "http://www.dytt8.net/html/" + cats[this] + "/list_" + catids[this] + "_" + str(i) + ".html"
            try:
                data = urllib.request.urlopen(url).read().decode("gb2312", "ignore")
            except Exception as err:
                # Not every category has 99 list pages; skip the missing ones
                print(err)
                continue
            pat = '<b>.*?<a.*?href="(.*?)".*?class="ulink">'
            rst = re.compile(pat, re.S).findall(data)
    
            for j in range(0, len(rst)):
                try:
                    if(this<6):
                        thisurl = "http://www.ygdy8.net" + rst[j]
                    else:
                        thisurl = "http://www.dytt8.net" + rst[j]
                    thisdata = urllib.request.urlopen(thisurl).read().decode("gb2312", "ignore")
                    #print(len(thisdata))
                    thispat = '"#fdfddf"><a href="(.*?)">'
                    thisrst = re.compile(thispat, re.S).findall(thisdata)
                    for z in range(0, len(thisrst)):
                        fh.write(str(thisrst[z])+"\n")
                        print(len(thisrst))
                except Exception as err:
                    print(err)
    fh.close()
    
