
Scraping Sogou articles with Python and saving the images to Qiniu Cloud

Author: app_developer | Published 2018-11-23 11:26

    For the method of saving images to Qiniu Cloud, see http://lastidea.net/?p=7. The full script is below.

    # -*- coding: UTF-8 -*-
    import urllib2
    from bs4 import BeautifulSoup
    import socket
    import requests
    import time
    import random
    from qiniu import Auth
    from qiniu import BucketManager
    
    
    # Save an image to Qiniu Cloud by having the bucket fetch it from a URL
    def qiniu(url, key):
        access_key = ""   # your access_key
        secret_key = ""   # your secret_key
        bucket_name = ""  # your bucket_name
    
        q = Auth(access_key, secret_key)
        bucket = BucketManager(q)
    
        ret, info = bucket.fetch(url, bucket_name, key)
        #assert ret['key'] == key
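
    # Example (hypothetical values): bucket.fetch() asks Qiniu to pull the remote URL
    # server-side into bucket_name under the given key, so nothing is downloaded locally:
    #   qiniu("http://img.example.com/cover.jpeg", "154294000000012345.jpeg")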
    
    def go():
        # set up proxy access
        url = 'http://www.xicidaili.com/nn/'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
        }
        ip_list = get_ip_list(url, headers=headers)
        proxies = get_random_ip(ip_list)
    
        enable_proxy = True
        proxy_handler = urllib2.ProxyHandler(proxies)
        null_proxy_handler = urllib2.ProxyHandler({})
        if enable_proxy:
            opener = urllib2.build_opener(proxy_handler)
        else:
            opener = urllib2.build_opener(null_proxy_handler)
        # install the opener globally so urllib2.urlopen uses it in either case
        urllib2.install_opener(opener)
    
        for i in [0, 1, 2, 4, 5, 6, 7, 8]:  # each i is a Sogou Weixin channel page (pc_i)
            url = "http://weixin.sogou.com/pcindex/pc/pc_"+str(i)+"/pc_"+str(i)+".html"
            #print url
            #http://weixin.sogou.com/pcindex/pc/pc_1/pc_1.html
            request = urllib2.Request(url)
            request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
            response = urllib2.urlopen(request, timeout=120)
            doc = response.read()
            soup = BeautifulSoup(doc, "html.parser")
            #print soup
            a = soup.find("a")
            href = a.get("href")
            img = soup.find("img")
            src = img.get("src")
            # print href;
            try:
                fileName = creatFileName("jpeg")

                # protocol-relative URLs ("//...") need an explicit scheme
                if not src.startswith("http"):
                    src = "http:" + src

                # fetch the cover image into Qiniu Cloud
                qiniu(src, fileName)
            except Exception:
                continue
            time.sleep(3)
            getContent(href, i, qiniu_server_url + fileName)
    
    def post(body=None):
        url = "http://test.lastidea.com/Admin/SystemArticle/add"  // your url
        #url = "http://localhost:8091/Admin/ArticleAdd/add"
    
        headers = {"Content-type": "application/x-www-form-urlencoded"}
    
        response = requests.post(url, data=body, headers=headers)
        #print response.text
    
    def getContent(url, sogouClassId, src):
        socket.setdefaulttimeout(100)
    
        #print "---------";
        request = urllib2.Request(url)
        request.add_header('User-Agent','Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        request.add_header('Referer','https://mp.weixin.qq.com/')
        response = urllib2.urlopen(request)
        doc = response.read()
        #print "---------";
        #print doc;
        soup = BeautifulSoup(doc, "html.parser")
    
        title = soup.find(id="activity-name")
    
        #print title
        imgs = soup.find(id="js_content").findAll("img")
        #print imgs
        for img in imgs:
            imgDataSrc = img.get('data-src')
            imgType = img.get('data-type')
            if imgDataSrc:
                if not imgDataSrc.startswith("http"):
                    imgDataSrc = "http:" + imgDataSrc

                fileName = creatFileName(imgType)
                count = 1
                while count <= 3:
                    try:
                        qiniu(imgDataSrc, fileName)
                        break
                    except socket.timeout:
                        err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                        print(err_info)
                        count += 1
    
                img['data-src'] = qiniu_server_url + fileName
                img['data-original'] = qiniu_server_url + fileName
                img['src'] = qiniu_server_url + "loading.gif"  # placeholder shown until the real image lazy-loads
                img['class'] = "lazy"
                #time.sleep(1)
    
        # assemble the POST body; map the Sogou channel index to a local article class id
        if sogouClassId == 1: articleClassId = 17   # Hot
        elif sogouClassId == 4: articleClassId = 16  # Gossip
        elif sogouClassId == 0: articleClassId = 10  # Funny
        elif sogouClassId == 8: articleClassId = 11  # Lifestyle
        elif sogouClassId == 7: articleClassId = 12  # Cars
        elif sogouClassId == 6: articleClassId = 13  # Finance
        elif sogouClassId == 5: articleClassId = 14  # Tech
        elif sogouClassId == 2: articleClassId = 15  # Health
    
    
        jsContent = soup.select("#activity-name, #js_content")
        jsContent = str(jsContent[0]) + str(jsContent[1])
    
        body = {
            "title" : title.getText().strip(),
            "articleClassId" : articleClassId,
            "img" : src,
            "content" : jsContent,
            "attr[]" : 1,
            "click" : random.randint(10000, 100000)
        }
        #print body
        
        post(body=body)
    
    # fetch a list of candidate proxy IPs (ip:port) from the page above
    def get_ip_list(url, headers):
        web_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(web_data.text, "html.parser")
        ips = soup.find_all('tr')
        ip_list = []
        for i in range(1, len(ips)):
            ip_info = ips[i]
            tds = ip_info.find_all('td')
            ip_list.append(tds[1].text + ':' + tds[2].text)
        return ip_list
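        # ip_list ends up like ['123.56.74.156:8080', '61.135.217.7:80'] (hypothetical entries)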
    
    # pick one proxy IP at random
    def get_random_ip(ip_list):
        proxy_list = []
        for ip in ip_list:
            proxy_list.append('http://' + ip)
        proxy_ip = random.choice(proxy_list)
        proxies = {'http': proxy_ip}
        return proxies
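        # proxies is shaped like {'http': 'http://123.56.74.156:8080'} (hypothetical
        # address), the mapping urllib2.ProxyHandler expects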
    
    
    def creatFileName(ext = "png"):
        return str(int(round(time.time() * 1000))) + str(random.randint(10000,99999)) + "." + str(ext)
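    # e.g. creatFileName("jpeg") -> "154294000000012345.jpeg": a millisecond timestamp
    # plus five random digits (hypothetical values)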
    
    # Qiniu Cloud public download (CDN) base URL; note the trailing slash, since
    # file names are concatenated directly onto it
    qiniu_server_url = "http://ph4xfr5l1.bkt.clouddn.com/"
    go()
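
    A note on error handling: the script ignores the result of bucket.fetch, so a failed
    fetch can still end up referenced in the posted article. Below is a minimal sketch
    (assuming the same qiniu Python SDK, whose fetch returns a (ret, info) pair) of
    checking the result before using the key; qiniu_checked is a hypothetical helper,
    not part of the original script:

    def qiniu_checked(url, key):
        # hypothetical variant of qiniu() above that reports success or failure
        access_key = ""   # your access_key
        secret_key = ""   # your secret_key
        bucket_name = ""  # your bucket_name

        q = Auth(access_key, secret_key)
        bucket = BucketManager(q)
        ret, info = bucket.fetch(url, bucket_name, key)

        # on success ret is a dict containing the stored key; info carries the
        # HTTP status of the fetch request
        if info.status_code == 200 and ret is not None and ret.get('key') == key:
            return True
        print('fetch failed: %s' % info)
        return False

    Callers such as go() could then skip an entry (or retry) when qiniu_checked returns
    False instead of posting a broken image link.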
    
