美文网首页
2019-07-30

2019-07-30

作者: 超哥__ | 来源:发表于2019-07-30 21:24 被阅读0次
    #! /usr/bin/env python
    # # -*- coding: utf-8 -*-
    
    import datetime
    import gzip
    import hashlib
    import io
    import json
    import logging
    import os
    import random
    import re
    import socket
    import ssl
    import StringIO
    import sys
    import threading
    import time

    from lxml import etree

    import threadpool
    
    
    # Default text encoding for the scraper (not referenced below — kept for
    # compatibility with other scripts that import this module).
    defencode = 'utf-8'
    # Disable TLS certificate verification so https pages load even when
    # Python cannot validate the server's certificate chain.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    # urllib lives in different modules on Python 2 vs Python 3; alias the
    # right one to urllib_ so the rest of the script is version-agnostic.
    if sys.version_info[0] == 2:
        from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
        import urllib2 as urllib_
    elif sys.version_info[0] == 3:
        from http.server import BaseHTTPRequestHandler, HTTPServer
        import urllib.request as urllib_
    
    
    # Append INFO-and-above records to serv.log with file/line context.
    logging.basicConfig(level=logging.INFO, filename='serv.log', filemode='a',
        format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    
    
    def FLog(msg):
        """Write *msg* to the log file and echo it, timestamped, to stdout."""
        logging.info(msg)
        now_text = datetime.datetime.now().strftime('%c')
        print(now_text + '\t' + msg)
    
    
    class NoRedirection(urllib_.HTTPRedirectHandler):
        """Redirect handler that stops 301/302/303 redirects.

        urllib invokes ``http_error_30x(req, fp, code, msg, headers)`` on a
        bound instance; returning the response object (``fp``) makes the
        opener hand back the raw 3xx response instead of following the
        Location header.

        NOTE(fix): the original methods omitted ``self``, so every parameter
        was shifted by one and ``return code`` actually returned ``fp``.  The
        signatures below are corrected; observable behavior is unchanged.
        """
        def http_error_301(self, req, fp, code, msg, headers):
            return fp
        def http_error_302(self, req, fp, code, msg, headers):
            return fp
        def http_error_303(self, req, fp, code, msg, headers):
            return fp
    
    
    def httpRequest(url, headers=None, postdata=None, proxy=None):
        """Fetch *url* and return ``(status_code, body_bytes)``.

        headers  -- optional dict of request headers (default: minimal UA).
        postdata -- optional request body; when given the request is a POST.
        proxy    -- optional 'host:port' applied to both http and https.

        Redirects are not followed (NoRedirection), gzip-encoded bodies are
        decompressed, and any failure yields the sentinel ``(600, None)``.
        """
        if headers is None:
            headers = {
                'User-Agent':'Mozilla/5.0'
            }
        try:
            if proxy is None:
                opener = urllib_.build_opener(NoRedirection)
            else:
                opener = urllib_.build_opener(NoRedirection, urllib_.ProxyHandler({
                    "http": proxy,
                    "https": proxy,
                }))
            resp = opener.open(urllib_.Request(url, headers=headers, data=postdata))
            data = resp.read()
            if 'content-encoding' in resp.headers and resp.headers['content-encoding'] == 'gzip':
                # Server compressed the body; inflate it.  io.BytesIO works on
                # both Python 2 and 3, unlike the Python-2-only StringIO
                # module the original used (which also cannot hold bytes).
                gz = gzip.GzipFile(fileobj=io.BytesIO(data))
                data = gz.read()
                gz.close()
            return resp.code, data
        except Exception:
            # Best-effort fetch: collapse any network/parse error into a
            # sentinel status that callers simply skip over.
            return 600, None
    
    
    def downFile(url, path, headers=None, postdata=None, proxy=None):
        """Download *url* and save the body to *path*; silently skip on failure."""
        status, body = httpRequest(url=url, headers=headers, postdata=postdata, proxy=proxy)
        if status != 200 or body is None:
            return
        with open(path, 'wb') as f:
            f.write(body)
    
    
    proxy = None # e.g. '127.0.0.1:1080' -- set a proxy when this IP gets banned
    
    if __name__ == '__main__':
        # Crawl qiushibaike articles by sequential id and archive each one
        # under ./qiushi/<id>/.  An existing directory marks an id as done,
        # which makes the crawl resumable after interruption.
        rootdir = './qiushi'
        for i in range(110006543, 130000000):
            itemdir = rootdir + '/%d' % i
            print('handle %d' % i)
            if os.path.exists(itemdir):
                # Already fetched on a previous run; skip.
                continue
            url = 'https://m2.qiushibaike.com/article/%d' % i
            code, data = httpRequest(url=url, proxy=proxy, headers={
                'Accept-Encoding': 'gzip, deflate',
                'User-Agent': 'Mozilla/5.0'
            })
            if code != 200:
                continue
            try:
                # Keep only responses that parse as JSON and contain
                # /article/content; anything else is treated as a miss.
                jdata = json.loads(data)
                jdata['article']['content']
            except Exception as e:
                continue
            os.makedirs(itemdir)
            contentfile = itemdir + '/data'
            # Save the raw JSON payload as ./qiushi/<id>/data.
            with open(contentfile, 'wb') as f:
                f.write(data)
            print('handle %d done' % i)
            if 'high_url' not in jdata['article']:
                continue
            # Download the high-quality media file next to the JSON payload.
            high_url = jdata['article']['high_url']
            highurlfile = itemdir + '/' + os.path.basename(high_url)
            downFile(high_url, highurlfile)
    
    
    
    #! /usr/bin/env python
    # # -*- coding: utf-8 -*-
    
    import datetime
    import gzip
    import hashlib
    import json
    import logging
    from lxml import etree
    import os
    import random
    import re
    import socket
    import ssl
    import StringIO
    import sys
    import threading
    import threadpool
    import time
    
    
    # Default text encoding for the scraper (not referenced below).
    defencode = 'utf-8'
    
    # https uses TLS certificates; Python sometimes fails to validate them.
    # This override skips certificate verification so https responses can
    # still be fetched normally.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    # The urllib APIs live in different modules on Python 2 vs Python 3.
    if sys.version_info[0] == 2:
        from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
        import urllib2 as urllib_
    elif sys.version_info[0] == 3:
        from http.server import BaseHTTPRequestHandler, HTTPServer
        import urllib.request as urllib_
    
    # Log to serv.log (append mode) with file/line context.
    logging.basicConfig(level=logging.INFO, filename='serv.log', filemode='a',
        format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    
    
    def FLog(msg):
        """Log *msg* at INFO level and print it prefixed with the current time."""
        logging.info(msg)
        prefix = datetime.datetime.now().strftime('%c') + '\t'
        print(prefix + msg)
    
    # Prevents automatic redirect following.
    class NoRedirection(urllib_.HTTPRedirectHandler):
        """Redirect handler that stops 301/302/303 redirects.

        urllib invokes ``http_error_30x(req, fp, code, msg, headers)`` on a
        bound instance; returning the response object (``fp``) makes the
        opener return the raw 3xx response instead of following Location.

        NOTE(fix): the original methods omitted ``self``, shifting every
        parameter by one so that ``return code`` actually returned ``fp``.
        The signatures below are corrected; behavior is unchanged.
        """
        def http_error_301(self, req, fp, code, msg, headers):
            return fp
        def http_error_302(self, req, fp, code, msg, headers):
            return fp
        def http_error_303(self, req, fp, code, msg, headers):
            return fp
    
    '''
    Anatomy of an http/https exchange:

    Request side -- three ways to pass parameters:
    1. Query string: http://www.baidu.com/omn/20190810/20190810A0ND3I00.html?a=1&b=2&aaa=
    2. Header, e.g. User:lichao
    3. Request body

    GET /omn/20190810/20190810A0ND3I00.html?usr=lichao&pass=lihao HTTP/1.1(\r\n)
    User-Agent: ...(\r\n)
    Cookie:"asl=1,sa=1"
    (\r\n\r\n)

    POST /omn/20190810/20190810A0ND3I00.html HTTP/1.1(\r\n)
    User-Agent: ...(\r\n)
    (\r\n\r\n)
    body.....

    Response side:
    HTTP/1.1 200 OK(\r\n)
    Header1:value1(\r\n)
    Header2:value2
    Cookie:"JSESSIONID=aaghlajalggajlsjkdklflkjas"
    ...
    (\r\n\r\n)
    body.....
    '''
    
    def httpRequest(url, headers=None, postdata=None, proxy=None):
        """Fetch *url* and return ``(status_code, body_bytes)``.

        headers  -- optional dict of request headers (default: minimal UA).
        postdata -- optional request body; when given the request is a POST.
        proxy    -- optional 'host:port' applied to both http and https.

        Redirects are not followed (NoRedirection), gzip-encoded bodies are
        decompressed, and any failure yields the sentinel ``(600, None)``.
        """
        if headers is None:
            headers = {
                'User-Agent':'Mozilla/5.0'
            }
        try:
            if proxy is None:
                # Use urllib_.build_opener() instead to follow redirects.
                opener = urllib_.build_opener(NoRedirection)
            else:
                opener = urllib_.build_opener(NoRedirection, urllib_.ProxyHandler({
                    "http": proxy,
                    "https": proxy,
                }))
            # Build the request object, then issue it and read status + body.
            resp = opener.open(urllib_.Request(url, headers=headers, data=postdata))
            data = resp.read()
            if 'content-encoding' in resp.headers and resp.headers['content-encoding'] == 'gzip':
                # The response header says the body is gzip-compressed, so
                # inflate it.  io.BytesIO works on both Python 2 and 3,
                # unlike the Python-2-only StringIO module used originally.
                gz = gzip.GzipFile(fileobj=io.BytesIO(data))
                data = gz.read()
                gz.close()
            return resp.code, data
        except Exception:
            # Best-effort fetch: collapse any network/parse error into a
            # sentinel status that callers simply skip over.
            return 600, None
    
    
    def downFile(url, path, headers=None, postdata=None, proxy=None):
        """Fetch *url* via httpRequest and persist a successful body to *path*."""
        result = httpRequest(url=url, headers=headers, postdata=postdata, proxy=proxy)
        if result[0] == 200 and result[1] is not None:
            with open(path, 'wb') as out:
                out.write(result[1])
    
    proxy = None # e.g. '127.0.0.1:1080' -- set a proxy when this IP gets banned
    
    if __name__ == '__main__':
        # Crawl qiushibaike articles by sequential id and archive each one
        # under ./qiushi/<id>/.  An existing directory marks an id as done,
        # which makes the crawl resumable after interruption.
        rootdir = './qiushi'
        for i in range(110006540, 130000000):
            itemdir = rootdir + '/%d' % i
            print('handle %d' % i)
            if os.path.exists(itemdir):
                # Already fetched on a previous run; skip.
                continue
            url = 'https://m2.qiushibaike.com/article/%d' % i
            code, data = httpRequest(url=url, proxy=proxy, headers={
                'Accept-Encoding': 'gzip, deflate',
                'User-Agent': 'Mozilla/5.0'
            })
            if code != 200:
                continue
            try:
                jdata = json.loads(data)
                jdata['article']['content'] # read /article/content from the JSON
            except Exception as e:
                continue
            os.makedirs(itemdir) 
            contentfile = itemdir + '/data'
            # Save the raw JSON payload as ./qiushi/<id>/data.
            with open(contentfile, 'wb') as f:
                f.write(data)
            print('handle %d done' % i)
            if 'high_url' not in jdata['article']:
                continue
            # Download the high-quality media file next to the JSON payload.
            high_url = jdata['article']['high_url']
            highurlfile = itemdir + '/' + os.path.basename(high_url)
            downFile(high_url, highurlfile)
            # itemdir: ./qiushi/110006543   os.makedirs(itemdir)
            # highurlfile: ./qiushi/110006543/1.img
    

    相关文章

      网友评论

          本文标题:2019-07-30

          本文链接:https://www.haomeiwen.com/subject/ydborctx.html