美文网首页
2019-07-30

2019-07-30

作者: 超哥__ | 来源:发表于2019-07-30 21:24 被阅读0次
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import datetime
import gzip
import hashlib
import json
import logging
from lxml import etree
import os
import random
import re
import socket
import ssl
import StringIO
import sys
import threading
import threadpool
import time


# Default text encoding name; not referenced elsewhere in the visible code.
defencode = 'utf-8'
# SECURITY NOTE: swaps the default HTTPS context factory for the unverified
# one, which turns off TLS certificate verification for HTTPS requests made
# through the default context in this process.
ssl._create_default_https_context = ssl._create_unverified_context

# Python 2 and Python 3 ship the HTTP server classes and the urllib request
# API under different module names; bind them to common names here.
# NOTE(review): any other major version leaves `urllib_` undefined.
if sys.version_info[0] == 2:
    from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
    import urllib2 as urllib_
elif sys.version_info[0] == 3:
    from http.server import BaseHTTPRequestHandler, HTTPServer
    import urllib.request as urllib_


# Append INFO-and-above records to serv.log, stamped with time, source path
# and line number.
logging.basicConfig(level=logging.INFO, filename='serv.log', filemode='a',
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')


def FLog(msg):
    """Log *msg* at INFO level and echo it to stdout with a timestamp.

    Args:
        msg: The message to record. Any object is accepted; it is rendered
            with ``%s`` formatting, so non-string values (numbers,
            exceptions, ...) no longer raise ``TypeError`` when the
            timestamp prefix is added.
    """
    logging.info(msg)
    # Use formatting rather than ``+`` so non-string messages are accepted.
    print('%s\t%s' % (datetime.datetime.now().strftime('%c'), msg))


class NoRedirection(urllib_.HTTPRedirectHandler):
    """Redirect handler that hands 3xx responses back instead of following them.

    urllib invokes ``http_error_<code>(req, fp, code, msg, headers)`` on the
    handler instance and uses a non-None return value as the result of
    ``opener.open()``. Returning ``fp`` (the response object) therefore lets
    the caller inspect the redirect status and headers itself.
    """

    def _return_response(self, req, fp, code, msg, headers):
        """Return the redirect response unchanged.

        Args:
            req: The request that produced the redirect.
            fp: The response object for the redirect.
            code: The HTTP status code.
            msg: The HTTP reason phrase.
            headers: The response headers.

        Returns:
            ``fp``, so that it becomes the result of ``opener.open()``.
        """
        return fp

    # Cover every redirect status so that none of them falls through to the
    # base class, which would follow the redirect.
    http_error_301 = _return_response
    http_error_302 = _return_response
    http_error_303 = _return_response
    http_error_307 = _return_response
    http_error_308 = _return_response


def httpRequest(url, headers=None, postdata=None, proxy=None):
    """Issue an HTTP(S) request without following redirects.

    Args:
        url: Absolute URL to request.
        headers: Optional dict of request headers. Defaults to a minimal
            ``User-Agent`` header.
        postdata: Optional request body, passed to the request as ``data``;
            when None the request carries no body.
        proxy: Optional proxy address used for both http and https, or None
            for a direct connection.

    Returns:
        A ``(status_code, body)`` tuple. ``body`` is the response payload,
        decompressed when the server declares a ``gzip`` or ``deflate``
        content encoding. Any failure yields ``(600, None)``.
    """
    # Local imports keep this function self-contained; ``io.BytesIO`` exists
    # on both Python 2 and 3, unlike the Python-2-only ``StringIO`` module.
    import io
    import zlib

    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0'
        }
    handlers = [NoRedirection]
    if proxy is not None:
        handlers.append(urllib_.ProxyHandler({
            "http": proxy,
            "https": proxy,
        }))
    try:
        opener = urllib_.build_opener(*handlers)
        resp = opener.open(urllib_.Request(url, headers=headers, data=postdata))
        try:
            data = resp.read()
            status = resp.code
            encoding = (resp.headers.get('content-encoding') or '').strip().lower()
        finally:
            # Always release the connection, even if reading fails.
            resp.close()

        if encoding == 'gzip':
            with gzip.GzipFile(fileobj=io.BytesIO(data)) as gz:
                data = gz.read()
        elif encoding == 'deflate':
            try:
                data = zlib.decompress(data)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib header.
                data = zlib.decompress(data, -zlib.MAX_WBITS)
        return status, data
    except Exception as e:
        # Best-effort contract: callers only test for status 200, so every
        # failure maps to the sentinel 600. Record the cause at debug level.
        logging.debug('httpRequest failed for %s: %r', url, e)
        return 600, None


def downFile(url, path, headers=None, postdata=None, proxy=None):
    """Fetch *url* and write the response body to *path*.

    The file is only written when the request returns status 200 with a
    body; otherwise nothing is written. The remaining arguments are
    forwarded unchanged to ``httpRequest``.
    """
    status, body = httpRequest(url=url, headers=headers, postdata=postdata, proxy=proxy)
    if status != 200 or body is None:
        return
    with open(path, 'wb') as out:
        out.write(body)


proxy = None  # e.g. '127.0.0.1:1080'; switch to a proxy if this IP gets banned

if __name__ == '__main__':
    # Walk a fixed range of article ids. For each article that exists, save
    # its raw JSON to <rootdir>/<id>/data and, when the article carries a
    # 'high_url', download that file next to it.
    try:
        id_range = xrange  # Python 2: lazy, avoids building a ~20M element list
    except NameError:
        id_range = range  # Python 3: range is already lazy

    rootdir = './qiushi'
    request_headers = {
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/5.0'
    }
    for i in id_range(110006543, 130000000):
        itemdir = rootdir + '/%d' % i
        print('handle %d' % i)
        if os.path.exists(itemdir):
            # Already fetched on an earlier run.
            continue
        url = 'https://m2.qiushibaike.com/article/%d' % i
        code, data = httpRequest(url=url, proxy=proxy, headers=request_headers)
        if code != 200:
            continue
        try:
            jdata = json.loads(data)
            article = jdata['article']
            article['content']  # only checks that the article has content
        except (ValueError, KeyError, TypeError):
            # Not JSON, or JSON without the expected article/content shape.
            continue
        os.makedirs(itemdir)
        contentfile = itemdir + '/data'
        with open(contentfile, 'wb') as f:
            f.write(data)
        print('handle %d done' % i)
        high_url = article.get('high_url')
        if not high_url:
            # Missing, null or empty: there is nothing to download, and
            # os.path.basename(None) would otherwise abort the whole crawl.
            continue
        highurlfile = itemdir + '/' + os.path.basename(high_url)
        # Use the same proxy as the article request so both go out the same way.
        downFile(high_url, highurlfile, proxy=proxy)


#! /usr/bin/env python
# -*- coding: utf-8 -*-

import datetime
import gzip
import hashlib
import json
import logging
from lxml import etree
import os
import random
import re
import socket
import ssl
import StringIO
import sys
import threading
import threadpool
import time


# Default text encoding name; not referenced elsewhere in the visible code.
defencode = 'utf-8'

# HTTPS relies on TLS certificates, and Python sometimes fails to validate
# them. Installing the unverified context skips certificate verification so
# that HTTPS responses can still be fetched.
# SECURITY NOTE: this turns off certificate verification for HTTPS requests
# made through the default context in this process.
ssl._create_default_https_context = ssl._create_unverified_context

# The urllib APIs differ between Python 2 and Python 3, so bind them to
# common names here.
# NOTE(review): any other major version leaves `urllib_` undefined.
if sys.version_info[0] == 2:
    from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
    import urllib2 as urllib_
elif sys.version_info[0] == 3:
    from http.server import BaseHTTPRequestHandler, HTTPServer
    import urllib.request as urllib_

# Logging setup: append INFO-and-above records to serv.log, stamped with
# time, source path and line number.
logging.basicConfig(level=logging.INFO, filename='serv.log', filemode='a',
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')


def FLog(msg):
    """Log *msg* at INFO level and echo it to stdout with a timestamp.

    Args:
        msg: The message to record. Any object is accepted; it is rendered
            with ``%s`` formatting, so non-string values (numbers,
            exceptions, ...) no longer raise ``TypeError`` when the
            timestamp prefix is added.
    """
    logging.info(msg)
    # Use formatting rather than ``+`` so non-string messages are accepted.
    print('%s\t%s' % (datetime.datetime.now().strftime('%c'), msg))

# Handler used to switch off automatic redirect following.
class NoRedirection(urllib_.HTTPRedirectHandler):
    """Redirect handler that hands 3xx responses back instead of following them.

    urllib invokes ``http_error_<code>(req, fp, code, msg, headers)`` on the
    handler instance and uses a non-None return value as the result of
    ``opener.open()``. Returning ``fp`` (the response object) therefore lets
    the caller inspect the redirect status and headers itself.
    """

    def _return_response(self, req, fp, code, msg, headers):
        """Return the redirect response unchanged.

        Args:
            req: The request that produced the redirect.
            fp: The response object for the redirect.
            code: The HTTP status code.
            msg: The HTTP reason phrase.
            headers: The response headers.

        Returns:
            ``fp``, so that it becomes the result of ``opener.open()``.
        """
        return fp

    # Cover every redirect status so that none of them falls through to the
    # base class, which would follow the redirect.
    http_error_301 = _return_response
    http_error_302 = _return_response
    http_error_303 = _return_response
    http_error_307 = _return_response
    http_error_308 = _return_response

'''
Anatomy of an HTTP/HTTPS exchange.

Request:

Ways to pass parameters
1. In the query string: http://www.baidu.com/omn/20190810/20190810A0ND3I00.html?a=1&b=2&aaa=
2. In a request header, e.g. "User: lichao"
3. In the request body

GET /omn/20190810/20190810A0ND3I00.html?usr=lichao&pass=lihao HTTP/1.1(\r\n)
User-Agent: ...(\r\n)
Cookie: asl=1; sa=1(\r\n)
(\r\n)   <- an empty line ends the headers; a GET normally has no body

POST /omn/20190810/20190810A0ND3I00.html HTTP/1.1(\r\n)
User-Agent: ...(\r\n)
(\r\n)   <- an empty line ends the headers
body.....

Response:
HTTP/1.1 200 OK(\r\n)
Header1: value1(\r\n)
Header2: value2(\r\n)
Set-Cookie: JSESSIONID=aaghlajalggajlsjkdklflkjas(\r\n)
...
(\r\n)   <- an empty line ends the headers
body.....
'''

def httpRequest(url, headers=None, postdata=None, proxy=None):
    """Issue an HTTP(S) request without following redirects.

    Args:
        url: Absolute URL to request.
        headers: Optional dict of request headers. Defaults to a minimal
            ``User-Agent`` header.
        postdata: Optional request body, passed to the request as ``data``;
            when None the request carries no body.
        proxy: Optional proxy address used for both http and https, or None
            for a direct connection.

    Returns:
        A ``(status_code, body)`` tuple. ``body`` is the response payload,
        decompressed when the server declares a ``gzip`` or ``deflate``
        content encoding. Any failure yields ``(600, None)``.
    """
    # Local imports keep this function self-contained; ``io.BytesIO`` exists
    # on both Python 2 and 3, unlike the Python-2-only ``StringIO`` module.
    import io
    import zlib

    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0'
        }
    # To follow redirects automatically, build the opener without NoRedirection.
    handlers = [NoRedirection]
    if proxy is not None:
        handlers.append(urllib_.ProxyHandler({
            "http": proxy,
            "https": proxy,
        }))
    try:
        opener = urllib_.build_opener(*handlers)
        # Build the request and send it.
        resp = opener.open(urllib_.Request(url, headers=headers, data=postdata))
        try:
            data = resp.read()  # read the response body
            status = resp.code
            encoding = (resp.headers.get('content-encoding') or '').strip().lower()
        finally:
            # Always release the connection, even if reading fails.
            resp.close()

        if encoding == 'gzip':
            # The response headers declare gzip compression: inflate the body.
            with gzip.GzipFile(fileobj=io.BytesIO(data)) as gz:
                data = gz.read()
        elif encoding == 'deflate':
            try:
                data = zlib.decompress(data)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib header.
                data = zlib.decompress(data, -zlib.MAX_WBITS)
        return status, data
    except Exception as e:
        # Best-effort contract: callers only test for status 200, so every
        # failure maps to the sentinel 600. Record the cause at debug level.
        logging.debug('httpRequest failed for %s: %r', url, e)
        return 600, None


def downFile(url, path, headers=None, postdata=None, proxy=None):
    """Fetch *url* and write the response body to *path*.

    The file is only written when the request returns status 200 with a
    body; otherwise nothing is written. The remaining arguments are
    forwarded unchanged to ``httpRequest``.
    """
    status, body = httpRequest(url=url, headers=headers, postdata=postdata, proxy=proxy)
    if status != 200 or body is None:
        return
    with open(path, 'wb') as out:
        out.write(body)

proxy = None  # e.g. '127.0.0.1:1080'; switch to a proxy if this IP gets banned

if __name__ == '__main__':
    # Walk a fixed range of article ids. For each article that exists, save
    # its raw JSON to <rootdir>/<id>/data and, when the article carries a
    # 'high_url', download that file next to it.
    try:
        id_range = xrange  # Python 2: lazy, avoids building a ~20M element list
    except NameError:
        id_range = range  # Python 3: range is already lazy

    rootdir = './qiushi'
    request_headers = {
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/5.0'
    }
    for i in id_range(110006540, 130000000):
        itemdir = rootdir + '/%d' % i  # e.g. ./qiushi/110006543
        print('handle %d' % i)
        if os.path.exists(itemdir):
            # Already fetched on an earlier run.
            continue
        url = 'https://m2.qiushibaike.com/article/%d' % i
        code, data = httpRequest(url=url, proxy=proxy, headers=request_headers)
        if code != 200:
            continue
        try:
            jdata = json.loads(data)
            article = jdata['article']
            article['content']  # read /article/content from the JSON to check it exists
        except (ValueError, KeyError, TypeError):
            # Not JSON, or JSON without the expected article/content shape.
            continue
        os.makedirs(itemdir)
        contentfile = itemdir + '/data'
        with open(contentfile, 'wb') as f:
            f.write(data)
        print('handle %d done' % i)
        high_url = article.get('high_url')
        if not high_url:
            # Missing, null or empty: there is nothing to download, and
            # os.path.basename(None) would otherwise abort the whole crawl.
            continue
        # e.g. ./qiushi/110006543/1.img
        highurlfile = itemdir + '/' + os.path.basename(high_url)
        # Use the same proxy as the article request so both go out the same way.
        downFile(high_url, highurlfile, proxy=proxy)

相关文章

  • 2019-07-30 webstorm 最新注册码

    2019-07-30 webstorm 最新注册码 YZVR7WDLV8-eyJsaWNlbnNlSWQiOiJZ...

  • [补]Lan的ScalersTalk第四轮新概念朗读持续力训练D

    练习材料: [Day 1768 2019-07-30] Lesson 27-2The 'Vasa' They ha...

  • 2019-08-01

    2019-07-30 毛雅亭 字数 563 · 阅读 14 2019-06-02 18:39 ...

  • 文先森的日常

    日精进打卡第364天 姓名:李文杰 (四爷); 公司:中国太平人寿; 日期:2019-07-30 【知~学习】 《...

  • 夏季避暑杭州灵隐寺 清晨入古寺初日照高林

    夏季避暑杭州灵隐寺 清晨入古寺初日照高林 人生最好旅行 2019-07-30 11:32 夏季避暑杭州灵隐寺 清晨...

  • 以后

    时间:2019-07-30 20:51 周三 地点:湖北武汉 天气:好热,晚上下雨了 状态:有些开心 关键字:回首...

  • 深度践行14/90

    2019-07-30 今日天气:凉 【宣言】做更好的自己 #深度践行 教练姓名:谭福翠 孩子年龄+性别:张崤睿12...

  • 2019-07-30

    2019-07-30 姓名:郭祥华 组别:315期六项精进努力一组 【日精进打卡第570】 【知~学习】 背诵《...

  • 看图说话

    你的文章《2019-07-30》已转为仅自己可见,如有疑问请查看《为什么文章会被锁定?》或邮件联系 hel...

  • 2.0践行50/90

    2019-07-30 打卡天数:Day50 #不吼不叫做温柔父母# 菠萝7月目标: 亲子共读时间管理 妈妈7月目标...

网友评论

      本文标题:2019-07-30

      本文链接:https://www.haomeiwen.com/subject/ydborctx.html