美文网首页
scrapy处理HttpDigestAuth认证

scrapy处理HttpDigestAuth认证

作者: tenlee | 来源:发表于2018-06-06 17:50 被阅读37次

    由于requests类库自带了Http Digest Auth,故搬砖到scrapy。
    加入如下 MiddleWare(需在 settings.py 的 DOWNLOADER_MIDDLEWARES 中启用),并在 settings.py 文件中配置 HTTP_DIGEST_USERNAME 和 HTTP_DIGEST_PASSWORD 即可。

    import hashlib
    import os
    import re
    import time
    import threading
    
    from scrapy import signals
    from requests.compat import urlparse
    from requests.utils import parse_dict_header
    
    
    class HTTPDigestAuthMiddleWare(object):
        """Scrapy downloader middleware that answers HTTP Digest Auth challenges.

        The digest computation is adapted from ``requests.auth.HTTPDigestAuth``.
        When a response comes back with status 401 and a ``Digest`` challenge in
        its ``WWW-Authenticate`` header, the challenge is stored and the request
        is re-issued once; every request processed afterwards gets an
        ``Authorization`` header computed from the stored challenge.

        Credentials are read from the ``HTTP_DIGEST_USERNAME`` and
        ``HTTP_DIGEST_PASSWORD`` settings.
        """

        # ``request.meta`` key set on a request that was already re-issued after
        # a 401, so that rejected credentials cannot cause an endless retry loop.
        RETRY_META_KEY = 'http_digest_auth_retried'

        # Strips the leading "Digest " scheme token from the challenge header.
        _DIGEST_PREFIX = re.compile(r'digest ', flags=re.IGNORECASE)

        # Challenge ``algorithm`` value (upper-cased) -> hashlib constructor.
        _HASH_FACTORIES = {
            'MD5': hashlib.md5,
            'MD5-SESS': hashlib.md5,
            'SHA': hashlib.sha1,
            'SHA-256': hashlib.sha256,
            'SHA-512': hashlib.sha512,
        }

        def __init__(self, username, password):
            """Store the credentials and create the per-thread challenge state.

            :param username: user name sent in the ``Authorization`` header.
            :param password: password used to compute the digest.
            """
            self.username = username
            self.password = password

            self._thread_local = threading.local()

        @classmethod
        def from_crawler(cls, crawler):
            """Build the middleware from the crawler settings.

            :param crawler: the running crawler; its settings supply
                ``HTTP_DIGEST_USERNAME`` and ``HTTP_DIGEST_PASSWORD``.
            :return: a middleware instance connected to ``spider_opened``.
            """
            # The password must be read as a scalar: ``getlist`` would return a
            # list, whose repr would then be hashed into the digest.
            s = cls(crawler.settings.get('HTTP_DIGEST_USERNAME'),
                    crawler.settings.get('HTTP_DIGEST_PASSWORD'))
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s

        def spider_opened(self, spider):
            """Log that the spider has been opened."""
            spider.logger.info('Spider opened: %s', spider.name)

        def process_request(self, request, spider):
            """Attach an ``Authorization`` header once a challenge is known.

            :return: always ``None`` so the request continues down the chain.
            """
            if getattr(self._thread_local, 'chal', None):
                auth = self.build_digest_header(request.method, request.url)
                # ``None`` means the challenge uses an unsupported algorithm or
                # qop; do not overwrite the header with an empty value.
                if auth is not None:
                    request.headers['Authorization'] = auth
            return None

        def process_response(self, request, response, spider):
            """Re-issue a request once when it is rejected with a digest challenge.

            :return: ``response`` unchanged, or a copy of ``request`` to be
                scheduled again when a fresh digest challenge was received.
            """
            if response.status != 401:
                return response

            # This request was already retried with credentials: give up and let
            # the 401 response through instead of looping forever.
            if request.meta.get(self.RETRY_META_KEY):
                return response

            if not self.handle_401(response):
                return response

            # ``dont_filter`` keeps the duplicate filter from dropping the retry,
            # which has the same fingerprint as the original request.
            retry = request.replace(dont_filter=True)
            retry.meta[self.RETRY_META_KEY] = True
            return retry

        def init_per_thread_state(self):
            """Initialise the thread-local challenge state on first use."""
            if not hasattr(self._thread_local, 'init'):
                self._thread_local.init = True
                self._thread_local.last_nonce = ''
                self._thread_local.nonce_count = 0
                self._thread_local.chal = {}

        def build_digest_header(self, method, url):
            """Compute the ``Authorization`` header value for the stored challenge.

            Requires that a challenge containing ``realm`` and ``nonce`` has been
            stored by :meth:`handle_401`. Entity digests (``auth-int``) are not
            implemented.

            :param method: HTTP method of the request being signed.
            :param url: absolute URL of the request being signed.
            :return: the header value, or ``None`` when the challenge asks for an
                unsupported algorithm or qop.
            :rtype: str
            """
            chal = self._thread_local.chal
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm')
            opaque = chal.get('opaque')

            _algorithm = 'MD5' if algorithm is None else algorithm.upper()
            hash_factory = self._HASH_FACTORIES.get(_algorithm)
            if hash_factory is None:
                return None

            def hash_utf8(x):
                if isinstance(x, str):
                    x = x.encode('utf-8')
                return hash_factory(x).hexdigest()

            def KD(secret, data):
                return hash_utf8('%s:%s' % (secret, data))

            p_parsed = urlparse(url)
            #: path is request-uri defined in RFC 2616 which should not be empty
            path = p_parsed.path or '/'
            if p_parsed.query:
                path += '?' + p_parsed.query

            HA1 = hash_utf8('%s:%s:%s' % (self.username, realm, self.password))
            HA2 = hash_utf8('%s:%s' % (method, path))

            # The nonce count restarts whenever the server hands out a new nonce.
            if nonce == self._thread_local.last_nonce:
                self._thread_local.nonce_count += 1
            else:
                self._thread_local.nonce_count = 1
            ncvalue = '%08x' % self._thread_local.nonce_count

            # Client nonce: hash of the counter, server nonce, time and randomness.
            seed = str(self._thread_local.nonce_count).encode('utf-8')
            seed += nonce.encode('utf-8')
            seed += time.ctime().encode('utf-8')
            seed += os.urandom(8)
            cnonce = hashlib.sha1(seed).hexdigest()[:16]

            if _algorithm == 'MD5-SESS':
                HA1 = hash_utf8('%s:%s:%s' % (HA1, nonce, cnonce))

            if not qop:
                respdig = KD(HA1, '%s:%s' % (nonce, HA2))
            elif 'auth' in [option.strip() for option in qop.split(',')]:
                # Tokens are stripped so that "auth-int, auth" is recognised too.
                noncebit = '%s:%s:%s:%s:%s' % (nonce, ncvalue, cnonce, 'auth', HA2)
                respdig = KD(HA1, noncebit)
            else:
                # Only "auth-int" (or something unknown) was offered.
                return None

            self._thread_local.last_nonce = nonce

            base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
                   'response="%s"' % (self.username, realm, nonce, path, respdig)
            if opaque:
                base += ', opaque="%s"' % opaque
            if algorithm:
                base += ', algorithm="%s"' % algorithm
            if qop:
                base += ', qop="auth", nc=%s, cnonce="%s"' % (ncvalue, cnonce)

            return 'Digest %s' % base

        def handle_401(self, resp, **kwargs):
            """Store the digest challenge carried by a 401 response.

            :param resp: the 401 response whose ``WWW-Authenticate`` header is read.
            :return: ``True`` when a digest challenge was found and stored,
                ``False`` when the response carries no digest challenge.
            """
            self.init_per_thread_state()

            s_auth = (resp.headers.get('www-authenticate', b'') or b'').decode()
            if 'digest' not in s_auth.lower():
                return False

            self._thread_local.chal = parse_dict_header(
                self._DIGEST_PREFIX.sub('', s_auth, count=1))
            return True
    

    相关文章

      网友评论

          本文标题:scrapy处理HttpDigestAuth认证

          本文链接:https://www.haomeiwen.com/subject/kgwkrftx.html