美文网首页
爬取煎蛋网图片的一种思路

爬取煎蛋网图片的一种思路

作者: cz_布衣 | 来源:发表于2018-09-07 10:04 被阅读0次

    任何一个学习的过程,都需要练手项目。学网络爬虫就总会想去爬点什么东西。网上介绍得更多的就是爬取网站图片,而图片网站一般都会有自己的一套反爬技术。昨天遇到有帖子在说爬煎蛋网图片,也就去试了试。

    其中的反爬技术分析在 Python爬虫(15):煎蛋网加密处理方式 博客中已有详细解说,思路方法也有说了,大家可以仔细去看看。在这里,我的思路也一样,但实现方法不是去将其js方法改造成为python方法(虽然我也觉得这是最佳方法,无奈我对加密算法不熟悉,代码理解不了。接下来还是得去学学加密的算法才行。)这里使用一个偷巧的办法,把js解密方法直接拿出来构造一个html文件,再把抓到的图片hash值放进去,让它来给我解密还原成地址。有了地址你想怎么下载就很容易了,我使用的是迅雷。(爬取图片的hash:把各个含图片的网页都下载,直接抓取各个<span class="img-hash">***</span>值)。

    构造html文件时,我是截取jandan_load_img()中有关的两行代码,jdXFKzuIDxRVqKYQfswJ5elNfow1x0JrJH()就全照原样拷出来运行,然后打开开发者工具,边运行边看出现什么错误,需要什么方法就去原网站的js中寻找并补齐。除了hex_md5()外,其它方法都可以在原网站的js中找到。百度了一下,hex_md5()函数是在md5.js中,我下边也给出整个md5.js文件。(hex_md5()本来也是想拷贝出来用就好,可是看到md5.js里边好多参数,若是拷出来不知会涉及多少其它东西,所以就干脆直接引用md5.js)。

    先上图:


    html.png

    抓取图片hash值的py代码如下:

    图片hash都存放到img_hash.txt中

    # -*- coding:utf-8 -*-
    from lxml import etree
    import requests, time
    
    # Pages 1-40 of the "ooxx" board; each page embeds its image hashes in
    # <span class="img-hash"> elements.
    urls = ['http://jandan.net/ooxx/page-{}#comments'.format(i) for i in range(1, 41)]
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko)'
                             ' Chrome/22.0.1207.1 Safari/537.1'}
    img_hash = []
    print('Downloading:', end='', flush=True)
    for i, url in enumerate(urls, start=1):
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            html = requests.get(url, headers=headers, timeout=10).text
        except requests.RequestException as err:
            # Best effort: skip the failing page and keep the hashes collected so far.
            print('\nSkipping page {}: {}'.format(i, err))
        else:
            root = etree.HTML(html)
            # Empty spans have text None; drop them so the written list only
            # contains strings that can be pasted into the JavaScript array.
            img_hash.extend(
                span.text
                for span in root.xpath('//span[@class="img-hash"]')
                if span.text
            )
            # flush so the progress counter shows up while the loop is running.
            print(i, '\t', end='', flush=True)
        # Pause between requests to stay gentle on the site.
        time.sleep(3)
    print('Download completed!')
    # The list repr (e.g. ['a', 'b']) is written on purpose: it is pasted as-is
    # into the hashlist array of the companion HTML page. Append mode is kept,
    # so repeated runs add another list to the same file.
    with open('img_hash.txt', 'a') as f:
        f.write(str(img_hash))
    

    html文件如下:

    • get_url()函数是我加上去的,将hash值作为参数调用jandan_load_img()
    • 打开img_hash.txt,将其中的hash值拷贝给get_url()函数的hashlist变量
    <!DOCTYPE html>
    <html>
    <head>
        <title></title>
        <script type="text/ecmascript" src="md5.js"></script>
        <script type="text/javascript">
            /**
             * Turn an image hash into a protocol-relative image address and
             * rewrite the sinaimg size segment to "large".
             */
            function jandan_load_img(e) {
                var decoded = jdjDMYMvK51QlNY6NdLY1OkZw6dpQvspIM(e, "aPz8sQnzRxiHfhgesalhIBhfKZczglYq");
                // Group 2 is the size directory (between host and file name); it is swapped for "large".
                var sizePattern = /(\/\/\w+\.sinaimg\.cn\/)(\w+)(\/.+\.(gif|jpg|jpeg))/;
                return decoded.replace(sizePattern, "$1large$3");
            }
            /**
             * Decode an image hash (obfuscated routine copied from the site).
             *
             * o - the hash string to decode
             * y - key string; an empty string is used when omitted
             * g - optional numeric offset; 0 is used when omitted
             *
             * The mode `l` is hard-coded to "DECODE", so every DECODE branch
             * below runs. The last statement of the final branch overwrites
             * the result with base64_decode(d), where d is the untouched
             * input, so the returned value is the Base64 decoding of `o`;
             * the keyed computation before it does not affect the result.
             * It still has to run without error: it calls md5() and
             * microtime(), and assigns the undeclared names tmpstr and tmp,
             * which become implicit globals in non-strict script code.
             */
            var jdjDMYMvK51QlNY6NdLY1OkZw6dpQvspIM = function(o, y, g) {
                // Keep the original input; it is what finally gets decoded.
                var d = o;
                var l = "DECODE";
                var y = y ? y : "";
                var g = g ? g : 0;
                // Number of trailing characters taken from md5(microtime()) below.
                var h = 4;
                // Derive two sub-keys from the halves of the hashed key.
                y = md5(y);
                var x = md5(y.substr(0, 16));
                var v = md5(y.substr(16, 16));
                if (h) {
                    if (l == "DECODE") {
                        // u: last h characters of the hash of the current time.
                        var b = md5(microtime());
                        var e = b.length - h;
                        var u = b.substr(e, h)
                    }
                } else {
                    var u = ""
                }
                // t is the key string used to fill the table r below.
                var t = x + md5(x + u);
                var n;
                if (l == "DECODE") {
                    // NOTE(review): time() is not defined in the visible script; it is
                    // only reached when a truthy g is passed (the caller here passes none).
                    g = g ? g + time() : 0;
                    // tmpstr has no var declaration, so it is an implicit global.
                    tmpstr = g.toString();
                    if (tmpstr.length >= 10) {
                        o = tmpstr.substr(0, 10) + md5(o + v).substr(0, 16) + o
                    } else {
                        // Left-pad the number to 10 characters with zeros.
                        var f = 10 - tmpstr.length;
                        for (var q = 0; q < f; q++) {
                            tmpstr = "0" + tmpstr
                        }
                        o = tmpstr + md5(o + v).substr(0, 16) + o
                    }
                    n = o
                }
                // k: identity table 0..255; r: key characters repeated to 256 entries.
                var k = new Array(256);
                for (var q = 0; q < 256; q++) {
                    k[q] = q
                }
                var r = new Array();
                for (var q = 0; q < 256; q++) {
                    r[q] = t.charCodeAt(q % t.length)
                }
                // Key-dependent shuffle of k (resembles RC4 key scheduling).
                // tmp has no var declaration, so it is an implicit global.
                for (var p = q = 0; q < 256; q++) {
                    p = (p + k[q] + r[q]) % 256;
                    tmp = k[q];
                    k[q] = k[p];
                    k[p] = tmp
                }
                // XOR each character of n with a value drawn from the shuffled table.
                var m = "";
                n = n.split("");
                for (var w = p = q = 0; q < n.length; q++) {
                    w = (w + 1) % 256;
                    p = (p + k[w]) % 256;
                    tmp = k[w];
                    k[w] = k[p];
                    k[p] = tmp;
                    m += chr(ord(n[q]) ^ (k[(k[w] + k[p]) % 256]))
                }
                if (l == "DECODE") {
                    m = base64_encode(m);
                    var c = new RegExp("=","g");
                    m = m.replace(c, "");
                    m = u + m;
                    // Everything computed into m above is discarded here: the
                    // result is the plain Base64 decoding of the original input.
                    m = base64_decode(d)
                }
                return m
            };
            /**
             * Short alias for hex_md5() from md5.js.
             */
            function md5(a) {
                var digest = hex_md5(a);
                return digest;
            }
            /**
             * Base64-encode a string with the browser's window.btoa().
             */
            function base64_encode(a) {
                var encoded = window.btoa(a);
                return encoded;
            }
            /**
             * Base64-decode a string with the browser's window.atob().
             */
            function base64_decode(a) {
                var decoded = window.atob(a);
                return decoded;
            }
            /**
             * Current time derived from Date.getTime().
             *
             * With a truthy b the result is a number of seconds (with a
             * fractional part); otherwise it is the string
             * "<fraction of second> <whole seconds>".
             */
            function microtime(b) {
                var nowMs = new Date().getTime();
                var wholeSeconds = parseInt(nowMs / 1000);
                if (b) {
                    return nowMs / 1000;
                }
                var fraction = (nowMs - (wholeSeconds * 1000)) / 1000;
                return fraction + " " + wholeSeconds;
            }
            /**
             * Return the one-character string for the character code a.
             */
            function chr(a) {
                var character = String.fromCharCode(a);
                return character;
            }
            /**
             * Return the character code of the first character of the string a.
             */
            function ord(a) {
                // charCodeAt() without an argument reads index 0.
                var code = a.charCodeAt(0);
                return code;
            }
            /**
             * Decode every hash in hashlist and list the resulting image
             * addresses as links inside the #content element.
             *
             * Paste the hashes collected in img_hash.txt into hashlist.
             */
            function get_url() {
                var hashlist = ['Ly93eDQuc2luYWltZy5jbi9tdzYwMC8wMDc2QlNTNWx5MWZ1am93MDQyNGJqMzBpYTB0M3dnMi5qcGc=', 'Ly93dzMuc2luYWltZy5jbi9tdzEwMjQvMDA3M29iNlBneTFmdWpvNWdodGNiZzMwNnkwYW11MHkuZ2lm', 'Ly93eDQuc2luYWltZy5jbi9tdzYwMC8wMDc2QlNTNWx5MWZ1am5teGpqdWdqMzExMTFqazRxcC5qcGc='];
                var container = document.getElementById("content");
                var fragment = document.createDocumentFragment();
                // Indexed loop with a declared counter: for...in on an array also
                // visits inherited enumerable properties, and the undeclared
                // loop variable used before leaked a global.
                for (var i = 0; i < hashlist.length; i++) {
                    var url = 'http:' + jandan_load_img(hashlist[i]);
                    // Build the link through the DOM so the decoded text (which comes
                    // from a scraped page) is never parsed as HTML.
                    var link = document.createElement('a');
                    link.href = url;
                    link.textContent = url;
                    fragment.appendChild(link);
                    fragment.appendChild(document.createElement('br'));
                }
                // Replace the previous result, as assigning innerHTML did before.
                container.innerHTML = '';
                container.appendChild(fragment);
            }
        </script>
    </head>
    <body>
        <button onclick="get_url()">click here</button>
        <div id="content"></div>
    </body>
    </html>
    

    md5.js:

    /*
     * A JavaScript implementation of the RSA Data Security, Inc. MD5 Message
     * Digest Algorithm, as defined in RFC 1321.
     * Version 2.1 Copyright (C) Paul Johnston 1999 - 2002.
     * Other contributors: Greg Holt, Andrew Kepert, Ydnar, Lostinet
     * Distributed under the BSD License
     * See http://pajhome.org.uk/crypt/md5 for more info.
     */
    /*
     * Configurable variables. You may need to tweak these to be compatible with
     * the server-side, but the defaults work in most cases.
     */
    var hexcase = 0; /* hex output format. 0 - lowercase; 1 - uppercase  */
    var b64pad = ""; /* base-64 pad character. "=" for strict RFC compliance */
    var chrsz = 8; /* bits per input character. 8 - ASCII; 16 - Unicode  */
    /*
     * These are the functions you'll usually want to call
     * They take string arguments and return either hex or base-64 encoded strings
     */
    /* MD5 of the string s, returned as a hex string. */
    function hex_md5(s)
    {
     var words = str2binl(s);
     var digest = core_md5(words, s.length * chrsz);
     return binl2hex(digest);
    }
    /* MD5 of the string s, returned as a base-64 string. */
    function b64_md5(s)
    {
     var words = str2binl(s);
     var digest = core_md5(words, s.length * chrsz);
     return binl2b64(digest);
    }
    /* MD5 of the string s, returned as a raw character string. */
    function str_md5(s)
    {
     var words = str2binl(s);
     var digest = core_md5(words, s.length * chrsz);
     return binl2str(digest);
    }
    /* HMAC-MD5 of data under key, returned as a hex string. */
    function hex_hmac_md5(key, data)
    {
     var mac = core_hmac_md5(key, data);
     return binl2hex(mac);
    }
    /* HMAC-MD5 of data under key, returned as a base-64 string. */
    function b64_hmac_md5(key, data)
    {
     var mac = core_hmac_md5(key, data);
     return binl2b64(mac);
    }
    /* HMAC-MD5 of data under key, returned as a raw character string. */
    function str_hmac_md5(key, data)
    {
     var mac = core_hmac_md5(key, data);
     return binl2str(mac);
    }
    /*
     * Perform a simple self-test to see if the VM is working
     */
    /* Self-test: true when hex_md5("abc") yields the expected digest. */
    function md5_vm_test()
    {
     var expected = "900150983cd24fb0d6963f7d28e17f72";
     var actual = hex_md5("abc");
     return actual == expected;
    }
    /*
     * Calculate the MD5 of an array of little-endian words, and a bit length
     */
    /*
     * Core MD5 computation.
     *
     * x   - array of 32-bit little-endian words holding the message; it is
     *       modified in place (padding marker and length are written into it)
     * len - message length in bits
     *
     * Returns a new 4-element array (a, b, c, d) of 32-bit words.
     */
    function core_md5(x, len)
    {
     /* append padding */
     /* set the 0x80 marker byte at bit offset len */
     x[len >> 5] |= 0x80 << ((len) % 32);
     /* store the bit length in word 14 of the last 16-word block */
     x[(((len + 64) >>> 9) << 4) + 14] = len;
     /* initial chaining values, written as signed 32-bit integers */
     var a = 1732584193;
     var b = -271733879;
     var c = -1732584194;
     var d = 271733878;
     /* process the message one 16-word block at a time */
     for(var i = 0; i < x.length; i += 16)
     {
     /* remember the state so it can be added back after the four rounds */
     var olda = a;
     var oldb = b;
     var oldc = c;
     var oldd = d;
     /* round 1: md5_ff */
     a = md5_ff(a, b, c, d, x[i+ 0], 7 , -680876936);
     d = md5_ff(d, a, b, c, x[i+ 1], 12, -389564586);
     c = md5_ff(c, d, a, b, x[i+ 2], 17, 606105819);
     b = md5_ff(b, c, d, a, x[i+ 3], 22, -1044525330);
     a = md5_ff(a, b, c, d, x[i+ 4], 7 , -176418897);
     d = md5_ff(d, a, b, c, x[i+ 5], 12, 1200080426);
     c = md5_ff(c, d, a, b, x[i+ 6], 17, -1473231341);
     b = md5_ff(b, c, d, a, x[i+ 7], 22, -45705983);
     a = md5_ff(a, b, c, d, x[i+ 8], 7 , 1770035416);
     d = md5_ff(d, a, b, c, x[i+ 9], 12, -1958414417);
     c = md5_ff(c, d, a, b, x[i+10], 17, -42063);
     b = md5_ff(b, c, d, a, x[i+11], 22, -1990404162);
     a = md5_ff(a, b, c, d, x[i+12], 7 , 1804603682);
     d = md5_ff(d, a, b, c, x[i+13], 12, -40341101);
     c = md5_ff(c, d, a, b, x[i+14], 17, -1502002290);
     b = md5_ff(b, c, d, a, x[i+15], 22, 1236535329);
     /* round 2: md5_gg */
     a = md5_gg(a, b, c, d, x[i+ 1], 5 , -165796510);
     d = md5_gg(d, a, b, c, x[i+ 6], 9 , -1069501632);
     c = md5_gg(c, d, a, b, x[i+11], 14, 643717713);
     b = md5_gg(b, c, d, a, x[i+ 0], 20, -373897302);
     a = md5_gg(a, b, c, d, x[i+ 5], 5 , -701558691);
     d = md5_gg(d, a, b, c, x[i+10], 9 , 38016083);
     c = md5_gg(c, d, a, b, x[i+15], 14, -660478335);
     b = md5_gg(b, c, d, a, x[i+ 4], 20, -405537848);
     a = md5_gg(a, b, c, d, x[i+ 9], 5 , 568446438);
     d = md5_gg(d, a, b, c, x[i+14], 9 , -1019803690);
     c = md5_gg(c, d, a, b, x[i+ 3], 14, -187363961);
     b = md5_gg(b, c, d, a, x[i+ 8], 20, 1163531501);
     a = md5_gg(a, b, c, d, x[i+13], 5 , -1444681467);
     d = md5_gg(d, a, b, c, x[i+ 2], 9 , -51403784);
     c = md5_gg(c, d, a, b, x[i+ 7], 14, 1735328473);
     b = md5_gg(b, c, d, a, x[i+12], 20, -1926607734);
     /* round 3: md5_hh */
     a = md5_hh(a, b, c, d, x[i+ 5], 4 , -378558);
     d = md5_hh(d, a, b, c, x[i+ 8], 11, -2022574463);
     c = md5_hh(c, d, a, b, x[i+11], 16, 1839030562);
     b = md5_hh(b, c, d, a, x[i+14], 23, -35309556);
     a = md5_hh(a, b, c, d, x[i+ 1], 4 , -1530992060);
     d = md5_hh(d, a, b, c, x[i+ 4], 11, 1272893353);
     c = md5_hh(c, d, a, b, x[i+ 7], 16, -155497632);
     b = md5_hh(b, c, d, a, x[i+10], 23, -1094730640);
     a = md5_hh(a, b, c, d, x[i+13], 4 , 681279174);
     d = md5_hh(d, a, b, c, x[i+ 0], 11, -358537222);
     c = md5_hh(c, d, a, b, x[i+ 3], 16, -722521979);
     b = md5_hh(b, c, d, a, x[i+ 6], 23, 76029189);
     a = md5_hh(a, b, c, d, x[i+ 9], 4 , -640364487);
     d = md5_hh(d, a, b, c, x[i+12], 11, -421815835);
     c = md5_hh(c, d, a, b, x[i+15], 16, 530742520);
     b = md5_hh(b, c, d, a, x[i+ 2], 23, -995338651);
     /* round 4: md5_ii */
     a = md5_ii(a, b, c, d, x[i+ 0], 6 , -198630844);
     d = md5_ii(d, a, b, c, x[i+ 7], 10, 1126891415);
     c = md5_ii(c, d, a, b, x[i+14], 15, -1416354905);
     b = md5_ii(b, c, d, a, x[i+ 5], 21, -57434055);
     a = md5_ii(a, b, c, d, x[i+12], 6 , 1700485571);
     d = md5_ii(d, a, b, c, x[i+ 3], 10, -1894986606);
     c = md5_ii(c, d, a, b, x[i+10], 15, -1051523);
     b = md5_ii(b, c, d, a, x[i+ 1], 21, -2054922799);
     a = md5_ii(a, b, c, d, x[i+ 8], 6 , 1873313359);
     d = md5_ii(d, a, b, c, x[i+15], 10, -30611744);
     c = md5_ii(c, d, a, b, x[i+ 6], 15, -1560198380);
     b = md5_ii(b, c, d, a, x[i+13], 21, 1309151649);
     a = md5_ii(a, b, c, d, x[i+ 4], 6 , -145523070);
     d = md5_ii(d, a, b, c, x[i+11], 10, -1120210379);
     c = md5_ii(c, d, a, b, x[i+ 2], 15, 718787259);
     b = md5_ii(b, c, d, a, x[i+ 9], 21, -343485551);
     /* add this block's result to the state saved at the top of the loop */
     a = safe_add(a, olda);
     b = safe_add(b, oldb);
     c = safe_add(c, oldc);
     d = safe_add(d, oldd);
     }
     return Array(a, b, c, d);
    }
    /*
     * These functions implement the four basic operations the algorithm uses.
     */
    /*
     * Shared step of all four round functions: add q, a, x and t (wrapping
     * at 32 bits), rotate the sum left by s bits, then add b.
     */
    function md5_cmn(q, a, b, x, s, t)
    {
     var sum = safe_add(safe_add(a, q), safe_add(x, t));
     var rotated = bit_rol(sum, s);
     return safe_add(rotated, b);
    }
    /* Round-1 step: mixes b, c, d as (b AND c) OR (NOT b AND d). */
    function md5_ff(a, b, c, d, x, s, t)
    {
     var mixed = (b & c) | ((~b) & d);
     return md5_cmn(mixed, a, b, x, s, t);
    }
    /* Round-2 step: mixes b, c, d as (b AND d) OR (c AND NOT d). */
    function md5_gg(a, b, c, d, x, s, t)
    {
     var mixed = (b & d) | (c & (~d));
     return md5_cmn(mixed, a, b, x, s, t);
    }
    /* Round-3 step: mixes b, c, d as b XOR c XOR d. */
    function md5_hh(a, b, c, d, x, s, t)
    {
     var mixed = b ^ c ^ d;
     return md5_cmn(mixed, a, b, x, s, t);
    }
    /* Round-4 step: mixes b, c, d as c XOR (b OR NOT d). */
    function md5_ii(a, b, c, d, x, s, t)
    {
     var mixed = c ^ (b | (~d));
     return md5_cmn(mixed, a, b, x, s, t);
    }
    /*
     * Calculate the HMAC-MD5, of a key and some data
     */
    /*
     * HMAC-MD5 of the string data under the string key.
     * Returns the digest as an array of 32-bit words.
     */
    function core_hmac_md5(key, data)
    {
     var keyWords = str2binl(key);
     /* keys longer than 16 words (one 512-bit block) are hashed first */
     if(keyWords.length > 16)
     {
      keyWords = core_md5(keyWords, key.length * chrsz);
     }
     var innerPad = Array(16);
     var outerPad = Array(16);
     for(var w = 0; w < 16; w++)
     {
      innerPad[w] = keyWords[w] ^ 0x36363636;
      outerPad[w] = keyWords[w] ^ 0x5C5C5C5C;
     }
     /* inner hash covers the 512-bit pad plus the message bits */
     var innerDigest = core_md5(innerPad.concat(str2binl(data)), 512 + data.length * chrsz);
     /* outer hash covers the 512-bit pad plus the 128-bit inner digest */
     return core_md5(outerPad.concat(innerDigest), 512 + 128);
    }
    /*
     * Add integers, wrapping at 2^32. This uses 16-bit operations internally
     * to work around bugs in some JS interpreters.
     */
    /*
     * Add two integers with wrap-around at 2^32, working on the low and
     * high 16-bit halves separately.
     */
    function safe_add(x, y)
    {
     var low = (x & 0xFFFF) + (y & 0xFFFF);
     var carry = low >> 16;
     var high = (x >> 16) + (y >> 16) + carry;
     return (high << 16) | (low & 0xFFFF);
    }
    /*
     * Bitwise rotate a 32-bit number to the left.
     */
    /* Rotate the 32-bit value num left by cnt bits. */
    function bit_rol(num, cnt)
    {
     var shiftedOut = num >>> (32 - cnt);
     var shiftedUp = num << cnt;
     return shiftedUp | shiftedOut;
    }
    /*
     * Convert a string to an array of little-endian words
     * If chrsz is ASCII, characters >255 have their hi-byte silently ignored.
     */
    /*
     * Pack a string into an array of little-endian 32-bit words, using
     * chrsz bits per character (higher bits of each char code are masked off).
     */
    function str2binl(str)
    {
     var words = Array();
     var charMask = (1 << chrsz) - 1;
     var totalBits = str.length * chrsz;
     var bit = 0;
     while(bit < totalBits)
     {
      words[bit >> 5] |= (str.charCodeAt(bit / chrsz) & charMask) << (bit % 32);
      bit += chrsz;
     }
     return words;
    }
    /*
     * Convert an array of little-endian words to a string
     */
    /*
     * Unpack an array of little-endian 32-bit words into a string, taking
     * chrsz bits per character.
     */
    function binl2str(bin)
    {
     var out = "";
     var charMask = (1 << chrsz) - 1;
     var totalBits = bin.length * 32;
     var bit = 0;
     while(bit < totalBits)
     {
      out += String.fromCharCode((bin[bit >> 5] >>> (bit % 32)) & charMask);
      bit += chrsz;
     }
     return out;
    }
    /*
     * Convert an array of little-endian words to a hex string.
     */
    /*
     * Render an array of little-endian 32-bit words as a hex string,
     * two digits per byte; hexcase selects upper- or lowercase digits.
     */
    function binl2hex(binarray)
    {
     var digits = hexcase ? "0123456789ABCDEF" : "0123456789abcdef";
     var out = "";
     var byteCount = binarray.length * 4;
     for(var n = 0; n < byteCount; n++)
     {
      var word = binarray[n >> 2];
      var shift = (n % 4) * 8;
      /* high nibble first, then low nibble */
      out += digits.charAt((word >> (shift + 4)) & 0xF) + digits.charAt((word >> shift) & 0xF);
     }
     return out;
    }
    /*
     * Convert an array of little-endian words to a base-64 string
     */
    /*
     * Render an array of little-endian 32-bit words as a base-64 string.
     * Output positions past the end of the data get b64pad instead of a digit.
     */
    function binl2b64(binarray)
    {
     var alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
     var totalBits = binarray.length * 32;
     var out = "";
     for(var byteIdx = 0; byteIdx < binarray.length * 4; byteIdx += 3)
     {
      /* gather three consecutive bytes into one 24-bit group */
      var first = (binarray[byteIdx >> 2] >> (8 * (byteIdx % 4))) & 0xFF;
      var second = (binarray[(byteIdx + 1) >> 2] >> (8 * ((byteIdx + 1) % 4))) & 0xFF;
      var third = (binarray[(byteIdx + 2) >> 2] >> (8 * ((byteIdx + 2) % 4))) & 0xFF;
      var triplet = (first << 16) | (second << 8) | third;
      /* emit four 6-bit digits per group */
      for(var k = 0; k < 4; k++)
      {
       if(byteIdx * 8 + k * 6 > totalBits) out += b64pad;
       else out += alphabet.charAt((triplet >> (6 * (3 - k))) & 0x3F);
      }
     }
     return out;
    }
    

    如果本文对您有帮助,请给我留个言。谢谢!

    相关文章

      网友评论

          本文标题:爬取煎蛋网图片的一种思路

          本文链接:https://www.haomeiwen.com/subject/fspqgftx.html