美文网首页
Python简单爬取京东商品列表

Python简单爬取京东商品列表

作者: 孔冬兵 | 来源:发表于2018-10-26 17:32 被阅读0次

    直接上代码

    #!/usr/bin/python3
    # -*- coding: UTF-8 -*-
    
    import csv
    import re
    import time
    import urllib.error
    import urllib.parse
    import urllib.request
    
    # Running row counter shared across all pages; incremented once per CSV row
    # written by parse_content().
    global_row = 0
    
    '''
    需求:
    爬取京东商品数据,以‘java’关键字为例。要求使用最基础的urllib和re库。
    需要保存书名,价格,评论数,出版社等信息。
    
    实现:
    找出页面规律如下
    一、每页显示60个商品,但分为两部分。
    1. 每页前30个商品,通过search.jd.com/Search?keyword=java接口获取
       每页后30个商品,通过search.jd.com/s_new.php?keyword=java接口获取,这个接口是个XHR请求,通过Chrome的开发者选项可以看出。模拟人向下滚动页面
    2. 每个接口有两个关键的参数'page'和's'
       参数'page'好理解,就是页面数。对第一个接口变化规律为1,3,5...,对第二个接口变化规律为2,4,6...;
       参数's'我猜应该是start,也就是起始商品的索引。实际测试时,这个参数变化不是很规律,这里强制设置每页30个,
       这样对第一个接口变化规律1,61,121...,对第二个接口变化规律为31,91,151...
    3. 对于获取不到商店名称的情况,再构造chat1.jd.com/api/checkChat请求,获取商店名称。(但还是存在获取不到的情况,页面也不能显示)
    '''
    def crawl_page(n, csv_writer, keyword='java'):
        """Crawl one logical result page (60 items) and append rows to the CSV.

        JD renders each logical page in two halves of 30 items: the top half
        comes from the regular Search endpoint, the bottom half from the
        s_new.php XHR endpoint normally triggered by scrolling the page.

        Args:
            n: 1-based logical page number.
            csv_writer: csv.writer that parsed rows are appended to.
            keyword: search keyword; defaults to 'java' so existing callers
                keep their behavior.
        """
        kw = urllib.parse.quote(keyword)

        # Top half: the Search endpoint uses odd page numbers (1, 3, 5, ...)
        # and item offsets 1, 61, 121, ...
        top_url = ('http://search.jd.com/Search?keyword=%s&enc=utf-8&qrst=1'
                   '&rt=1&stop=1&vt=2&page=%d&s=%d&click=0'
                   % (kw, 2 * n - 1, 1 + 60 * (n - 1)))
        parse_content(get_page_content(top_url, True), n, csv_writer)

        # Bottom half: the XHR endpoint uses even page numbers (2, 4, 6, ...)
        # and offsets 31, 91, 151, ...; log_id mimics the timestamp parameter
        # the browser sends.
        log_id = '%.5f' % time.time()
        xhr_url = ('http://search.jd.com/s_new.php?keyword=%s&enc=utf-8&qrst=1'
                   '&rt=1&stop=1&vt=2&page=%d&s=%d&scrolling=y&log_id=%s'
                   % (kw, 2 * n, 31 + 60 * (n - 1), log_id))
        parse_content(get_page_content(xhr_url, False), n, csv_writer)
    
    def get_page_content(url, is_top):
        """Fetch *url* and return the response body decoded as UTF-8.

        Args:
            url: search-result URL to fetch.
            is_top: True for the regular Search endpoint; False for the XHR
                endpoint, which additionally needs Referer and
                X-Requested-With headers to get a non-empty response.

        Returns:
            The decoded response text.
        """
        req = urllib.request.Request(url)
        # A desktop-browser User-Agent so the request is not served a bot page.
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3192.0 Safari/537.36')
        if not is_top:
            req.add_header('Referer', 'http://search.jd.com/Search?keyword=java&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&click=0')
            req.add_header('X-Requested-With', 'XMLHttpRequest')
        # Context manager closes the connection even if read/decode fails
        # (the original leaked the response object).
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf8')
    
    def get_seller(shop_url):
        """Fetch the checkChat API and return the seller (shop) name, or ''.

        Used as a fallback when the search page's markup carries no shop name
        for an item.

        Args:
            shop_url: chat1.jd.com/api/checkChat URL including the pid.

        Returns:
            The decoded shop name, or '' when the response has no "seller"
            field.
        """
        req = urllib.request.Request(shop_url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3192.0 Safari/537.36')
        req.add_header('Referer', 'http://search.jd.com/Search?keyword=java&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&click=0')
        # Close the connection deterministically; the original leaked it and
        # also print()-ed the entire response body (debug leftover, removed).
        with urllib.request.urlopen(req) as response:
            content = response.read().decode('utf8')
        seller = re.search(r'"seller":"(.*?)"', content)
        if seller:
            # The API escapes non-ASCII as \uXXXX sequences (e.g.
            # "\u58a8\u9a6c..."); round-trip through latin-1 so
            # unicode_escape can decode them into real characters.
            return seller.group(1).encode('latin-1').decode('unicode_escape')
        return ''
    
    def parse_content(content, n, csv_writer):
        """Parse one page fragment and write one CSV row per product <li>.

        Extracts title, price, comment count and shop name for each item.
        When the markup carries no shop name, falls back to the checkChat API
        via get_seller(). Increments the module-level global_row counter once
        per row written.

        Args:
            content: HTML text of one page half (up to 30 items).
            n: page number, used only in diagnostic messages.
            csv_writer: csv.writer receiving [row#, name, price, commits, shop].
        """
        global global_row
        li_list = re.findall(r'<li data-sku="\d*" class="gl-item">.*?</li>', content, re.DOTALL)
        for li in li_list:
            # Title: strip nested highlight tags (e.g. <font>) from the <em> text.
            name_match = re.search(r'<div class="p-name p-name-type-2">.*?<em>(.*?)</em>.*?</div>', li, re.DOTALL)
            name = ''
            if name_match:
                name = re.sub(r'<.*?>', '', name_match.group(1))
            else:
                print('page %d, name is empty' % n)

            # Price
            price_match = re.search(r'<div class="p-price">.*?<i>(.*?)</i>.*?</div>', li, re.DOTALL)
            price = price_match.group(1) if price_match else ''
            if not price_match:
                print('page %d, price is empty' % n)

            # Comment count (original copy-paste bug reported "price is empty" here)
            commit_match = re.search(r'<div class="p-commit">.*?<a .*?>(.*?)</a>.*?</div>', li, re.DOTALL)
            commit = commit_match.group(1) if commit_match else ''
            if not commit_match:
                print('page %d, commit is empty' % n)

            # Publisher (shop); fall back to the checkChat API keyed by SKU id.
            shop_match = re.search(r'<div class="p-shop" [^>]*>\s*<span [^>]*><a [^>]*>(.*?)</a>.*?</div>', li, re.DOTALL)
            shop = ''
            if shop_match:
                shop = shop_match.group(1)
            else:
                # Guard the match: the original called .group(1) unconditionally
                # and raised AttributeError on an unexpected <li> shape.
                uid_match = re.match(r'<li data-sku="(\d*)" class="gl-item">', li)
                if uid_match:
                    seller_url = 'https://chat1.jd.com/api/checkChat?pid=' + uid_match.group(1) + '&returnCharset=utf-8'
                    shop = get_seller(seller_url)
                if not shop:
                    print('page %d, shop is empty' % n)

            # Defensive: tolerate the counter not being initialised yet.
            try:
                global_row += 1
            except NameError:
                global_row = 1
            csv_writer.writerow([str(global_row), name, price, commit, shop])
            
    
    def main():
        """Crawl the first 5 result pages and write them to output.csv.

        Notes:
            - newline='' is required by the csv module; without it every row
              is followed by a blank line on Windows.
            - gbk keeps the file readable in Excel on Chinese-locale Windows,
              but cannot encode every character; errors='replace' prevents a
              UnicodeEncodeError crash on titles outside the GBK repertoire.
        """
        with open('output.csv', 'w', encoding='gbk', errors='replace', newline='') as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow(['序号', '书名', '价格', '评论数', '出版社'])
            for i in range(1, 6):
                crawl_page(i, csv_writer)


    if __name__ == '__main__':
        main()
    
    

    相关文章

      网友评论

          本文标题:Python简单爬取京东商品列表

          本文链接:https://www.haomeiwen.com/subject/ehsltqtx.html