
Scraping JD.com Product Information

Author: 交易狗二哈 | Published 2017-03-24 20:42 · 3432 reads

    Scrape JD.com product information with BeautifulSoup + Requests and save it to an Excel file.

    1. Inspecting the page

    Open JD.com and search for any product; let's go with solid-state drives (固态硬盘).


    First look at the URL pattern: the search term we typed appears right after the keyword parameter.


    Try deleting some of the trailing characters; the page still loads fine, so the URL can be simplified to the form below.


    Copying the link shows that the simplified URL looks like this: https://search.jd.com/Search?keyword=%E5%9B%BA%E6%80%81%E7%A1%AC%E7%9B%98&enc=utf-8
    The 固态硬盘 after keyword has turned into

    %E5%9B%BA%E6%80%81%E7%A1%AC%E7%9B%98

    That is because Chinese characters in a URL are percent-encoded as UTF-8: each character takes 3 bytes, and each byte is written as a % followed by its hex value. Encoding and decoding work like this:

    >>> import urllib.parse
    >>> urllib.parse.unquote('%E5%9B%BA%E6%80%81%E7%A1%AC%E7%9B%98')
    '固态硬盘'
    >>> urllib.parse.quote('固态硬盘')
    '%E5%9B%BA%E6%80%81%E7%A1%AC%E7%9B%98'
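
    To see where the three %XX groups per character come from, encode a single character to UTF-8 bytes yourself (a quick interpreter check, using the first character of the keyword):

    >>> '固'.encode('utf-8')          # 3 bytes per Chinese character
    b'\xe5\x9b\xba'
    >>> urllib.parse.quote('固')      # each byte becomes %XX
    '%E5%9B%BA'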
    

    Now we can build the request URL for any search term:

    def get_good_url(word):
        url_str = urllib.parse.quote(word)
        url = "https://search.jd.com/Search?keyword={}&enc=utf-8".format(url_str)
        return url
    

    2. Scraping the data

    Next, let's look at the product information we want to scrape.
    Select a product and right-click → Inspect



    Inspect a second product to spot the pattern: every product's information sits inside the same class. So we could use BeautifulSoup's find_all(class_="gl-i-wrap") to get a list of all the products and then pull each product's fields out of its own element.



    Expanding that tag, as in the screenshot, shows that the fields we want each live in their own consistently named tags.
    So alternatively we can first collect all the names, prices, commits and imgs into four lists and then match them up one by one. (A sketch of the per-item gl-i-wrap variant appears right after this paragraph; the full script below uses the four-lists approach.)
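
    As a hedged sketch of that per-item route (not the code actually used in the full script, and assuming the class names seen in the screenshots), each gl-i-wrap container could be walked on its own:

    # Sketch only: extract fields per product via its gl-i-wrap container.
    # Class names (gl-i-wrap, p-name, p-price, p-commit, p-img) are the ones
    # observed above; the full script below uses the four-lists approach instead.
    def get_info_per_item(soup):
        goods = []
        for item in soup.find_all(class_="gl-i-wrap"):
            name = item.find(class_="p-name")
            price = item.find(class_="p-price")
            commit = item.find(class_="p-commit")
            img = item.find(class_="p-img")
            if not (name and price and img):
                continue                      # skip malformed or ad items
            goods.append({
                'title':  name.text.strip(),
                'price':  price.text.strip(),
                'commit': commit.text.strip() if commit else '',
                'link':   img.a.get('href') if img.a else '',
                'img':    img.img.get('src') if img.img else '',
            })
        return goods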

    3. Putting the code together

    import requests
    from bs4 import BeautifulSoup
    import urllib.parse
    
    
    headers = {                            # request header to pretend to be a browser
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
        }
    
    def get_good_url(word):
        url_str = urllib.parse.quote(word)
        url = "https://search.jd.com/Search?keyword={}&enc=utf-8".format(url_str)
        return url
    
    
    def get_html(url):
        html = requests.get(url, headers=headers)
        html.encoding = html.apparent_encoding    # apparent_encoding is the encoding guessed from the response content; here it ends up as 'utf-8'
        soup = BeautifulSoup(html.text, 'lxml')
        return soup
    
    # all_goods = soup.find_all(class_='gl-i-wrap')  # the alternative per-item lookup sketched in section 2
    
    
    def get_info(soup):
        titles = soup.find_all(class_="p-name p-name-type-2")
        prices = soup.find_all(class_="p-price")
        commits = soup.find_all(class_="p-commit")
        imgs = soup.find_all(class_="p-img")
    
        for title, price, commit, img in zip(titles, prices, commits, imgs):
            data = {
                'title' :   title.text.strip(),
                'price' :   price.text.strip(),
                'commit':   commit.text.strip(),
                'link'  :   img.find_all('a')[0].get("href"),
                'img'   :   img.find_all('img')[0].get("src")
                }
            print(data)
    
    if __name__ == '__main__':
        good = input("请输入你要查询的商品\n")
        link = get_good_url(good)
        html = get_html(link)
        get_info(html)
    

    Let's run it:

    >>> 
    =================== RESTART: C:/Users/Why Me/Desktop/jd.py ===================
    请输入你要查询的商品
    固态硬盘
    {'commit': '已有6.4万+人评价', 'link': '//item.jd.com/2010277.html', 'price': '¥469.00', 'title': '三星(SAMSUNG) 750 EVO 120G SATA3 固态硬盘', 'img': '//img12.360buyimg.com/n7/jfs/t2212/266/1035221213/221087/773b0946/563977acNf0e20fa1.jpg'}
    {'commit': '已有6.9万+人评价', 'link': '//item.jd.com/1279827.html', 'price': '¥699.00', 'title': '三星(SAMSUNG) 850 EVO 250G SATA3 固态硬盘', 'img': '//img12.360buyimg.com/n7/jfs/t3346/324/399270074/297766/3973b0ec/5809a884N64b7c922.jpg'}
    {'commit': '已有7.4万+人评价', 'link': '//item.jd.com/2010278.html', 'price': '¥669.00', 'title': '三星(SAMSUNG) 750 EVO 250G SATA3 固态硬盘', 'img': '//img13.360buyimg.com/n7/jfs/t1927/358/970997561/221087/773b0946/563977f8Nfc78217b.jpg'}
    {'commit': '已有10万+人评价', 'link': '//item.jd.com/779351.html', 'price': '¥419.00', 'title': '金士顿(Kingston)V300 120G SATA3 固态硬盘', 'img': '//img11.360buyimg.com/n7/jfs/t3631/219/2161004093/156337/8219df07/584623caNc6709dd6.jpg'}
    {'commit': '已有434967人评价', 'link': '//item.jd.com/1652127.html', 'price': '¥', 'title': '金士顿(Kingston)DDR3 1600 4G台式机内存+V300 120G 固态硬盘套装', 'img': '//img12.360buyimg.com/n7/jfs/t1291/10/518608285/159481/aa443498/557ff074N2fb18be7.jpg'}
    ...
    

    Try querying another product:


    QAQ

    4. Saving the data

    Saving the results to Excel makes them easier to analyse.
    Here we use xlsxwriter
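
    Before the full script, a minimal xlsxwriter sketch showing the pieces used below (the filename here is arbitrary):

    # Minimal xlsxwriter usage: workbook -> worksheet -> write cells -> close.
    import xlsxwriter

    workbook = xlsxwriter.Workbook('demo.xlsx')
    worksheet = workbook.add_worksheet()
    bold = workbook.add_format({'bold': True})
    worksheet.write('A1', 'Title', bold)   # write by cell name
    worksheet.write(1, 0, 'hello')         # or by (row, col), zero-indexed
    workbook.close()                       # nothing lands on disk until close()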

    import requests
    from bs4 import BeautifulSoup
    import urllib.parse
    import xlsxwriter
    
    
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
        }
    
    def get_good_url(word):
        url_str = urllib.parse.quote(word)
        url = "https://search.jd.com/Search?keyword={}&enc=utf-8".format(url_str)
        return url
    
    
    def get_html(url):
        html = requests.get(url, headers=headers)
        html.encoding = html.apparent_encoding
        soup = BeautifulSoup(html.text, 'lxml')
        return soup
    
    
    def get_info(soup, good):
        titles = soup.find_all(class_="p-name p-name-type-2")
        prices = soup.find_all(class_="p-price")
        commits = soup.find_all(class_="p-commit")
        imgs = soup.find_all(class_="p-img")
    
        workbook = xlsxwriter.Workbook(good + '.xlsx') # create a new workbook
        worksheet = workbook.add_worksheet()

        bold = workbook.add_format({'bold': True})  # bold cell format

        worksheet.write('A1', 'Title', bold)        # write the header row in bold
        worksheet.write('B1', 'Price', bold)
        worksheet.write('C1', 'Commit', bold)
        worksheet.write('D1', 'Link', bold)
        worksheet.write('E1', 'Img', bold)

        worksheet.set_column('A:A', 100)            # set column widths
        worksheet.set_column('B:B', 10)
        worksheet.set_column('C:C', 18)
        worksheet.set_column('D:D', 27)
        worksheet.set_column('E:E', 100)
        
        row = 1
        col = 0
        
        for title, price, commit, img in zip(titles, prices, commits, imgs):
            data = {
                'title' :   title.text.strip(),
                'price' :   price.text.strip(),
                'commit':   commit.text.strip(),
                'link'  :   img.find_all('a')[0].get("href"),  # the product link's <a> tag is also inside the p-img block
                'img'   :   img.find_all('img')[0].get("src")
                }
            #print(data)
            worksheet.write(row, col, data['title'])    #写入数据
            worksheet.write(row, col+1, data['price'])
            worksheet.write(row, col+2, data['commit'])
            worksheet.write(row, col+3, data['link'])
            worksheet.write(row, col+4, data['img'])
            row += 1
            
        workbook.close()
        
            
    
    if __name__ == '__main__':
        good = input("请输入你要查询的商品\n")
        link = get_good_url(good)
        html = get_html(link)
        get_info(html, good)
        
    

    Let's run it:

    >>> 
    =================== RESTART: C:/Users/Why Me/Desktop/jd.py ===================
    请输入你要查询的商品
    固态硬盘
    
    Warning (from warnings module):
      File "D:\python3.52\lib\site-packages\xlsxwriter\worksheet.py", line 830
        force_unicode(url))
    UserWarning: Ignoring URL 'https://ccc-x.jd.com/dsp/nc?ext=Y2xpY2sueC5qZC5jb20vSmRDbGljay8_eHVpZD01MjAwNyZ4c2l0ZWlkPTEwOTk1NDM5OF8xNDc2JnRvPWh0dHA6Ly9pdGVtLmpkLmNvbS8xNTkyNDQ4Lmh0bWw&log=7PpBMf6t87I6oM0VLPwEmWrd2SgyaWJjj6EC4vYhuh7iCsttJfv9TDfcAgTKqWbCLLeI1dEGfC09SoPIvPAKj4Xtbv-6jnX-qAWZKz46GdiJJNV2ZU3OWox54fbLzZ-TRTooveAkSRdWyaH0DE4M3DwxQts4PxqUQiiov99E20WKCLFpu4ncy0V6NR8PfTloBPGVKTUkAjLHnqzQzO0rb_ok9tZBsyXLPRoNUiqZcvB9ajEs8Zb6BCtHCzu5QDmD-yiaD25Tm_eS4DgkfGayyFFoMGx_y6FyO2E1zbDIUNcoF5G4ON1xMOaPciH2CptI6XSdUF8ViyV9SmzCEykWUrD9i2Ne0oi0qMyZNfsoDpHAx6f4UCdEHMfwu45XisbAnfj21UjheU7tzM3KuWk_0OLH-J77gHlUyuX72psI4dyUKGyEyYGgswvn_bLD3DX3&v=404' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
    
    Warning (from warnings module):
      File "D:\python3.52\lib\site-packages\xlsxwriter\worksheet.py", line 830
        force_unicode(url))
    UserWarning: Ignoring URL 'https://ccc-x.jd.com/dsp/nc?ext=Y2xpY2sueC5qZC5jb20vSmRDbGljay8_eHVpZD01MjAwNyZ4c2l0ZWlkPTEwOTUzMzAwNF8xNDc2JnRvPWh0dHA6Ly9pdGVtLmpkLmNvbS80MTczODY0Lmh0bWw&log=7PpBMf6t87I6oM0VLPwEmWrd2SgyaWJjj6EC4vYhuh5LVKlnoUwiKskX7yp59hsaYbRCZqHPA7of0ku0pKD8yyMlENlDBmmWbYQSf5iudST1aW-kq4LWnzYSiXwquGa-lI_ZpBv3PQD6U_UWdQyYDLMCQ5bmriNRaHFpJosmkQU7RG-rXJZ98TaN_snWQixVUiEHC46VwrN9PqHlvkNnXAS-rvvda-_qloIbofbme2FqWymvkxzSlLYqS73YOQuiH4ugaFGdNOaP94Wt3MTWT5rkJfrZMWr33qDLS3JBvTa1tewqA8EbImCHaNbUT9tCbkEngyIMMT5emd-Q-GrEVwFHBSWTxhne-aSWEDzCR76612OabK1mfCVrtQefrh0I96hinm5qsYkb751issutBi9Yd325l7JJA3-0eLou0lw&v=404' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
    >>> 
    
    

    Warnings appear: some of the links exceed Excel's 255-character limit for URLs, so xlsxwriter ignores them; that's fine.
    Open the generated "固态硬盘.xlsx" and take a look


    Looks good!
    But some values in the price column are missing; we'll look into why in a moment.
    Also, we've only scraped one page of products. What about more pages, or even all of them? The first page is usually enough, but sometimes an analysis needs more.

    5. Scraping multiple pages of products

    Look at the URL again: page is the field that changes with the page number, and it increases by 2 each time. So we can write out all the search URLs we need to request:


    def get_good_urls(word):
        url_str = urllib.parse.quote(word)
        urls = ("https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=4&page={}&s=1&click=0".format(url_str, i) for i in range(1,12,2))
        return urls
    

    If we are going to request many pages, using () gives a generator, which saves memory; for only a few pages, using [] to build a plain list is fine. Here range(1, 12, 2) yields six page values (1, 3, ..., 11) to try things out.
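
    A toy comparison (dummy strings rather than the real search URLs):

    >>> gen_urls  = ("page={}".format(i) for i in range(1, 12, 2))   # generator: built lazily, one at a time
    >>> list_urls = ["page={}".format(i) for i in range(1, 12, 2)]   # list: all six strings built up front
    >>> next(gen_urls)
    'page=1'
    >>> list_urls
    ['page=1', 'page=3', 'page=5', 'page=7', 'page=9', 'page=11']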

    Since xlsxwriter can only write files and cannot read them back, we have to write all the data in a single pass. Here's a rather crude first attempt, again adapted from the code above:

    import requests
    from bs4 import BeautifulSoup
    import urllib.parse
    import xlsxwriter
    
    
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
        }
    
    
    def get_good_urls(word):
        url_str = urllib.parse.quote(word)
        urls = ("https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=4&page={}&s=1&click=0".format(url_str, i) for i in range(1,12,2))
        return urls
    
    
    def get_html(url):
        html = requests.get(url, headers=headers)
        html.encoding = html.apparent_encoding
        soup = BeautifulSoup(html.text, 'lxml')
        return soup
    
    
    def get_info(soup):
        all_titles = soup.find_all(class_="p-name p-name-type-2")
        all_prices = soup.find_all(class_="p-price")
        all_commits = soup.find_all(class_="p-commit")
        all_imgs = soup.find_all(class_="p-img")
    
        titles = []
        prices = []
        commits = []
        links = []
        imgs = []
        
        for title in all_titles:
            titles.append(title.text.strip())
    
        for price in all_prices:
            prices.append(price.text.strip())
            
        for commit in all_commits:
            commits.append(commit.text.strip())
    
        for link in all_imgs:
            links.append(link.find_all('a')[0].get("href"))
    
        for img in all_imgs:
            imgs.append(img.find_all('img')[0].get("src"))
    
        return titles, prices, commits, links, imgs
    
    if __name__ == '__main__':
        good = input("请输入你要查询的商品\n")
        links = get_good_urls(good)
    
        workbook = xlsxwriter.Workbook(good + '.xlsx') # create a new workbook
        worksheet = workbook.add_worksheet()
        
        bold = workbook.add_format({'bold': True})  # bold cell format
        
        worksheet.write('A1', 'Title', bold)        # write the header row in bold
        worksheet.write('B1', 'Price', bold)
        worksheet.write('C1', 'Commit', bold)
        worksheet.write('D1', 'Link', bold)
        worksheet.write('E1', 'Img', bold)
    
        worksheet.set_column('A:A', 100)            # set column widths
        worksheet.set_column('B:B', 10)
        worksheet.set_column('C:C', 18)
        worksheet.set_column('D:D', 27)
        worksheet.set_column('E:E', 100)
    
        all_row = 1
        col = 0
        
        for link in links:
            html = get_html(link)
            ti, pr, co, li, im = get_info(html)
    
            row = all_row
            for t in ti:
                worksheet.write(row, col, t)
                row += 1
    
            row = all_row
            for p in pr:
                worksheet.write(row, col+1, p)
                row += 1
    
            row = all_row
            for c in co:
                worksheet.write(row, col+2, c)
                row += 1
    
            row = all_row     
            for l in li:
                worksheet.write(row, col+3, l)
                row += 1
    
            row = all_row     
            for i in im:
                worksheet.write(row, col+4, i)
                row += 1
    
            all_row += len(ti)
            print('Done One page')
        workbook.close()
    

    It looks pretty clumsy, but never mind; let's just try it as-is

    >>> 
    ================== RESTART: C:/Users/Why Me/Desktop/爬京东2.py ==================
    请输入你要查询的商品
    固态硬盘
    
    Warning (from warnings module):
      File "D:\python3.52\lib\site-packages\xlsxwriter\worksheet.py", line 830
        force_unicode(url))
    UserWarning: Ignoring URL 'https://ccc-x.jd.com/dsp/nc?ext=Y2xpY2sueC5qZC5jb20vSmRDbGljay8_eHVpZD01MjAwNyZ4c2l0ZWlkPTE1MDEyNzQ1XzE0NzYmdG89aHR0cDovL2l0ZW0uamQuY29tLzM1MDA5NzQuaHRtbA&log=X8iXmZwdy8FrP784YxabEBovMCmgCc1tSMJf40elIqO5X09xjWDJrwbXJgDIu--hzdqLCdWvtuXToxiOC6fwtcQocJezn7MF1BIQ-O71yq2ZnJeNEqSqI6t6pJSSKmrbg3ZKkm-z_YHe04MrG_t1MSxvxPJqBTA8PpsJ3qhLXI3GZDAzT_vDqKnbr52l80NutEulONu-sKe5XxVPpIIZiDu8_PE1aXPJvRwC9EFb7VjlDw1FkOyc6ZgclyhIpWq-hEA3zNiKa7shBoDdCgprkm3a_RpUBhg7ak96p9XdlRS5gwK2cN-ByQ5DFYjCtzs4jo2x5HUShAcp74TdTpSgaiOMh4xwPqtE1Fs30VifVN5RvdNTxcGnbFsS_1MhfijzrJNMmuGMA3d1KN68w1cqPOqlN-o68u0Id4Wzt85e5Chc9EWXjZJVeOZdjgMRd1reOw657DT_zkQfWYkDGvlzjA&v=404' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
    ...
    ...
    ...
    Done One page
    

    Same warnings as before; ignore them and check the result.
    Still pretty good. Copying a few of the link URLs to verify that the rows line up correctly shows no mismatches. OK
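
    As an aside, those five parallel write loops could be collapsed into a single row-per-product loop with zip. A sketch of that rewrite (same workbook, worksheet, all_row and col as in the script above; the behaviour is intended to be identical):

    # Sketch: one spreadsheet row per product instead of one column at a time.
    for link in links:
        soup = get_html(link)
        ti, pr, co, li, im = get_info(soup)
        for t, p, c, l, i in zip(ti, pr, co, li, im):
            worksheet.write(all_row, col,     t)
            worksheet.write(all_row, col + 1, p)
            worksheet.write(all_row, col + 2, c)
            worksheet.write(all_row, col + 3, l)
            worksheet.write(all_row, col + 4, i)
            all_row += 1
        print('Done one page')
    workbook.close()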

    6. Refinement

    In the resulting Excel file, some values in the price column are missing. Neither BeautifulSoup's CSS selectors nor re can find the missing data. Inspecting the page in the browser for one of the missing items,
    the price is clearly there


    but it disappears from the response returned by requests


    So the price must be loaded via Ajax. There are two ways to handle this:
    one is to find the request that fetches the prices, which should return a JSON string, and then parse that JSON; the other is to simulate a browser with selenium.
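
    If that price request could be located, handling it would look roughly like the sketch below. The endpoint and parameter names here are made-up placeholders, purely to illustrate the idea; the real request was not found in this case (see the next step):

    # Hypothetical sketch only: PRICE_API and the 'ids' parameter are invented
    # placeholders, not a real JD endpoint.
    import requests

    PRICE_API = "https://example.com/prices"        # placeholder URL
    resp = requests.get(PRICE_API, params={"ids": "2010277"}, headers=headers)
    price_data = resp.json()                        # parse the returned JSON string
    print(price_data)
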
    Let's look through the XHR requests



    No relevant request shows up, so we'll go with the second approach.
    selenium on its own could do the whole job, but since the code above uses requests + BeautifulSoup, here we'll use selenium + BeautifulSoup, so only small changes to the code above are needed.

    from bs4 import BeautifulSoup
    import urllib.parse
    import xlsxwriter
    from selenium import webdriver
    
    
    def get_good_urls(word):
        url_str = urllib.parse.quote(word)
        urls = ("https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=4&page={}&s=1&click=0".format(url_str, i) for i in range(1,12,2))
        return urls
    
    
    def get_html(url):
        driver = webdriver.PhantomJS()
        driver.get(url)
        web_data = driver.page_source
        driver.quit()                     # release the PhantomJS process after each page
        soup = BeautifulSoup(web_data, 'lxml')
        return soup
    
    def get_info(soup):
        all_titles = soup.find_all(class_="p-name p-name-type-2")
        all_prices = soup.find_all(class_="p-price")
        all_commits = soup.find_all(class_="p-commit")
        all_imgs = soup.find_all(class_="p-img")
    
        titles = []
        prices = []
        commits = []
        links = []
        imgs = []
    
        for title in all_titles:
            titles.append(title.text.strip())
    
        for price in all_prices:
            prices.append(price.text.strip())
    
        for commit in all_commits:
            commits.append(commit.text.strip())
    
        for link in all_imgs:
            links.append(link.find_all('a')[0].get("href"))
    
        for img in all_imgs:
            imgs.append(img.find_all('img')[0].get("src"))
    
        return titles, prices, commits, links, imgs
    
    if __name__ == '__main__':
        good = input("请输入你要查询的商品\n")
        links = get_good_urls(good)
    
        workbook = xlsxwriter.Workbook(good + '.xlsx') # create a new workbook
        worksheet = workbook.add_worksheet()
    
        bold = workbook.add_format({'bold': True})  # bold cell format
    
        worksheet.write('A1', 'Title', bold)        # write the header row in bold
        worksheet.write('B1', 'Price', bold)
        worksheet.write('C1', 'Commit', bold)
        worksheet.write('D1', 'Link', bold)
        worksheet.write('E1', 'Img', bold)
    
        worksheet.set_column('A:A', 100)            # set column widths
        worksheet.set_column('B:B', 10)
        worksheet.set_column('C:C', 18)
        worksheet.set_column('D:D', 27)
        worksheet.set_column('E:E', 100)
    
        all_row = 1
        col = 0
    
        for link in links:
            html = get_html(link)
            ti, pr, co, li, im = get_info(html)
    
            row = all_row
            for t in ti:
                worksheet.write(row, col, t)
                row += 1
    
            row = all_row
            for p in pr:
                worksheet.write(row, col+1, p)
                row += 1
    
            row = all_row
            for c in co:
                worksheet.write(row, col+2, c)
                row += 1
    
            row = all_row     
            for l in li:
                worksheet.write(row, col+3, l)
                row += 1
    
            row = all_row     
            for i in im:
                worksheet.write(row, col+4, i)
                row += 1
    
            all_row += len(ti)
            print('Done One page')
        workbook.close()
    
    

    Not much has changed; the requests call is simply replaced with selenium. Let's run it.

    ok!
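
    One optional refinement, not part of the script above but worth a sketch: because the prices are filled in by JavaScript, get_html can be made more robust with an explicit wait, blocking until at least one price element is present before page_source is read (class name p-price as used earlier; PhantomJS as above):

    # Sketch: get_html with an explicit wait for the JS-rendered price elements.
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def get_html(url):
        driver = webdriver.PhantomJS()
        try:
            driver.get(url)
            # wait up to 10 seconds for a price block to appear in the DOM
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "p-price"))
            )
            web_data = driver.page_source
        finally:
            driver.quit()          # always release the PhantomJS process
        return BeautifulSoup(web_data, 'lxml')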
