美文网首页
四、selenium+beautifulsoup4获取淘宝商品评论

四、selenium+beautifulsoup4获取淘宝商品评论

作者: 雷动软件 | 来源:发表于2019-12-16 12:10 被阅读0次

    概述

    • 使用firefox浏览器
    • 使用selenium时,只能使用扫码登录,不能使用用户名密码登录
    • 使用用户名密码登录时会提示"==哎呀,出错了,点击刷新再来一次(error:9PiETg)=="
    • 扫码登录后可以将cookie保存,下一次可以使用cookie登录,避免每次运行都要扫码
    • 保存的cookie会有失效时间
    • 支持淘宝链接,天猫链接和短链接
    • 爬取时还有获取页面元素不稳定的问题,暂时没找到好办法,只能多试几次
    • 商品评论的页数只能到99页,多于99页会提示错误
    • 翻页太快会导致出现验证,并且验证时会提示"==哎呀,出错了,点击刷新再来一次(error:9PiETg)=="

    效果

    评论内容 评论图片

    实现

    源码文件

    源码
    文件 介绍
    main.py 爬虫入口,创建保存目录,扫码登录或cookie登录,开启爬虫
    core.py 爬虫判断,根据url创建TMall或者Taobao爬虫
    spider文件夹 taobaoSpider.py和tmallSpider.py 实现了淘宝商品评论爬取和天猫商品评论爬取
    browser.py WebDriver FireFox抽象
    util.py 工具类
    settings.py 配置如扫码登录还是cookie,图片存储目录,url等

    main.py

    # coding=utf-8
    import json
    import os
    
    import settings
    import util
    from core import Crawler
    from browser import FirefoxBrowser
    
    # Create the directory that downloaded review images are stored in.
    util.mkStorePath(settings.STORE_PATH)

    firefox = FirefoxBrowser()
    if settings.QRCODE:
        # QR-code login: scan once, then persist the cookies so later runs
        # can log in without scanning (saved cookies do expire eventually).
        cookies = firefox.get_cookies(settings.LOGIN_URL)
        with open("cookies_tao.json", "w") as fp:
            fp.write(json.dumps(cookies))
        print("cookie file done")
    else:
        # Cookie login: replay the cookies saved by a previous QR-code run.
        firefox.get(settings.LOGIN_URL)
        if os.path.exists('cookies_tao.json'):
            with open("cookies_tao.json", "r", encoding="utf8") as fp:
                cookies = json.loads(fp.read())
                firefox.set_cookies(cookies)

    # Crawl the review pages of every configured item url.
    failedList = []
    # First pass; element lookups are flaky, so remember failures.
    for url in settings.URLS:
        isSuccess = Crawler(url, firefox).start()
        if isSuccess is False:
            failedList.append(url)
    # Retry each failed url once.
    for url in failedList:
        Crawler(url, firefox).start()

    firefox.close()
    

    core.py

    # coding=utf-8
    import os
    from time import sleep
    
    from spider.tmallSpider import TmallSpider
    from spider.taobaoSpider import TaobaoSpider
    import settings
    
    class Crawler(object):
        """Dispatch a product url to the matching spider.

        _type: 1 = Tmall detail page, 2 = Taobao item page,
               0 = short link (m.tb.cn) that redirects to one of the above.
        """

        def __init__(self, target_url, firefoxBrowser):
            """Open *target_url* in the browser and record the resolved url.

            :param target_url: Tmall / Taobao / m.tb.cn short link.
            :param firefoxBrowser: FirefoxBrowser wrapper around a WebDriver.
            :raises ValueError: if the url matches no supported pattern
                (previously this crashed later with an AttributeError
                because self._type was never assigned).
            """
            # TODO: validate that the url is actually reachable.
            self._firefox = firefoxBrowser
            if target_url.find('detail.tmall.com') != -1:
                self._type = 1
            elif target_url.find('item.taobao.com') != -1:
                self._type = 2
            elif target_url.find('m.tb.cn') != -1:
                self._type = 0
            else:
                raise ValueError('unsupported url: ' + target_url)
            self._firefox.get(target_url)
            if self._type == 0:
                # Short link: wait (up to 300s) for the redirect to settle.
                self._firefox._wait_url(target_url, 300)
            self._url = self._firefox.driver().current_url

        def start(self):
            """Run the spider matching the resolved url.

            :return: True on success, False on failure; None when the
                resolved url is neither a Tmall nor a Taobao detail page.
            """
            if self._url.find('detail.tmall.com') != -1:
                return TmallSpider(self._firefox).start()
            elif self._url.find('item.taobao.com') != -1:
                return TaobaoSpider(self._firefox).start()
    

    spider/tmallSpider.py

    # coding=utf-8
    import time
    from bs4 import BeautifulSoup
    from urllib.request import urlretrieve
    
    import settings
    import util
    from util import ElementFilter
    
    class TmallSpider(object):
        """Crawl the picture reviews of the Tmall item currently open in the browser."""

        def __init__(self, firefoxBrowser):
            self._firefox = firefoxBrowser
            self._surl = self._firefox.driver().current_url
            # The item id doubles as the per-item image sub-directory name.
            self._sid = util.getIdAndMkdir(self._surl, settings.STORE_PATH)
            self._item = {}
            self._rate = []

        def start(self):
            """Crawl every review page of the item.

            :return: True - crawl finished, False - crawl failed
                (required page elements could not be located).
            """
            print('start tmallSpider ' + str(self._sid))
            # Item title.
            print('get Title')
            self._item['title'] = self._firefox.get_element(ElementFilter.tm_dict['Title']).text
            # Scroll the view containing the review tab into sight so it renders.
            print('get JTabBarBox')
            element = self._firefox.get_element(ElementFilter.tm_dict['JTabBarBox'])
            self._firefox.driver().execute_script("arguments[0].scrollIntoView()", element)
            # Container holding the review tab; element lookups here are flaky.
            print('get JTabbar')
            jtabbar = self._firefox.get_element_without_wait(ElementFilter.tm_dict['JTabbar'])
            if jtabbar is None:
                print('JTabbar not found')
                return False
            jtabbar.click()
            time.sleep(5)
            # Radio button filtering to reviews that contain pictures.
            print('get JReviews')
            jreviews = self._firefox.get_element_without_wait(ElementFilter.tm_dict['JReviews'])
            if jreviews is None:
                print('JReviews not found')
                return False
            jreviews.click()
            time.sleep(5)
            # Walk the review pages; the site caps pagination at 99 pages
            # (the original range(1, 99) stopped one page short at 98).
            for num in range(1, 100):
                self._rate.append(self.parse(self._firefox.driver().page_source))
                print('page' + str(num))
                isLast = self._firefox.get_next_page_tmall('下一页>>')
                if isLast is False:
                    break
                # Throttle: paging too fast triggers the captcha page.
                time.sleep(5)
            self._item['rates'] = self._rate
            return True

        def parse(self, html):
            """Parse one review page and download its images.

            :param html: page source of the current review page.
            :return: list of dicts, one per review row; empty if the review
                grid is absent (page not fully rendered).
            """
            bs4 = BeautifulSoup(html, "html.parser")
            div_rate = bs4.find("div", class_="rate-grid")
            items = []
            if div_rate is None:
                # Guard: the original crashed with AttributeError here when
                # the grid had not rendered yet.
                return items
            # One <tr> per review.
            for tr in div_rate.select('tr'):
                item = {}

                # td.col-author: reviewer name, masked ('a***b') and raw forms.
                td3 = tr.select_one('td.col-author')
                contents = td3.select_one('div.rate-user-info').contents
                item['author'] = contents[0].strip() + "***" + contents[2].strip()
                item['rauthor'] = contents[0].strip() + contents[2].strip()

                # td.tm-col-master: review text and picture urls.
                td1 = tr.select_one('td.tm-col-master')
                # tm-rate-premiere is present when there is a follow-up
                # review, otherwise the row only has tm-rate-content.
                premiere = td1.select_one('div.tm-rate-premiere')
                if premiere is not None:
                    print('premiere')
                    # Initial review text (the original overwrote this with
                    # the raw contents list and then with the follow-up text).
                    item['tm-rate-fulltxt'] = self._fulltxt(premiere)
                    # Initial review date.
                    item['tm-rate-date'] = premiere.select_one('div.tm-rate-date').contents[0].strip()
                    datasrc = self._downloadImgs(premiere, item['rauthor'])
                    # Follow-up review text and pictures.
                    append = td1.select_one('div.tm-rate-append')
                    item['append-rate-fulltxt'] = self._fulltxt(append)
                    datasrc.extend(self._downloadImgs(append, item['rauthor']))
                    item['tm-m-photos'] = datasrc
                else:
                    content = td1.select_one('div.tm-rate-content')
                    item['tm-rate-fulltxt'] = self._fulltxt(content)
                    item['tm-m-photos'] = self._downloadImgs(content, item['rauthor'])
                    item['tm-rate-date'] = td1.select_one('div.tm-rate-date').contents[0].strip()

                # td.col-meta: colour and size attributes of the purchase.
                td2 = tr.select_one('td.col-meta div.rate-sku')
                ps = td2.select('p')
                item['color'] = ps[0]['title']
                item['size'] = ps[1]['title']

                items.append(item)
            return items

        def _fulltxt(self, container):
            """Return the review text from *container*'s div.tm-rate-fulltxt.

            When an expander node precedes the text the text is the second
            child, otherwise it is the first.
            """
            contents = container.select_one('div.tm-rate-fulltxt').contents
            if len(contents) > 1:
                return contents[1].strip()
            return contents[0].strip()

        def _downloadImgs(self, container, author):
            """Download each non-placeholder image in *container*'s <li> list.

            :return: list of the full-size image urls that were downloaded.
            """
            datasrc = []
            for li in container.select('li'):
                srcLi = li.attrs['data-src']
                if srcLi.endswith(".png"):
                    # .png entries are placeholder thumbnails, not photos.
                    continue
                datasrc.append(self.parseImg(srcLi, author))
            return datasrc

        def parseImg(self, picUrl, author):
            """Download the full-size image for *picUrl*; return its http url.

            Tmall thumbnail urls carry a 12-character size suffix (e.g.
            '_400x400.jpg'); stripping it yields the original image url.
            """
            picTemp = picUrl.rpartition('/')[2]
            picDes = settings.STORE_PATH + '/' + self._sid + "/" + author + '_' + picTemp[:len(picTemp) - 12]
            picAll = "http:" + picUrl[:len(picUrl) - 12]
            urlretrieve(picAll, picDes)
            return picAll
    

    参考

    selenium文档
    Beautiful Soup 4.4.0 文档
    CSS 选择器参考手册
    selenium爬取淘宝评论信息
    python +Selenium 爬取淘宝商品评论

    相关文章

      网友评论

          本文标题:四、selenium+beautifulsoup4获取淘宝商品评论

          本文链接:https://www.haomeiwen.com/subject/fynrnctx.html