
Installing and Using Selenium, the Scraper's Power Tool

Author: 夜空中乄最亮的星 | Published 2021-04-14 14:27

    The setup here is CentOS 7 with a non-root user, Chrome in headless mode, and Python 3.

    Download and install the latest Chrome:

    sudo yum install https://dl.google.com/linux/direct/google-chrome-stable_current_x86_64.rpm
    

    The version installed on this machine is google-chrome-stable-89.0.4389.128-1.x86_64.

    Create a symlink:

    sudo ln -s /usr/bin/google-chrome-stable /bin/chrome
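
    If you want to double-check the link from Python, os.readlink shows the immediate target without resolving further:

    import os

    # should print /usr/bin/google-chrome-stable, the target set above
    print(os.readlink('/bin/chrome'))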
    

    Download and unzip chromedriver. The chromedriver version must match the installed Chrome version (both are 89.0.4389 here):

    sudo wget http://chromedriver.storage.googleapis.com/89.0.4389.23/chromedriver_linux64.zip
    sudo unzip chromedriver_linux64.zip
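
    To verify the pairing programmatically rather than by eye, here is a minimal sketch; it assumes the /bin/chrome symlink from above and a chromedriver unpacked into the current directory, so adjust the paths to your layout:

    import subprocess

    # Print both version strings; the major versions (89 here) must match.
    for binary in ('/bin/chrome', './chromedriver'):
        out = subprocess.run([binary, '--version'],
                             stdout=subprocess.PIPE, universal_newlines=True)
        print(out.stdout.strip())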
    

    Install Selenium:

    pip3 install selenium
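
    A quick way to confirm the install and see which version pip picked up:

    import selenium
    print(selenium.__version__)  # e.g. 3.141.0 for the 3.x line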
    

    Now write a script that scrapes Sogou's WeChat official-account search. Note that the script uses the Selenium 3 find_element_by_* API, which Selenium 4 removed; if pip installed a newer release, pin selenium<4 or switch those calls to find_element(By.ID, ...).

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.common.exceptions import NoSuchElementException
    from lxml import etree
    import time
    
    # Sogou WeChat search homepage
    sougou_site='https://weixin.sogou.com/'
    
    # Launch headless Chrome
    def new_driver():
        option = Options()  # configure launch options
        option.add_argument("--headless")  # headless mode
        option.add_argument("--disable-gpu")
        option.add_argument('--start-maximized')  # maximize the window; without this, element lookups can fail
        option.add_argument("--blink-settings=imagesEnabled=false")  # skip loading images to speed things up
        option.add_argument('--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"')
        return webdriver.Chrome(executable_path='/home/soft/selenium/chromedriver', options=option)
    # Quit Chrome and end the session
    def close_driver(driver):
        driver.quit()
    
    # Fetch the HTML of the account search-result list
    def get_html(driver,keywords):
        print(driver)
        #driver.implicitly_wait(15)
        driver.get(sougou_site)
        html = None
        try:        
            ele_input_id = driver.find_element_by_id('query')
            search_btn = driver.find_element_by_class_name('swz2')
            ele_input_id.send_keys(keywords)
            search_btn.click()
            news_box = driver.find_element_by_class_name('news-box')
            html = news_box.get_attribute('innerHTML')
            
        except NoSuchElementException as e:
            print(e)
        finally:
            return html
    
    # Parse the account-list HTML
    def html_parse(html):
        results = []
        doc = etree.HTML(html)
        all_div = doc.xpath('//li')
        for row in all_div:
            wxhl = row.xpath('.//div[@class="txt-box"]/p[@class="info"]/label[@name="em_weixinhao"]/text()')
            # WeChat ID
            wxh = ''.join(wxhl)
            kname = row.xpath('.//div[@class="txt-box"]/p[@class="tit"]/a/em/text()')
            ktitle = row.xpath('.//div[@class="txt-box"]/p[@class="tit"]/a/text()')
            # official-account name (highlighted match plus remaining text)
            name = ''.join(kname + ktitle)
            dl2 = row.xpath('.//dl')[2]
            # title of the most recent article
            title = ''.join(dl2.xpath('.//dd/a/text()'))
            # link to the most recent article (a temporary Sogou redirect)
            link = ''.join(dl2.xpath('.//dd/a/@href'))
            item = {'wxh': wxh, 'name': name, 'title': title, 'link': link}
            results.append(item)
        return results
            
    # Resolve the real WeChat article URL
    def get_real_url(driver,link):
        driver.implicitly_wait(15)
        driver.get(sougou_site+link)
        try:
            driver.find_element_by_id("activity-detail")
        except NoSuchElementException as e:
            print(e)
        finally:
            return wxurl_filter(driver.current_url)
    
    # Keep only genuine mp.weixin.qq.com URLs
    def wxurl_filter(url):
        if url.startswith('https://mp.weixin.qq.com'):
            return url
        else:
            return ''
    
    driver = new_driver()
    html = get_html(driver,"百度")
    print(html is None)  # quick check that the fetch succeeded
    #accounts = html_parse(html)
    #wxurl = get_real_url(driver,accounts[0]['link'])
    #print(accounts)
    #close_driver(driver)
    

    The above is sample code; with minor tweaks it is ready to use.
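
    For example, here is a minimal end-to-end run of the functions above, with the sample's commented-out tail enabled and a guard in case the search returns nothing ("百度" is just an example keyword):

    driver = new_driver()
    html = get_html(driver, "百度")
    if html:
        accounts = html_parse(html)
        print(accounts)
        if accounts:
            # resolve the first result's temporary Sogou link to the real URL
            print(get_real_url(driver, accounts[0]['link']))
    close_driver(driver)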
