
Installing and Using Selenium, the Web-Scraping Power Tool

Author: 夜空中乄最亮的星 | Published 2021-04-14 14:27

Environment: CentOS 7, non-root user, Chrome in headless mode, Python 3.

Download and install the latest version of Chrome

sudo yum install https://dl.google.com/linux/direct/google-chrome-stable_current_x86_64.rpm

The version installed on this machine is google-chrome-stable-89.0.4389.128-1.x86_64.

Create a symlink:

sudo ln -s /usr/bin/google-chrome-stable /bin/chrome

Download chromedriver and unzip it. Important: the chromedriver version must match your installed Chrome version; a quick way to verify this is sketched after the commands below.

sudo wget http://chromedriver.storage.googleapis.com/89.0.4389.23/chromedriver_linux64.zip
sudo unzip chromedriver_linux64.zip
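
If you want to confirm the two versions actually match, here is a minimal Python sketch (not from the original article; it assumes the chrome symlink created above and the chromedriver binary unzipped into the current directory):

import subprocess

# Print and compare the major versions of Chrome and chromedriver.
# Paths are assumptions: 'chrome' is the symlink created above,
# './chromedriver' is the freshly unzipped binary.
def version_of(cmd):
    out = subprocess.run(cmd, capture_output=True, text=True).stdout
    # output looks like 'Google Chrome 89.0.4389.128' or 'ChromeDriver 89.0.4389.23 (...)'
    return next(tok for tok in out.split() if tok[0].isdigit())

chrome_ver = version_of(['chrome', '--version'])
driver_ver = version_of(['./chromedriver', '--version'])
print('chrome:', chrome_ver, '| chromedriver:', driver_ver)
print('major versions match:', chrome_ver.split('.')[0] == driver_ver.split('.')[0])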

Install Selenium:

pip3 install selenium
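
Before writing the full script, a quick smoke test can confirm that Selenium, Chrome, and chromedriver cooperate. This is a minimal sketch using the Selenium 3.x API and the chromedriver path used later in this article:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# minimal headless check; the chromedriver path matches the one used below
option = Options()
option.add_argument('--headless')
option.add_argument('--disable-gpu')
driver = webdriver.Chrome(executable_path='/home/soft/selenium/chromedriver', options=option)
driver.get('https://weixin.sogou.com/')
print(driver.title)  # a non-empty title means the whole stack works
driver.quit()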

Write a script to scrape WeChat official accounts from Sogou

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from lxml import etree

# Sogou WeChat search homepage
sougou_site = 'https://weixin.sogou.com/'

# Open Chrome
def new_driver():
    option = Options()  # 1. configure launch options
    option.add_argument("--headless")  # headless mode
    option.add_argument("--disable-gpu")
    option.add_argument('--start-maximized')  # maximize the window; without this, element lookups can fail
    option.add_argument("--blink-settings=imagesEnabled=false")  # skip loading images to speed things up
    option.add_argument('--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"')
    return webdriver.Chrome(executable_path='/home/soft/selenium/chromedriver', options=option)
# Close Chrome
def close_driver(driver):
    driver.close()

# Fetch the HTML fragment containing the list of official accounts
def get_html(driver, keywords):
    print(driver)
    #driver.implicitly_wait(15)
    driver.get(sougou_site)
    html = None
    try:
        ele_input_id = driver.find_element_by_id('query')
        search_btn = driver.find_element_by_class_name('swz2')
        ele_input_id.send_keys(keywords)
        search_btn.click()
        news_box = driver.find_element_by_class_name('news-box')
        html = news_box.get_attribute('innerHTML')
    except NoSuchElementException as e:
        print(e)
    finally:
        return html

# Parse the HTML list of official accounts
def html_parse(html):
    results = []
    doc = etree.HTML(html)
    all_li = doc.xpath('//li')
    for row in all_li:
        wxhl = row.xpath('.//div[@class="txt-box"]/p[@class="info"]/label[@name="em_weixinhao"]/text()')
        # WeChat ID
        wxh = ''.join(wxhl)
        kname = row.xpath('.//div[@class="txt-box"]/p[@class="tit"]/a/em/text()')
        ktitle = row.xpath('.//div[@class="txt-box"]/p[@class="tit"]/a/text()')
        # official-account name
        name = ''.join(kname + ktitle)
        dl2 = row.xpath('.//dl')[2]
        # title of the most recent article
        title = ''.join(dl2.xpath('.//dd/a/text()'))
        # link to the most recent article
        link = ''.join(dl2.xpath('.//dd/a/@href'))
        item = {'wxh': wxh, 'name': name, 'title': title, 'link': link}
        results.append(item)
    return results
        
# Resolve the real WeChat article URL
def get_real_url(driver, link):
    driver.implicitly_wait(15)
    driver.get(sougou_site + link)
    try:
        driver.find_element_by_id("activity-detail")
    except NoSuchElementException as e:
        print(e)
    finally:
        return wxurl_filter(driver.current_url)

# Keep only genuine WeChat URLs
def wxurl_filter(url):
    if url.startswith('https://mp.weixin.qq.com'):
        return url
    else:
        return ''

driver = new_driver()
html = get_html(driver, "百度")
print(html is None)
#results = html_parse(html)
#wxurl = get_real_url(driver, results[0]['link'])
#print(results)
#close_driver(driver)

The above is sample code; adapt it slightly for your own use.
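
One caveat: the script relies on implicit waits and immediate element lookups, which can be flaky when pages load slowly. A common hardening, sketched below as a drop-in alternative (not from the original article) to the find_element_by_id('query') call in get_html(), is an explicit wait:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 15 seconds for the search box to appear instead of failing immediately
ele_input_id = WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.ID, 'query'))
)

The same pattern applies to the 'swz2' button and the 'news-box' container.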
