当前安装环境是centos7,非root
用户,使用chrome的headless模式, python3
下载并安装chrome最新版
sudo yum install https://dl.google.com/linux/direct/google-chrome-stable_current_x86_64.rpm
本机当前版本为:google-chrome-stable-89.0.4389.128-1.x86_64
创建软链接:
sudo ln -s /usr/bin/google-chrome-stable /bin/chrome
下载chromedriver,并解压 。必须注意chromedriver和chrome的版本要一致
sudo wget https://chromedriver.storage.googleapis.com/89.0.4389.23/chromedriver_linux64.zip
sudo unzip chromedriver_linux64.zip
安装selenium
pip3 install selenium
编写脚本抓取搜狗微信公众号
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from lxml import etree
import time
#搜狗微信官网地址
sougou_site='https://weixin.sogou.com/'
#打开chrome
def new_driver():
option = Options() # 1.配置参数
option.add_argument("--headless") # 无头模式
option.add_argument("--disable-gpu")
option.add_argument('--start-maximized') #最大化运行(全屏窗口),不设置,取元素会报错
option.add_argument("--blink-settings=imagesEnabled=false")# 不加载图片, 提升速度
option.add_argument('--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"')
return webdriver.Chrome(executable_path='/home/soft/selenium/chromedriver',chrome_options=option)
#关闭chrome
def close_driver(dirver):
dirver.close()
#获取公众号列表html结构
def get_html(driver,keywords):
print(driver)
#driver.implicitly_wait(15)
driver.get(sougou_site)
html = None
try:
ele_input_id = driver.find_element_by_id('query')
search_btn = driver.find_element_by_class_name('swz2')
ele_input_id.send_keys(keywords)
search_btn.click()
news_box = driver.find_element_by_class_name('news-box')
html = news_box.get_attribute('innerHTML')
except NoSuchElementException as e:
print(e)
finally:
return html
#解析公众号html列表
def html_parse(html):
list=[]
doc = etree.HTML(html)
all_div = doc.xpath('//li')
for row in all_div:
wxhl = row.xpath('.//div[@class="txt-box"]/p[@class="info"]/label[@name="em_weixinhao"]/text()')
#微信号
wxh=''.join(wxhl)
kname = row.xpath('.//div[@class="txt-box"]/p[@class="tit"]/a/em/text()')
ktitle = row.xpath('.//div[@class="txt-box"]/p[@class="tit"]/a/text()')
#公众号名字
name = ''.join(kname+ktitle)
dl2 = row.xpath('.//dl')[2]
#最近文章标题
title = ''.join(dl2.xpath('.//dd/a/text()'))
#最近文章链接
link = ''.join(dl2.xpath('.//dd/a/@href'))
item={'wxh':wxh,'name':name,'title':title,'link':link}
list.append(item)
return list
#获取真实的微信url
def get_real_url(driver,link):
driver.implicitly_wait(15)
driver.get(sougou_site+link)
try:
driver.find_element_by_id("activity-detail")
except NoSuchElementException as e:
print(e)
finally:
return wxurl_filter(driver.current_url)
#过滤微信url
def wxurl_filter(url):
if url.startswith('https://mp.weixin.qq.com'):
return url
else:
return ''
driver = new_driver()
html = get_html(driver,"百度")
print(html is None)
#list = html_parse(html)
#wxurl = get_real_url(driver,list[0]['link'])
#print(list)
#close_driver(driver)
以上是示例代码,稍作修改即可
网友评论