# 之前都是用requests+bs4+re来写爬虫, 改用selenium试一下
from datetime import datetime, timedelta

import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class SpiderQunar:
    """Scrape round-trip flight schedules from flight.qunar.com via Selenium.

    The constructor drives the whole workflow: launch Chrome, fill in the
    search form, submit it, parse the result page, and expose three
    DataFrames:

    * ``go_df``   -- outbound flights
    * ``back_df`` -- return flights
    * ``df``      -- both legs concatenated side by side

    Parameters (all keyword, all with defaults): departure/arrival city,
    departure/return date (``YYYY-MM-DD``; defaults to tomorrow / the day
    after), the search URL, the Excel output path, an optional list of
    Chrome command-line switches, and the chromedriver path.
    """

    def __init__(self,
                 city1='上海',
                 city2='广州',
                 date1='',
                 date2='',
                 url='https://flight.qunar.com/',
                 save_path=r'd:\Users\GMCC\Desktop\test.xlsx',
                 s_options=None,
                 exec_path="c:/users/python_package/chromedriver.exe"):
        # When no departure date is given, default to leaving tomorrow and
        # returning the day after.  Bug fix: the original used
        # ``today.replace(day=today.day + 1)``, which raises ValueError on
        # the last day(s) of a month; timedelta handles month/year rollover.
        if date1 == '':
            today = datetime.now().date()
            date1 = str(today + timedelta(days=1))
            date2 = str(today + timedelta(days=2))
        self.go_df = None    # outbound flights (filled below)
        self.back_df = None  # return flights (filled below)
        self.df = None       # combined view (filled below)
        self.save_path = save_path
        self.browser = self.browser_init(exec_path, s_options)
        self.search_content = self.search_flight(url, city1, city2, date1, date2)
        self.go_df, self.back_df, self.df = self.paser(self.search_content)

    @staticmethod
    def browser_init(exec_path, s_options):
        """Create the Chrome driver, applying optional command-line switches.

        ``s_options`` is a list of Chrome arguments, e.g. ``['--headless']``
        for a headless browser.
        """
        if isinstance(s_options, list) and len(s_options) > 0:
            options = Options()
            for each in s_options:
                options.add_argument(each)
            # Bug fix: the original built ``options`` but never passed it to
            # the driver, so every configured switch was silently ignored.
            return webdriver.Chrome(executable_path=exec_path, options=options)
        return webdriver.Chrome(executable_path=exec_path)

    def search_flight(self, url, city1, city2, date1, date2):
        """Fill in and submit the search form; return the result page HTML.

        The browser is always quit before this method returns, even when an
        element lookup fails part-way through (the original leaked the
        browser process on any error before the final ``quit()``).
        """
        try:
            self.browser.get(url)
            # Departure city
            fromcity = self.browser.find_element(By.NAME, "fromCity")
            fromcity.clear()
            fromcity.send_keys(city1)
            # Arrival city
            tocity = self.browser.find_element(By.NAME, "toCity")
            tocity.clear()
            tocity.send_keys(city2)
            # Outbound date -- set via JS because typing into the picker
            # field is unreliable
            fromdate = self.browser.find_element(By.ID, 'fromDate')
            self.browser.execute_script("arguments[0].value=arguments[1]", fromdate, date1)
            # Click the return-date box to activate the round-trip option
            self.browser.find_element(
                By.XPATH,
                '//div[contains(@class,"qcbox") '
                'and contains(@class,"qdate")'
                'and contains(@class,"toD")'
                'and contains(@class,"qcbox_disable")]').click()
            # Return date
            todate = self.browser.find_element(By.ID, 'toDate')
            self.browser.execute_script("arguments[0].value=arguments[1]", todate, date2)
            # Submit the search
            sub_button = self.browser.find_element(
                By.XPATH,
                '//button[@class="btn_search" and @data-track="key=101020008&val=国内搜索"]')
            sub_button.click()
            # A "popular airports" popup sometimes covers the button after
            # the first click; close it and click again.  Its absence is the
            # normal case, so NoSuchElementException is swallowed (the
            # original try/finally had no except and crashed when the popup
            # did not appear).
            try:
                self.browser.find_element(By.ID, "closeXI20").click()
                sub_button.click()
            except NoSuchElementException:
                pass
            finally:
                print("Redirecting..")
            # Wait until the return-flight list has rendered; on timeout we
            # deliberately fall through and scrape whatever has loaded (the
            # original captured the page in a finally but then re-raised the
            # timeout, so the content was never returned).
            try:
                WebDriverWait(self.browser, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH,
                         "//div[contains(@class,'list-ct') and contains(@class,'back-list')]//p[@class='price-desc']")))
            except TimeoutException:
                pass
            content = self.browser.page_source
        finally:
            # Always release the browser window and the driver process.
            self.browser.quit()
        return content

    @staticmethod
    def _get_sche_info(content):
        """Parse one flight list (go or back) into a DataFrame.

        ``content`` is the serialized HTML of a single
        ``div.list-ct.go-list`` / ``div.list-ct.back-list`` element.  Each
        schedule card is parsed once (the original re-serialized and
        re-parsed every card for every single field).
        """
        root = etree.HTML(content)
        lefts = root.xpath('//div[@class="left"]')    # schedule details
        rights = root.xpath('//div[@class="right"]')  # price column
        # Departure / arrival time
        dep_time = [e.xpath(".//div[@class='dep']/p[@class='time']/text()")[0]
                    for e in lefts]
        arr_time = [e.xpath(".//div[@class='arr']/p[@class='time']/text()")[0]
                    for e in lefts]
        # Departure / arrival airport (terminal spans joined together)
        dep_airport = [''.join(e.xpath(".//div[@class='dep']/p[@class='airport']/span/text()"))
                       for e in lefts]
        arr_airport = [''.join(e.xpath(".//div[@class='arr']/p[@class='airport']/span/text()"))
                       for e in lefts]
        # Overnight-flight marker (empty string when absent)
        cross_day = [''.join(e.xpath(".//div[@class='arr']/p[@class='cross-day']/text()"))
                     for e in lefts]
        # Airline logo URL
        sub_info1_logo = [e.xpath(".//div[@class='sub-info']/img[@class='air-logo']/@src")[0]
                          for e in lefts]
        # Flight number
        sub_info2_flight = [e.xpath(".//div[@class='sub-info']/span/text()")[0]
                            for e in lefts]
        # Code-share marker (empty string when absent)
        sub_info3_share = [''.join(e.xpath(".//div[@class='sub-info']/span[@class='share']/text()"))
                           for e in lefts]
        # Flight duration
        sub_info4_during = [e.xpath(".//div[@class='sub-info']/span[@class='dur']/text()")[0]
                            for e in lefts]
        # Total price; strip the non-breaking space between symbol and amount
        total_p = [''.join(e.xpath(".//p[@class='price']//span/text()")).replace('\xa0', '')
                   for e in rights]
        columns = ['起飞时间', '到达时间', '起飞机场', '到达机场',
                   '是否过夜航班', '航空公司logo', '航班号', '是否共享航班',
                   '时长', '价格']
        data = list(zip(dep_time, arr_time, dep_airport, arr_airport,
                        cross_day, sub_info1_logo, sub_info2_flight,
                        sub_info3_share, sub_info4_during, total_p))
        # Bug fix: the original suffixed the *back* list with '_去程'
        # (outbound) and the go list with '_返程' (return) -- swapped.
        if 'go-list' in root.xpath('//div/@class')[0]:
            suffix = '_去程'
        else:
            suffix = '_返程'
        return pd.DataFrame(data=data, columns=[each + suffix for each in columns])

    def paser(self, content):
        """Split the result page into go/back lists; return three DataFrames.

        Returns ``(go_df, back_df, combined_df)``.  (Method name kept as
        ``paser`` -- a typo for ``parser`` -- for backward compatibility
        with existing callers.)
        """
        tree = etree.HTML(content)

        def _serialize(list_class):
            # Extract the first list container carrying ``list_class`` and
            # re-serialize it so _get_sche_info sees only that sub-tree.
            node = tree.xpath(
                '//div[contains(@class,"list-ct") and contains(@class,"'
                + list_class + '")]')[0]
            return etree.tostring(node, encoding='utf8').decode()

        go_df = self._get_sche_info(_serialize('go-list'))      # outbound
        back_df = self._get_sche_info(_serialize('back-list'))  # return
        # Visual separator column between the two legs in the combined view
        go_df[' '] = '|'
        df = pd.concat([go_df, back_df], axis=1)
        return go_df, back_df, df

    def save_files(self, df):
        """Write *df* to ``self.save_path`` as an Excel file, no index column."""
        df.to_excel(self.save_path, index=False)
if __name__ == "__main__":
    # Example run: Tianjin -> Harbin round trip, written to the default
    # save path.  Guarded so importing this module does not launch Chrome.
    spider = SpiderQunar(city1='天津', city2='哈尔滨')
    spider.save_files(spider.df)
# (原文末尾的"网友评论"区块已移除 -- blog-page scrape residue, not code)