Python Ctrip Crawler Development Notes

Author: freesan44 | Published 2018-07-06 11:34

    Preface

    I recently bought 《Python3 爬虫、数据清洗与可视化实战》 (Python 3 Web Scraping, Data Cleaning and Visualization in Practice), and since it happened to be summer vacation, I tried scraping Ctrip's pages for around-the-city (weekend getaway) tour products departing from Guangzhou.
    Having studied Python for less than a week, I am not yet familiar with Python naming conventions, so I simply reused the naming conventions from my previous iOS development work. Please point out anything that could be improved.

    I. Preparation

    1. Main libraries used

    from bs4 import BeautifulSoup
    import time
    import re  # regular expressions
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.action_chains import ActionChains  # browser actions
    import xlrd
    import xlwt
    from xlutils.copy import copy
    import os
    

    BeautifulSoup: locates and extracts data from HTML tags
    selenium: launches the browser and automates the page interactions
    time: pauses/waits between operations
    xlrd, xlwt, xlutils: read and write the scraped results to Excel files

    2. Core workflow

    1. Go to the around-the-city (weekend getaway) page for the departure city (Guangzhou)
    2. On the landing page, capture the recommended popular destinations and attractions and save them
    3. Iterate over those destinations and search for the tour products each one lists
    4. Extract the data fields of each product
    5. Save the data
    6. Quit the browser (a minimal sketch of how these steps fit together follows this list)
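
    Below is a minimal sketch of how these steps could be wired together, using the functions defined in the next section. The departure city, the call order and the final driver.quit() are assumptions based on the snippets in this post rather than a verbatim copy of the repository's main routine; step 5 (saving) is handled by the ExcelFileManager class shown later.

    def main():
        setupDriverSetting()                       # 1. open the around-the-city landing page
        select_StartPlace("广州")                   # 1. choose Guangzhou as the departure city
        allDestinations = finAllDestinationPage()  # 2. capture the recommended destinations
        for typeName, destList in allDestinations.items():
            for destination in destList:
                # 3-4. search each destination and walk through its tour products
                jump_destinationPage("广州", destination)
        driver.quit()                              # 6. quit the browser

    if __name__ == '__main__':
        main()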

    II. Code

    1. Launching the browser

    def setupDriverSetting():
        global driver
        # url = 'http://m.ctrip.com/restapi/soa2/10290/createclientid?systemcode=09&createtype=3&conte'  # get a cookie ID
        # mobile site
        # url = 'https://m.ctrip.com/webapp/vacations/tour/list?tab=64&kwd=%E7%8F%A0%E6%B5%B7&salecity=32&searchtype=tour&sctiy=32'
        # desktop site
        url = 'https://weekend.ctrip.com/around/'
        # launch with Chrome
        driver = webdriver.Chrome()
        # # set the Firefox user-agent request header instead
        # profile = webdriver.FirefoxProfile()
        # user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0"
        # profile.set_preference("general.useragent.override",user_agent)
        #
        # driver = webdriver.Firefox(profile)
        driver.get(url)
    

    Use webdriver to launch Chrome or Firefox and open the landing-page URL.
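
    If you don't need to watch the browser while it runs, Chrome can also be started headless. A minimal sketch, assuming a reasonably recent Chrome/chromedriver and Selenium (this variant is not part of the original code):

    def setupHeadlessDriver(url):
        # '--headless' runs Chrome without opening a visible window
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(options=options)  # older Selenium releases use chrome_options= instead
        driver.get(url)
        return driver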

    2. Selecting the departure city

    def select_StartPlace(startPlace):

        # click the departure-city selector
        driver.find_element_by_xpath("//*[@id='CitySelect']").click()
        # pick the departure city from the dropdown list
        cityList = driver.find_elements_by_xpath("//*[@id='CitySelect']/dd/ul")
        for link in cityList:
            links = link.find_elements(By.TAG_NAME, "a")
            for eachCity in links:
                cityStr = eachCity.text
                if cityStr == startPlace:
                    print("Found the target city: " + eachCity.get_attribute('href'))
                    driver.get(eachCity.get_attribute('href'))
                    time.sleep(2)
                    try:
                        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='SearchText']")))
                    except:
                        print('The departure-city page failed to load')

                    break
    

    The main idea is to use find_element_by_xpath to locate and filter for the target city, then jump to that city's dedicated page.
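
    If the link text in the city list is unique, the two nested loops could in principle be collapsed into a single text-matching XPath. A small sketch (the XPath below is an assumption about the dropdown's structure, not taken from the original code):

    def select_StartPlace_direct(startPlace):
        # hypothetical shortcut: find the <a> whose visible text equals the city name
        xpath = "//*[@id='CitySelect']//a[text()='{}']".format(startPlace)
        cityLink = driver.find_element_by_xpath(xpath)
        driver.get(cityLink.get_attribute('href'))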

    3. Finding destinations

    def finAllDestinationPage():
        # find the container that holds all recommended destinations
        destType = driver.find_element_by_id("J_sub_circum")  # this id decides the product scope (around-the-city vs. overseas)
        print(destType.text)
        destType1 = destType.find_element_by_class_name("side_jmp_dest")
        destTypeItem = destType1.get_attribute('innerHTML')
        item = BeautifulSoup(destTypeItem, 'lxml')
        destTypeList = item.find_all('li')
        allDestinationListDic = {}
        for each in destTypeList:
            typeName = each.h4.string
            typeList = each.find_all('a')
            destNames = []
            for i in typeList:
                destNames.append(i.string)
            allDestinationListDic[typeName] = destNames

        return allDestinationListDic
    

    Collect all the recommended destinations and attractions and store them in a dictionary.
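
    The returned dictionary maps each category heading to a list of destination names. A minimal usage sketch (the category and destination strings are illustrative, not real scraped values):

    allDestinations = finAllDestinationPage()
    # e.g. {'热门目的地': ['珠海', '清远'], '热门景点': ['长隆野生动物世界']}  -- made-up example
    for typeName, destNames in allDestinations.items():
        print(typeName)
        for destination in destNames:
            # each destination later becomes the search keyword in jump_destinationPage
            print('  ' + destination)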

    4. The tour-product list page

    def jump_destinationPage(startPlace, destination):
        # locate the search box
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='SearchText']")))
        except:
            print('Search box not found')
        finally:
            print('City page loaded')


        driver.find_element_by_xpath("//input[@id='SearchText']").send_keys(destination)
        print("Entered destination: " + destination)
        driver.find_element_by_xpath("//*[@id='SearchBtn']").click()
        print("Clicked the search button")
        time.sleep(2)

        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='js-dpSearcher']")))
        except:
            print('Product list page failed to load')
        finally:
            print('Product list page loaded')

        # re-select the departure city, just in case it got lost
        reSelect_StartPlace(startPlace)


        # read the total number of pages from the pagination text
        pageHtml = driver.find_element_by_xpath("//*[@id='_sort']/div/span")
        print(pageHtml.text)
        pageNumStr = pageHtml.text
        pageNumStr = pageNumStr[:-1]
        print("Extracted string: " + pageNumStr)
        # pull the page count out with a regular expression
        pageNumS = re.findall(r'\d+', pageNumStr)
        pageNum = int(pageNumS[1])
        print(pageNum)

        tourProductList = []
        for i in range(0, pageNum):
            itemList = showCurrentPageAllData()

            # collect the data of every product on the current page
            for j in range(0, len(itemList)):
                eachItem = collectCurrentPageEachData(j)
                tourProductList.append(eachItem)

            # go to the next page (note: on the last iteration this still tries to move one page past the end)
            driver.find_element_by_xpath("//input[@id='ipt_page_txt']").clear()
            driver.find_element_by_xpath("//input[@id='ipt_page_txt']").send_keys(str(i+2))
            driver.find_element_by_xpath("//*[@id='ipt_page_btn']").click()
            print("Moved to page " + str(i+2))
            time.sleep(2)
        return driver
    

    Jump to the product list page, read the total number of pages from the page tags, walk through all tour products on the current page, then move to the next page and repeat.
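
    The total page count is pulled out of the pagination text with re.findall. A small sketch of just that parsing step (the sample string is an assumption about the page's format, chosen so that the second number is the total page count, which is what the [1] index in the code implies):

    # hypothetical pagination text such as "1/13页"
    pageNumStr = "1/13页"
    pageNumStr = pageNumStr[:-1]               # strip the trailing character -> "1/13"
    pageNumS = re.findall(r'\d+', pageNumStr)  # -> ['1', '13']
    pageNum = int(pageNumS[1])                 # -> 13 total pages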

    5. Extracting the product data

    def collectCurrentPageEachData(itemNum):
        itemList = driver.find_elements_by_class_name("product_box")
        itemHtml = itemList[itemNum].get_attribute('innerHTML')  # convert the element to an HTML string
        # item = BeautifulSoup(itemHtml, "html.parser")  # build the item's soup object
        item = BeautifulSoup(itemHtml, "lxml")  # build the item's soup object
        # print("+++++++" + item.prettify())
        # parse the fields
        # product name
        titleNameHtml = item.find('h2', class_='product_title')
        print("-------" + titleNameHtml.get_text())
        productName = titleNameHtml.get_text()

        # product link
        productLink = titleNameHtml.a['href']
        productLink = productLink[2:]
        productLink = "https://" + productLink
        print("link:" + productLink)
        # product type
        productType = item.find('em')
        print("type:" + productType.get_text())
        productTypeStr = productType.get_text()

        # product price
        priceHtml = item.find('span', class_='sr_price')
        priceStr = priceHtml.strong.get_text()
        # if the price is purely numeric, format it to two decimal places
        if priceStr.isdigit():
            priceStr = "%.2f" % float(priceStr)
        print("price:" + priceStr)

        # product supplier
        productRetail = item.find('p', class_='product_retail')
        productRetailStr = productRetail['title']
        if "供应商" in productRetailStr:
            productRetailStr = productRetailStr[4:]

        print("retail:" + productRetailStr)
        # product rating
        try:
            gradeHtml = item.find('p', class_='grade')
            gradeStr = gradeHtml.strong.get_text()
            print("grade:" + gradeStr)
        except:
            print('Rating not found')
            gradeStr = ''
        # number of travellers (taken from the comment count)
        try:
            commentHtml = item.find('div', class_='comment')
            commentStr = commentHtml.em.get_text()
            commentNumS = re.findall(r'\d+', commentStr)
            commentNum = int(commentNumS[0])
            print("comment:", commentNum)
        except:
            print('Traveller count not found')
            commentNum = ''
        return {
            '名称': productName,
            '链接': productLink,
            '类型': productTypeStr,
            '价格': priceStr,
            '供应商': productRetailStr,
            '评分': gradeStr,
            '人数': commentNum,
        }
    

    Grab all the visible information from the product card on the list page and return it as a dictionary.
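
    The saving helper below writes rows by column index, so the dictionaries returned here presumably get flattened into plain lists first. A minimal sketch of that conversion (the column order is an assumption that simply mirrors the dictionary keys above):

    COLUMNS = ['名称', '链接', '类型', '价格', '供应商', '评分', '人数']

    def rowsFromProducts(tourProductList):
        # turn each product dict into a list ordered like the Excel header row
        return [[product.get(col, '') for col in COLUMNS] for product in tourProductList]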

    6. Saving the data

    class ExcelFileManager:
        @staticmethod
        def creatExcelFile(fileName, sheetName, headRowList):
            # path of the .xls file inside the project directory
            filePath = os.getcwd() + '/' + fileName + '.xls'
            # open the existing workbook, or create a new one if it doesn't exist yet
            try:
                oldFile = xlrd.open_workbook(filePath)
                file = copy(oldFile)
            except:
                file = xlwt.Workbook()
                print("Created a new file")

            # add the sheet, or fetch it if it already exists
            try:
                sheet1 = file.add_sheet(sheetName, cell_overwrite_ok=True)
            except:
                sheet1 = file.get_sheet(sheetName)
            # header style
            head_style = xlwt.easyxf('font: name Times New Roman, color-index red, bold on', num_format_str='#,##0.00')
            row0 = headRowList
            for i in range(0, len(row0)):
                sheet1.write(0, i, row0[i], head_style)


            print(filePath)
            file.save(filePath)

        @staticmethod
        def addDataToExcelFile(fileName, sheetName, dataList):
            filePath = os.getcwd() + '/' + fileName + '.xls'
            file = xlrd.open_workbook(filePath)
            # number of rows already present in the sheet
            newRows = file.sheet_by_name(sheetName).nrows
            new_File = copy(file)
            sheet = new_File.get_sheet(sheetName)
            try:
                for i in range(0, len(dataList)):
                    for j in range(0, len(dataList[i])):
                        sheet.write(i + newRows, j, dataList[i][j])
            except Exception as e:
                print(e)

            new_File.save(filePath)
    

    Creating the Excel file and appending the data. I have to say that Python's Excel support is not very friendly here: xlrd only reads and xlwt only writes, and neither can add a sheet to, or append data to, an existing Excel file on its own, so the third-party xlutils copy helper is needed.
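
    A minimal usage sketch of the two helpers above (the file name, sheet name, headers and the single data row are made-up examples; data rows are assumed to already be flattened into lists in header order, and the helpers append '.xls' to the file name themselves):

    headers = ['名称', '链接', '类型', '价格', '供应商', '评分', '人数']
    ExcelFileManager.creatExcelFile('ctrip_products', '广州周边游', headers)

    rows = [
        ['示例产品', 'https://example.com', '跟团游', '399.00', '某供应商', '4.8', 120],
    ]
    ExcelFileManager.addDataToExcelFile('ctrip_products', '广州周边游', rows)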

    III. Results

    (Screenshot of the scraping results.)

    Code on GitHub: https://github.com/freesan44/PythonCtripClooection
