Python Ctrip Crawler Development Notes

Author: freesan44 | Published 2018-07-06 11:34

    Preface

    I recently bought 《Python3 爬虫、数据清洗与可视化实战》 (Python 3 Web Scraping, Data Cleaning and Visualization in Practice), and since it happened to be summer vacation, I tried scraping Ctrip's pages for around-the-city (weekend getaway) tour products departing from Guangzhou.
    Having studied Python for less than a week, I am not yet familiar with Python naming conventions, so I simply reused the naming conventions from my previous iOS development work. Please point out anything that could be improved.

    I. Preparation

    1. Main libraries used

    from bs4 import BeautifulSoup
    import time
    import re  # regular expressions
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.action_chains import ActionChains  # browser actions
    import xlrd
    import xlwt
    from xlutils.copy import copy
    import os
    

    BeautifulSoup: locates and extracts data from HTML tags
    selenium: launches the browser and automates the page interactions
    time: pauses/waits between operations
    xlrd, xlwt, xlutils: read and write the scraped results to Excel files

    2. Core workflow

    1. Go to the around-the-city (weekend getaway) page for the departure city (Guangzhou)
    2. On the landing page, capture the recommended popular destinations and attractions and save them
    3. Iterate over those destinations and search for the tour products each one lists
    4. Extract the data fields of each product
    5. Save the data
    6. Quit the browser (a minimal sketch of how these steps fit together follows this list)
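
    Below is a minimal sketch of how these steps could be wired together, using the functions defined in the next section. The departure city, the call order and the final driver.quit() are assumptions based on the snippets in this post rather than a verbatim copy of the repository's main routine; step 5 (saving) is handled by the ExcelFileManager class shown later.

    def main():
        setupDriverSetting()                       # 1. open the around-the-city landing page
        select_StartPlace("广州")                   # 1. choose Guangzhou as the departure city
        allDestinations = finAllDestinationPage()  # 2. capture the recommended destinations
        for typeName, destList in allDestinations.items():
            for destination in destList:
                # 3-4. search each destination and walk through its tour products
                jump_destinationPage("广州", destination)
        driver.quit()                              # 6. quit the browser

    if __name__ == '__main__':
        main()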

    II. Code

    1. Launching the browser

    def setupDriverSetting():
        global driver
        # url = 'http://m.ctrip.com/restapi/soa2/10290/createclientid?systemcode=09&createtype=3&conte'  # get a cookie ID
        # mobile site
        # url = 'https://m.ctrip.com/webapp/vacations/tour/list?tab=64&kwd=%E7%8F%A0%E6%B5%B7&salecity=32&searchtype=tour&sctiy=32'
        # desktop site
        url = 'https://weekend.ctrip.com/around/'
        # launch with Chrome
        driver = webdriver.Chrome()
        # # set the Firefox user-agent request header instead
        # profile = webdriver.FirefoxProfile()
        # user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0"
        # profile.set_preference("general.useragent.override",user_agent)
        #
        # driver = webdriver.Firefox(profile)
        driver.get(url)
    

    Use webdriver to launch Chrome or Firefox and open the landing-page URL.
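
    If you don't need to watch the browser while it runs, Chrome can also be started headless. A minimal sketch, assuming a reasonably recent Chrome/chromedriver and Selenium (this variant is not part of the original code):

    def setupHeadlessDriver(url):
        # '--headless' runs Chrome without opening a visible window
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(options=options)  # older Selenium releases use chrome_options= instead
        driver.get(url)
        return driver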

    2. Selecting the departure city

    def select_StartPlace(startPlace):

        # click the departure-city selector
        driver.find_element_by_xpath("//*[@id='CitySelect']").click()
        # pick the departure city from the dropdown list
        cityList = driver.find_elements_by_xpath("//*[@id='CitySelect']/dd/ul")
        for link in cityList:
            links = link.find_elements(By.TAG_NAME, "a")
            for eachCity in links:
                cityStr = eachCity.text
                if cityStr == startPlace:
                    print("Found the target city: " + eachCity.get_attribute('href'))
                    driver.get(eachCity.get_attribute('href'))
                    time.sleep(2)
                    try:
                        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='SearchText']")))
                    except:
                        print('The departure-city page failed to load')

                    break
    

    The main idea is to use find_element_by_xpath to locate and filter for the target city, then jump to that city's dedicated page.
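
    If the link text in the city list is unique, the two nested loops could in principle be collapsed into a single text-matching XPath. A small sketch (the XPath below is an assumption about the dropdown's structure, not taken from the original code):

    def select_StartPlace_direct(startPlace):
        # hypothetical shortcut: find the <a> whose visible text equals the city name
        xpath = "//*[@id='CitySelect']//a[text()='{}']".format(startPlace)
        cityLink = driver.find_element_by_xpath(xpath)
        driver.get(cityLink.get_attribute('href'))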

    3. Finding destinations

    def finAllDestinationPage():
        # find the container that holds all recommended destinations
        destType = driver.find_element_by_id("J_sub_circum")  # this id decides the product scope (around-the-city vs. overseas)
        print(destType.text)
        destType1 = destType.find_element_by_class_name("side_jmp_dest")
        destTypeItem = destType1.get_attribute('innerHTML')
        item = BeautifulSoup(destTypeItem, 'lxml')
        destTypeList = item.find_all('li')
        allDestinationListDic = {}
        for each in destTypeList:
            typeName = each.h4.string
            typeList = each.find_all('a')
            destNames = []
            for i in typeList:
                destNames.append(i.string)
            allDestinationListDic[typeName] = destNames

        return allDestinationListDic
    

    Collect all the recommended destinations and attractions and store them in a dictionary.
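
    The returned dictionary maps each category heading to a list of destination names. A minimal usage sketch (the category and destination strings are illustrative, not real scraped values):

    allDestinations = finAllDestinationPage()
    # e.g. {'热门目的地': ['珠海', '清远'], '热门景点': ['长隆野生动物世界']}  -- made-up example
    for typeName, destNames in allDestinations.items():
        print(typeName)
        for destination in destNames:
            # each destination later becomes the search keyword in jump_destinationPage
            print('  ' + destination)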

    4. The tour-product list page

    def jump_destinationPage(startPlace, destination):
        # locate the search box
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='SearchText']")))
        except:
            print('Search box not found')
        finally:
            print('City page loaded')


        driver.find_element_by_xpath("//input[@id='SearchText']").send_keys(destination)
        print("Entered destination: " + destination)
        driver.find_element_by_xpath("//*[@id='SearchBtn']").click()
        print("Clicked the search button")
        time.sleep(2)

        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='js-dpSearcher']")))
        except:
            print('Product list page failed to load')
        finally:
            print('Product list page loaded')

        # re-select the departure city, just in case it got lost
        reSelect_StartPlace(startPlace)


        # read the total number of pages from the pagination text
        pageHtml = driver.find_element_by_xpath("//*[@id='_sort']/div/span")
        print(pageHtml.text)
        pageNumStr = pageHtml.text
        pageNumStr = pageNumStr[:-1]
        print("Extracted string: " + pageNumStr)
        # pull the page count out with a regular expression
        pageNumS = re.findall(r'\d+', pageNumStr)
        pageNum = int(pageNumS[1])
        print(pageNum)

        tourProductList = []
        for i in range(0, pageNum):
            itemList = showCurrentPageAllData()

            # collect the data of every product on the current page
            for j in range(0, len(itemList)):
                eachItem = collectCurrentPageEachData(j)
                tourProductList.append(eachItem)

            # go to the next page (note: on the last iteration this still tries to move one page past the end)
            driver.find_element_by_xpath("//input[@id='ipt_page_txt']").clear()
            driver.find_element_by_xpath("//input[@id='ipt_page_txt']").send_keys(str(i+2))
            driver.find_element_by_xpath("//*[@id='ipt_page_btn']").click()
            print("Moved to page " + str(i+2))
            time.sleep(2)
        return driver
    

    Jump to the product list page, read the total number of pages from the page tags, walk through all tour products on the current page, then move to the next page and repeat.
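
    The total page count is pulled out of the pagination text with re.findall. A small sketch of just that parsing step (the sample string is an assumption about the page's format, chosen so that the second number is the total page count, which is what the [1] index in the code implies):

    # hypothetical pagination text such as "1/13页"
    pageNumStr = "1/13页"
    pageNumStr = pageNumStr[:-1]               # strip the trailing character -> "1/13"
    pageNumS = re.findall(r'\d+', pageNumStr)  # -> ['1', '13']
    pageNum = int(pageNumS[1])                 # -> 13 total pages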

    5. Extracting the product data

    def collectCurrentPageEachData(itemNum):
        itemList = driver.find_elements_by_class_name("product_box")
        itemHtml = itemList[itemNum].get_attribute('innerHTML')  # convert the element to an HTML string
        # item = BeautifulSoup(itemHtml, "html.parser")  # build the item's soup object
        item = BeautifulSoup(itemHtml, "lxml")  # build the item's soup object
        # print("+++++++" + item.prettify())
        # parse the fields
        # product name
        titleNameHtml = item.find('h2', class_='product_title')
        print("-------" + titleNameHtml.get_text())
        productName = titleNameHtml.get_text()

        # product link
        productLink = titleNameHtml.a['href']
        productLink = productLink[2:]
        productLink = "https://" + productLink
        print("link:" + productLink)
        # product type
        productType = item.find('em')
        print("type:" + productType.get_text())
        productTypeStr = productType.get_text()

        # product price
        priceHtml = item.find('span', class_='sr_price')
        priceStr = priceHtml.strong.get_text()
        # if the price is purely numeric, format it to two decimal places
        if priceStr.isdigit():
            priceStr = "%.2f" % float(priceStr)
        print("price:" + priceStr)

        # product supplier
        productRetail = item.find('p', class_='product_retail')
        productRetailStr = productRetail['title']
        if "供应商" in productRetailStr:
            productRetailStr = productRetailStr[4:]

        print("retail:" + productRetailStr)
        # product rating
        try:
            gradeHtml = item.find('p', class_='grade')
            gradeStr = gradeHtml.strong.get_text()
            print("grade:" + gradeStr)
        except:
            print('Rating not found')
            gradeStr = ''
        # number of travellers (taken from the comment count)
        try:
            commentHtml = item.find('div', class_='comment')
            commentStr = commentHtml.em.get_text()
            commentNumS = re.findall(r'\d+', commentStr)
            commentNum = int(commentNumS[0])
            print("comment:", commentNum)
        except:
            print('Traveller count not found')
            commentNum = ''
        return {
            '名称': productName,
            '链接': productLink,
            '类型': productTypeStr,
            '价格': priceStr,
            '供应商': productRetailStr,
            '评分': gradeStr,
            '人数': commentNum,
        }
    

    Grab all the visible information from the product card on the list page and return it as a dictionary.
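
    The saving helper below writes rows by column index, so the dictionaries returned here presumably get flattened into plain lists first. A minimal sketch of that conversion (the column order is an assumption that simply mirrors the dictionary keys above):

    COLUMNS = ['名称', '链接', '类型', '价格', '供应商', '评分', '人数']

    def rowsFromProducts(tourProductList):
        # turn each product dict into a list ordered like the Excel header row
        return [[product.get(col, '') for col in COLUMNS] for product in tourProductList]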

    6. Saving the data

    class ExcelFileManager:
        @staticmethod
        def creatExcelFile(fileName, sheetName, headRowList):
            # path of the .xls file inside the project directory
            filePath = os.getcwd() + '/' + fileName + '.xls'
            # open the existing workbook, or create a new one if it doesn't exist yet
            try:
                oldFile = xlrd.open_workbook(filePath)
                file = copy(oldFile)
            except:
                file = xlwt.Workbook()
                print("Created a new file")

            # add the sheet, or fetch it if it already exists
            try:
                sheet1 = file.add_sheet(sheetName, cell_overwrite_ok=True)
            except:
                sheet1 = file.get_sheet(sheetName)
            # header style
            head_style = xlwt.easyxf('font: name Times New Roman, color-index red, bold on', num_format_str='#,##0.00')
            row0 = headRowList
            for i in range(0, len(row0)):
                sheet1.write(0, i, row0[i], head_style)


            print(filePath)
            file.save(filePath)

        @staticmethod
        def addDataToExcelFile(fileName, sheetName, dataList):
            filePath = os.getcwd() + '/' + fileName + '.xls'
            file = xlrd.open_workbook(filePath)
            # number of rows already present in the sheet
            newRows = file.sheet_by_name(sheetName).nrows
            new_File = copy(file)
            sheet = new_File.get_sheet(sheetName)
            try:
                for i in range(0, len(dataList)):
                    for j in range(0, len(dataList[i])):
                        sheet.write(i + newRows, j, dataList[i][j])
            except Exception as e:
                print(e)

            new_File.save(filePath)
    

    Creating the Excel file and appending the data. I have to say that Python's Excel support is not very friendly here: xlrd only reads and xlwt only writes, and neither can add a sheet to, or append data to, an existing Excel file on its own, so the third-party xlutils copy helper is needed.
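
    A minimal usage sketch of the two helpers above (the file name, sheet name, headers and the single data row are made-up examples; data rows are assumed to already be flattened into lists in header order, and the helpers append '.xls' to the file name themselves):

    headers = ['名称', '链接', '类型', '价格', '供应商', '评分', '人数']
    ExcelFileManager.creatExcelFile('ctrip_products', '广州周边游', headers)

    rows = [
        ['示例产品', 'https://example.com', '跟团游', '399.00', '某供应商', '4.8', 120],
    ]
    ExcelFileManager.addDataToExcelFile('ctrip_products', '广州周边游', rows)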

    III. Results

    (Screenshot of the scraping results.)

    Code on GitHub: https://github.com/freesan44/PythonCtripClooection
