美文网首页
爬虫 爬取杭州公交始末站,经纬度

爬虫 爬取杭州公交始末站,经纬度

作者: 诗人藏夜里 | 来源:发表于2019-11-22 00:33 被阅读0次

    杭州公交总览
    http://bus.hangzhou.com.cn/all_line.php

    找到每一路公交车对应网址


    每一路公交点击按键对应位置
    # Get the page id for every bus line from the overview page
    url = 'http://bus.hangzhou.com.cn/all_line.php'
    
    headers = {
        'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    
    
    response = requests.get(url, headers=headers, timeout=5)
    
    soup = BeautifulSoup(response.text, 'lxml')   # parse the page HTML
    href = soup.find(attrs={'class': 'line_all'}).find_all('a')    # one <a> per bus line
    id_ = []    # numeric line id extracted from each anchor
    for i in range(len(href)):
        id_one = re.findall('\d+', str(href[i]))[0]
        id_.append(id_one)
    

    任选一路车进入其页面
    http://bus.hangzhou.com.cn/line.php?line_id=3

    找到始末站信息
    '''
        Fetch the page of one bus line (excerpt of the loop body below).
        '''
        url = 'http://bus.hangzhou.com.cn/line.php?line_id=' + str(id_[count])
        response_ = requests.get(url, headers=headers, timeout=10)
        print('url:{} count:{}'.format(url, count))
        soup = BeautifulSoup(response_.text, 'lxml')
        # the last <strong> of the title block holds "start<sep>terminal"
        start_terminal_ = soup.find(attrs={'class': 'main_title'}).find_all('strong')[-1]
    

    完整代码

    import requests
    from bs4 import BeautifulSoup
    import random
    import tqdm
    
    import pandas as pd
    import numpy as np
    import re
    '''
    DataFrame collecting one row per bus line: its running index (count),
    start stop and terminal stop.
    '''
    df = pd.DataFrame(columns=['count', 'start', 'terminal'])
    
    
    # Fetch the overview page that links to every bus line.
    url = 'http://bus.hangzhou.com.cn/all_line.php'
    
    # BUG FIX: the original header value mistakenly started with a duplicated
    # "User-Agent:" prefix, sending a malformed UA string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    
    
    response = requests.get(url, headers=headers, timeout=5)
    response.raise_for_status()  # fail fast instead of parsing an HTTP error page
    
    soup = BeautifulSoup(response.text, 'lxml')   # parse the overview page
    # one <a> per bus line sits inside the element with class "line_all"
    href = soup.find(attrs={'class': 'line_all'}).find_all('a')
    # numeric line id = first run of digits in each anchor's markup
    id_ = [re.findall(r'\d+', str(a))[0] for a in href]
    
    '''
    Scrape each line page for its start/terminal stops.  If the IP gets
    banned mid-run, `count` shows where it stopped; restart from that value.
    '''
    count = 0
    # Separators seen in page titles, tested in the original precedence order
    # (the double em-dash must be checked before the single one).
    separators = ('——', '-', '—')
    while count != len(id_):
        '''
        Fetch the page of one bus line.
        '''
        url = 'http://bus.hangzhou.com.cn/line.php?line_id=' + str(id_[count])
        response_ = requests.get(url, headers=headers, timeout=10)
        print('url:{} count:{}'.format(url, count))
        soup = BeautifulSoup(response_.text, 'lxml')
        # the last <strong> of the title block holds "start<sep>terminal"
        start_terminal_ = soup.find(attrs={'class': 'main_title'}).find_all('strong')[-1]
        text = start_terminal_.text
        '''
        Split the title on the first separator found; pages are inconsistent
        about which dash character they use, hence the candidate list.
        '''
        for sep in separators:
            if sep in text:
                df.loc[count, 'count'] = count
                df.loc[count, 'start'] = text.split(sep)[0]
                df.loc[count, 'terminal'] = text.split(sep)[1]
                break
        else:
            # no recognised separator: keep the raw text as the start stop
            df.loc[count, 'count'] = count
            df.loc[count, 'start'] = text
        count = count + 1
    

    结果展示

    爬取结果

    经纬度查询

    查询网址:http://api.map.baidu.com/lbsapi/getpoint/index.html

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import datetime
    import time
    from bs4 import BeautifulSoup
    import re
    
    # NOTE(review): this headers dict is never passed to any call below —
    # selenium drives a real browser, which sends its own headers.
    headers = {
        'Host': 'http://api.map.baidu.com/lbsapi/getpoint/index.html',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }
    
    '''
    The coordinate-lookup page is driven by dynamic interaction, so use
    webdriver to simulate the clicks.
    '''
    chrome_options = Options()
    browser = webdriver.Chrome(chrome_options=chrome_options)
    url = 'http://api.map.baidu.com/lbsapi/getpoint/index.html'
    browser.get(url)
    # Resume index after an interruption (the original run stopped at 229);
    # set to 0 for a fresh run.
    resume_from = 229
    # first line starting with "坐标" (= "coordinates") in the result panel
    coord_line = re.compile(r'坐标.*')
    # "lng,lat" pair — dots escaped (the original pattern used a bare '.')
    coord_pair = re.compile(r'\d+\.\d+,\d+\.\d+')
    for i in range(resume_from, len(df)):
        # --- start-stop coordinates ---
        start = df.loc[i, 'start']
        browser.find_element_by_id('localvalue').clear()        # clear search box
        browser.find_element_by_id('localvalue').send_keys(start + ' 杭州')
        browser.find_element_by_id('localsearch').click()       # run the search
        time.sleep(1.5)                                         # wait for the page to render
        try:
            soup = browser.find_element_by_id('MapInfo')
            text = coord_line.search(soup.text).group()
            xy = coord_pair.findall(text)
            # BUG FIX: store the first "lng,lat" STRING, not the whole list,
            # so the later .split(',') post-processing works.
            if xy:
                df.loc[i, 'start_xy'] = xy[0]
        except Exception:
            pass  # best effort: leave the cell empty when the lookup fails
        # --- terminal-stop coordinates ---
        try:
            terminal = df.loc[i, 'terminal']
            browser.find_element_by_id('localvalue').clear()
            browser.find_element_by_id('localvalue').send_keys(terminal + ' 杭州')
            browser.find_element_by_id('localsearch').click()
            time.sleep(1.5)
            soup = browser.find_element_by_id('MapInfo')
            text = coord_line.search(soup.text).group()
            xy = coord_pair.findall(text)
            if xy:
                df.loc[i, 'terminal_xy'] = xy[0]
        except Exception:
            pass
        print(i)
    
    '''
    Split the stored "lng,lat" strings into four numeric columns.  Rows whose
    lookup failed (missing/NaN cell) are skipped and keep NaN coordinates.
    '''
    for i in range(len(df)):
        print(i)
        try:
            start_xy = df.loc[i, 'start_xy']
            terminal_xy = df.loc[i, 'terminal_xy']
            # Tolerate both a plain "lng,lat" string and the one-element list
            # an earlier version of the scraping step stored.
            if isinstance(start_xy, list):
                start_xy = start_xy[0]
            if isinstance(terminal_xy, list):
                terminal_xy = terminal_xy[0]
            # parse both stops before writing, so each row is either fully
            # populated or left untouched
            sx = start_xy.split(',')
            tx = terminal_xy.split(',')
            df.loc[i, 'start_x'] = float(sx[0])
            df.loc[i, 'start_y'] = float(sx[1])
            df.loc[i, 'terminal_x'] = float(tx[0])
            df.loc[i, 'terminal_y'] = float(tx[1])
        except (AttributeError, TypeError, ValueError, IndexError):
            # missing or malformed coordinate for this row; leave NaN
            continue
    
    

    可视化

    import matplotlib.pyplot as plt
    import seaborn as sns
    %matplotlib inline
    plt.style.use('seaborn-whitegrid')
    
    # plots the scraped stops over a background map image of the Hangzhou area
    def plot_on_map(df, BB, map_, s=10, alpha=0.2):
        """Scatter start and terminal stops over the background map image.
    
        df    -- frame with start_x/start_y/terminal_x/terminal_y columns
        BB    -- (lon_min, lon_max, lat_min, lat_max) bounding box of map_
        map_  -- background image array (e.g. from plt.imread)
        s     -- marker size; alpha -- marker transparency
        """
        fig, axs = plt.subplots(1, 2, figsize=(30, 20))
        panels = [
            (axs[0], df.start_x, df.start_y, 'start locations'),
            (axs[1], df.terminal_x, df.terminal_y, 'terminal locations'),
        ]
        for ax, xs, ys, title in panels:
            ax.scatter(xs, ys, zorder=1, alpha=alpha, c='r', s=s)
            ax.set_xlim((BB[0], BB[1]))
            ax.set_ylim((BB[2], BB[3]))
            ax.set_title(title)
            ax.imshow(map_, zorder=0, extent=BB)
        
    # Load the background map image of the Hangzhou area.
    # BB is its (lon_min, lon_max, lat_min, lat_max) bounding box.
    BB = (119.710941, 120.673801, 29.685506, 30.552774)
    map_ = plt.imread('loc.png')
    # BUG FIX: the original called plot_on_map(d, ...) with an undefined
    # name 'd'; the scraped frame is 'df'.
    plot_on_map(df, BB, map_, s=1, alpha=0.3)
    plt.savefig('station.png')
    
    红点为站点位置

    相关文章

      网友评论

          本文标题:爬虫 爬取杭州公交始末站,经纬度

          本文链接:https://www.haomeiwen.com/subject/vwaxwctx.html