美文网首页我爱编程
NYC出租车数据分析(Python)

NYC出租车数据分析(Python)

作者: 咸鱼干lili | 来源:发表于2018-04-12 20:04 被阅读0次

    1.1 导入需要使用的包:

      import pandas as pd
      import numpy as np
      from matplotlib import pyplot as plt
      %matplotlib inline
    

    1.2 利用pd.read_csv导入数据, 命名为cab

     # Load the trip records from a hard-coded local CSV path into the DataFrame `cab`.
     cab = pd.read_csv('/Users/tangyu/Desktop/test.csv')
    

    1.3 查看cab的数据组成

    # Call info() with parentheses so pandas prints the index range, column
    # dtypes and memory usage; a bare `cab.info` only evaluates to the bound
    # method object and never produces that summary.
    cab.info()
    

    1.4 结果如下,注意1) 数据类型为DataFrame 2)列名和类型 3)内存使用情况

    <class 'pandas.core.frame.DataFrame'>
    
    RangeIndex: 625134 entries, 0 to 625133
    
    Data columns (total 9 columns):
    
    id                    625134 non-null object
    
    vendor_id             625134 non-null int64
    
    pickup_datetime       625134 non-null object
    
    passenger_count       625134 non-null int64
    
    pickup_longitude      625134 non-null float64
    
    pickup_latitude       625134 non-null float64
    
    dropoff_longitude     625134 non-null float64
    
    dropoff_latitude      625134 non-null float64
    
    store_and_fwd_flag    625134 non-null object
    
    dtypes: float64(4), int64(2), object(3)
    
    memory usage: 42.9+ MB
    

    1.5 查看数据前/后5项

    cab.head() #  first 5 rows
    cab.tail() # last 5 rows
    len(cab.id) # total number of records
    >>> 625134
    

    1.6 自定义函数jan判断某个日期是否属于一月份,再利用filter函数代替for循环筛选出一月份的日期

     # Column views: pickup timestamps (filtered with jan() below) and the
     # per-trip passenger counts.
     dates = cab['pickup_datetime']
     pas = cab['passenger_count']
     def jan(date):
         """Return True when *date* sorts before 1 February 2016.

         *date* is a timestamp string compared lexicographically; the rest of
         the script parses this column as 'YYYY-MM-DD HH:MM:SS'. The bound is
         the exclusive start of February rather than `<= '2016-01-31'`,
         because a full timestamp such as '2016-01-31 08:00:00' sorts AFTER
         the bare date '2016-01-31' and would wrongly be rejected.

         No lower bound is applied, so anything earlier than February 2016
         counts as January.
         """
         return date < '2016-02-01'
     # Keep only the timestamps for which jan() returns True. On Python 2 this
     # is a list; on Python 3 filter() returns a lazy iterator instead.
     jan_list = filter(jan, dates)
    

    1.7 利用np.unique函数得到每趟行程乘客人数的所有不同取值(pas_per_car)
    利用sum()计算总乘客人数(total_pas)
    再统计每一种乘客人数对应的行程数,并计算其相对于总乘客人数的比例

    # Distribution of trips by passenger count.
    # The original fused three assignments onto one line behind '#', which
    # left total_pas and pas_per_car_ratio undefined; they are separate
    # statements again here.
    passengers = cab['passenger_count']
    # Sorted distinct passenger counts, and the number of trips for each.
    pas_per_car, trip_counts = np.unique(passengers, return_counts=True)
    total_pas = int(passengers.sum())  # total passengers carried
    pas_per_car_counts = trip_counts.tolist()
    # NOTE(review): as in the original, each ratio is "trips with this
    # passenger count" divided by the total number of PASSENGERS, so the
    # ratios do not sum to 1 -- confirm whether the number of trips was meant.
    pas_per_car_ratio = [count / float(total_pas) for count in pas_per_car_counts]
    

    1.8 筛选每个月的数据,注意 & 和 | 分别表示按元素的"与"和"或",并且每个比较条件都要用括号括起来

    # Slice the trips by calendar month (one statement per line; the original
    # fused the January and February assignments into a single invalid line).
    # Timestamps are compared as strings. A full 'YYYY-MM-DD HH:MM:SS' value
    # sorts after the bare date of the same day, so the inclusive upper bounds
    # below do not pull in trips from the first day of the next month.
    # NOTE(review): the names cab_May and cab_Mar are swapped relative to their
    # contents (cab_May holds March, cab_Mar holds May). They are kept because
    # later cells reference them in this order.
    pickup = cab['pickup_datetime']
    cab_Jan = cab[pickup <= '2016-02-01']
    cab_Feb = cab[(pickup <= '2016-03-01') & (pickup >= '2016-02-01')]
    cab_May = cab[(pickup <= '2016-04-01') & (pickup >= '2016-03-01')]  # March data
    cab_Apr = cab[(pickup <= '2016-05-01') & (pickup >= '2016-04-01')]
    cab_Mar = cab[(pickup <= '2016-06-01') & (pickup >= '2016-05-01')]  # May data
    cab_Jun = cab[(pickup <= '2016-07-01') & (pickup >= '2016-06-01')]
    

    1.9 每个月乘客人数分别求和;并求每个月单趟行程乘客人数的最大值(一月和六月为9,其余月份为6);可以看出各种乘客人数的行程数每个月基本一致

    pas_list = []  # total passengers carried in each month
    pas_max_list = []  # largest passenger count on a single trip, per month
    pas_per_car_monthly_counts = []  # flat list: trips per passenger count, month after month

    # The slices are visited in chronological order (cab_May holds March and
    # cab_Mar holds May, see where they are built).
    for cab_mons in [cab_Jan, cab_Feb, cab_May, cab_Apr, cab_Mar, cab_Jun]:
        monthly_passengers = cab_mons['passenger_count']
        pas_count = int(monthly_passengers.sum())  # passengers carried this month
        pas_list.append(pas_count)
        pas_max = int(monthly_passengers.max())
        cab_max_monthly = cab_mons[monthly_passengers == pas_max]  # trips hitting the maximum
        pas_max_list.append(pas_max)
        # One entry per distinct passenger count, so each month contributes
        # len(pas_per_car) consecutive values to the flat list.
        for i in pas_per_car:
            pas_per_car_count_month = int((monthly_passengers == i).sum())
            pas_per_car_monthly_counts.append(pas_per_car_count_month)
    # print() as a function works on both Python 2 and 3 for a single argument.
    print(pas_list)
    print(pas_max_list)
    print(pas_per_car_monthly_counts)
    
    [162880, 169805, 182752, 178058, 179140, 166191] [9, 6, 6, 6, 6, 9] [2, 69319, 13984, 3848, 1848, 5396, 3278, 1, 2, 72881, 14592, 4012, 1897, 5464, 3466, 0, 8, 77849, 15545, 4576, 2177, 5875, 3667, 0, 3, 76139, 15647, 4480, 2055, 5683, 3425, 0, 6, 75934, 15793, 4484, 2068, 5814, 3471, 0, 2, 71325, 14466, 4286, 1972, 5179, 3214, 1]
    
    pas_per_car #每辆车的人数
    >>> array([0, 1, 2, 3, 4, 5, 6, 9])
    

    1.10 最后一列(store_and_fwd_flag)不为N的共3430行

    # Count the rows whose store_and_fwd_flag is anything other than 'N'.
    sum(cab.store_and_fwd_flag !='N')
    >>> 3430
    

    作图部分

    2.1 配色卡

    import seaborn as sns
    # Preview the 8-colour cubehelix palette (the import and the call were
    # fused onto one line, which is a syntax error).
    sns.palplot(sns.cubehelix_palette(8, start=.5, rot=-.75))
    
    本文使用的配色卡

    2.2 不同人数的乘客比例分布

    # Pie chart: share of trips for each passenger count over the whole data set.
    plt.figure(figsize=(7, 7))
    labels = pas_per_car
    # Pull the 2nd and 3rd slices (indices 1 and 2) out of the pie.
    explode = (0, 0.1, 0.1, 0, 0, 0, 0, 0)
    plt.pie(pas_per_car_counts, explode=explode, labels=labels,
            autopct='%1.1f%%', shadow=True, startangle=140)
    plt.axis('equal')  # keep the pie circular
    plt.legend(loc='upper left', bbox_to_anchor=(-0.1, 1))
    

    2.3 不同月份中不同人数的占比分布:基本一致

     # One pie per month showing the share of trips for each passenger count.
     plt.figure(figsize=(20, 20))
     labels = pas_per_car
     # Pull the 2nd and 3rd slices (indices 1 and 2) out of each pie.
     explode = (0, 0.1, 0.1, 0, 0, 0, 0, 0)
     # pas_per_car_monthly_counts is filled month after month in chronological
     # order, so the labels are chronological too (the original listed 'May'
     # and 'Mar' the wrong way round).
     xlabel = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
     n_slices = len(explode)  # values contributed by each month
     for i, month in enumerate(xlabel, start=1):
         y = (i - 1) * n_slices  # offset of this month's counts in the flat list
         plt.subplot(3, 2, i)
         plt.pie(pas_per_car_monthly_counts[y:y + n_slices], explode=explode,
                 labels=labels,
                 autopct='%1.1f%%', shadow=True, startangle=140)
         plt.axis('equal')
         # Label and legend belong to each subplot, so they stay inside the
         # loop; the original had them (and the offset update) outside it.
         plt.xlabel(month)
         plt.legend(loc='upper left', bbox_to_anchor=(-0.1, 1))
    

    2.4 发现有乘客为0的情况,可能的解释 1)这种情况可能是司机师傅没有打表计费; 2)乘车人取消了订单;3)数据错误

    sum(cab['passenger_count'] == 0)
    >>> 23
    

    2.5 2016上半年总运载的乘客人数为1038826人次

    total_pas
    >>> 1038826
    

    2.6 一月中乘客为9人的数据

    # Rows of the January slice whose passenger count equals January's maximum.
    # The original reused the leftover loop variables cab_mons / pas_max, which
    # after the monthly loop refer to the LAST month visited (June), not January.
    cab_Jan[cab_Jan['passenger_count'] == cab_Jan['passenger_count'].max()]
    

    2.7 一月到六月的乘客人数对比

    # Bar chart of the total passengers carried in each month (1..6).
    x = list(range(1, 7))  # a concrete list of month numbers, not a lazy range
    plt.ylim((150000, 200000))  # fix the y range so the monthly differences are visible
    plt.xlabel('Month')
    plt.ylabel('Passenger count')
    # Pass x/y by keyword: recent seaborn releases no longer accept them
    # positionally.
    sns.barplot(x=x, y=pas_list)
    
    一月到六月的乘客人数条形图对比

    仔细看第一天的情况

    3.1 获取第一天的数据

    cab_1st_day = cab[cab['pickup_datetime'] <= '2016-01-02']  # the first day
    # NOTE(review): the original loop body was unfinished (a dangling `<=`, an
    # empty `cab[]` and a misspelt 'pickup_datatime' column). It is completed
    # here as "passengers picked up in each clock hour of the first day" --
    # confirm this is the intended quantity.
    pickup_hour = pd.to_datetime(cab_1st_day['pickup_datetime']).dt.hour
    cab_hour = []
    for i in range(24):
        sum_by_hour = int(cab_1st_day.loc[pickup_hour == i, 'passenger_count'].sum())
        cab_hour.append(sum_by_hour)
    >>> (array([ 2057., 0., 605., 0., 155., 0., 103., 0., 168., 93.]), array([ 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. ]), )
    

    3.2 一天的人流变化

    # Passenger flow over the first day, one bucket per clock hour.
    # The original walked the rows against a moving one-hour cut-off. That only
    # works when the rows are sorted newest-first, re-summed every row sharing a
    # timestamp (double counting duplicates), never stored the final bucket,
    # and rebuilt the timestamp list on every iteration. Grouping by the parsed
    # hour avoids all of these.
    pickup_hour = pd.to_datetime(
        cab_1st_day['pickup_datetime'], format="%Y-%m-%d %H:%M:%S").dt.hour
    hourly_totals = cab_1st_day['passenger_count'].groupby(pickup_hour).sum()
    # Hours without any pickup still get a (zero) bar.
    hourly_totals = hourly_totals.reindex(range(24), fill_value=0)
    pas_hours = hourly_totals.tolist()  # index 0 = 00:00-00:59, ..., 23 = 23:00-23:59

    # plot
    x = list(range(len(pas_hours)))  # hour of day, 0..23
    plt.xlabel('hour')
    plt.ylabel('Passenger count in the first day')
    sns.barplot(x=x, y=pas_hours)
    

    3.3 六月的情况

    import datetime
    # Passenger totals for June, one bucket per calendar day.
    # The original stepped its cut-off by timedelta(days=1) although its
    # comment said "one hour"; the daily step is kept because that is what the
    # code did. As with the first-day cell, the row-by-row walk depended on
    # newest-first ordering, double counted rows sharing a timestamp, started
    # with an empty bucket and dropped the last one, so it is replaced by a
    # group-by.
    pickup_day = pd.to_datetime(
        cab_Jun['pickup_datetime'], format="%Y-%m-%d %H:%M:%S").dt.day
    # The name pas_hours is kept because the plotting cell reads it; it holds
    # DAILY totals in chronological order.
    pas_hours = cab_Jun['passenger_count'].groupby(pickup_day).sum().tolist()
    
    len(cab_Jun['id']) # 六月的记录条数
    >>> 100445
    
     sum(cab_Jun['passenger_count']) # 六月的乘客数
    

    画了最后三天的柱状图:

    # Bar chart of the June buckets held in pas_hours.
    plt.figure(figsize=(10, 10))
    x = list(range(1, len(pas_hours) + 1))  # bucket number, starting at 1
    # The June buckets are one day wide (the cut-off moves by days=1), so the
    # axis is labelled 'day' rather than 'hour'.
    plt.xlabel('day')
    plt.ylabel('Passenger count in Jun')
    # x/y by keyword: recent seaborn releases no longer accept them positionally.
    sns.barplot(x=x, y=pas_hours)
    
    三天的流量分布

    根据经纬度画出地图

    4.1 上车和下车的经纬度最大/最小值

       # Bounding box of the pickup coordinates (dropoff columns are not used here).
       pickup_lon = cab['pickup_longitude']
       pickup_lat = cab['pickup_latitude']
       lon_max = pickup_lon.max()
       lon_min = pickup_lon.min()
       lat_max = pickup_lat.max()
       lat_min = pickup_lat.min()
       print(lon_max)
       print(lon_min)
       print(lat_max)
       print(lat_min)
    >>> -69.248916626 -121.933128357 42.8149375916 37.3895874023
    

    4.2 统计相关

     import math
     # Column names as a plain list literal; the original `list[...]`
     # subscripted the built-in type instead of building a list.
     longitudes = ['pickup_longitude', 'dropoff_longitude']
     for longitude in longitudes:
         # NOTE(review): the original body was the no-op `df['longitude']` on
         # an undefined `df`. Summary statistics of each column of `cab` are
         # printed instead -- confirm this is what the section intended.
         print(cab[longitude].describe())
    

    4.3 首先利用matplotlib 和 basemap 画出地图

    from mpl_toolkits.basemap import Basemap, cm
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib import cm  # NOTE: rebinds `cm`, shadowing basemap's cm imported above

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    # Stereographic Basemap covering the continental United States.
    # Alternative NYC-only view:
    #   m = Basemap(projection='stere', lon_0=-73.93, lat_0=40.65,
    #               llcrnrlat=40.2, urcrnrlat=40.9,
    #               llcrnrlon=-74.10, urcrnrlon=-73.75, resolution='l')
    m = Basemap(projection='stere', lat_0=90, lon_0=-105,
                llcrnrlat=23.41, urcrnrlat=45.44,
                llcrnrlon=-118.67, urcrnrlon=-64.52,
                rsphere=6371200., resolution='l', area_thresh=10000)

    # State and country boundaries (coastlines and counties are left off).
    m.drawstates()
    m.drawcountries()
    # Parallels every 10 degrees, labelled on the left edge.
    parallels = np.arange(0., 90, 10.)
    m.drawparallels(parallels, labels=[1, 0, 0, 0], fontsize=10)
    # Meridians every 10 degrees, labelled along the bottom edge.
    meridians = np.arange(-110., -60., 10.)
    m.drawmeridians(meridians, labels=[0, 0, 0, 1], fontsize=10)
    m.fillcontinents(color='coral')

    # Convert the first-day pickup coordinates to map-projection coordinates
    # and draw them. Only the projected points are plotted: the original also
    # scattered the raw lon/lat degrees onto the same axes, where they do not
    # line up with the map. `cmap` is dropped because the points are given a
    # single fixed colour.
    lons = np.array(cab_1st_day['pickup_longitude'])
    lats = np.array(cab_1st_day['pickup_latitude'])
    x, y = m(lons, lats)
    plt.scatter(x, y, 50, color='b')
    

    仔细看一天的地图

    5.1 获取第一天的上下车经纬度和时间

    # First-day pickup coordinates and pickup timestamps as NumPy arrays
    # (the three assignments now share one indentation level).
    lons = np.array(cab_1st_day['pickup_longitude'])
    lats = np.array(cab_1st_day['pickup_latitude'])
    # NOTE: the name `time` is rebound by `import time` in the next cell.
    time = np.array(cab_1st_day['pickup_datetime'])
    

    5.2 第一天的上下车情况,以一个小时做为时间间隔,画出每个小时的散点图

    # Drop-off locations over the first day: add one circle per trip to a
    # folium map and save an HTML file plus a screenshot each time the pickup
    # time crosses an hourly cut-off.
    import folium
    import datetime
    import time
    import os
    from selenium import webdriver

    j = 1  # running number used for the <j>.html / <j>.jpg file names
    m = folium.Map(location=[40.8, -73.9], zoom_start=11)
    d_lons = np.array(cab_1st_day['dropoff_longitude'])
    d_lats = np.array(cab_1st_day['dropoff_latitude'])
    # Build the timestamp list once instead of on every iteration.
    pickup_times = cab_1st_day['pickup_datetime'].tolist()
    # NOTE(review): the cut-off starts at 22:59:59 and only moves backwards, so
    # the loop expects the rows to be ordered newest-first -- confirm.
    date_point = datetime.datetime.strptime("2016-01-01 22:59:59", "%Y-%m-%d %H:%M:%S")
    for d_lat, d_lon, date_time in zip(d_lats, d_lons, pickup_times):
        date_obj = datetime.datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
        if date_obj <= date_point:
            # Crossed into the previous hour: snapshot the map as it stands.
            # The map is never reset, so each snapshot also contains the
            # circles of all hours processed before it.
            display(m)
            lj = str(j)
            fn = lj + '.html'
            tmpurl = 'file://{path}/{mapfile}'.format(path=os.getcwd(), mapfile=fn)
            m.save(fn)
            browser = webdriver.Safari()
            try:
                browser.get(tmpurl)
                # Give the map tiles some time to load
                time.sleep(4)
                browser.save_screenshot(lj + '.jpg')
            finally:
                # Always close the browser, even if loading or the screenshot fails.
                browser.quit()
            date_point -= datetime.timedelta(hours=1)  # one-hour interval
            j += 1
        # Every trip contributes one red circle at its drop-off location.
        m.add_child(folium.Circle(location=[d_lat, d_lon], color='#FF0000'))
    # NOTE(review): no snapshot is taken after the loop, so the circles added
    # since the last cut-off never reach a screenshot.
    
    好看多了

    5.3 制作gif

    from PIL import Image, ImageSequence
    import sys
    import os


    def _frame_number(name):
        """Return the digits contained in *name* as an int (e.g. '12.jpg' -> 12)."""
        # ''.join(...) is needed on Python 3, where filter() returns an
        # iterator that int() cannot consume.
        return int(''.join(filter(str.isdigit, name)))


    # Screenshots in the working directory, ordered by their number. Names
    # without any digit are skipped so they cannot break the numeric sort.
    filenames = sorted(fn for fn in os.listdir('.')
                       if fn.endswith('.jpg') and any(ch.isdigit() for ch in fn))
    filenames.sort(key=_frame_number)  # numeric order: 2.jpg before 10.jpg
    print(filenames)
    frames = [Image.open(filename) for filename in filenames]
    if frames:
        im = frames[0]
        # The first frame is the image being saved; append only the remaining
        # ones (the original appended every file, repeating the first frame).
        im.save('traffic_1st_day_drop.gif', save_all=True,
                append_images=frames[1:], loop=5, duration=500)
    

    相关文章

      网友评论

        本文标题:NYC出租车数据分析(Python)

        本文链接:https://www.haomeiwen.com/subject/bfpfkftx.html