美文网首页
26 Pandas处理分析网站原始访问日志

26 Pandas处理分析网站原始访问日志

作者: Viterbi | 来源:发表于2022-11-15 19:20 被阅读0次

    26 Pandas处理分析网站原始访问日志

    目标:真实项目的实战,探索Pandas的数据处理与分析

    实例: 数据来源:我自己的wordpress博客http://www.crazyant.net/ 的访问日志

    实现步骤:
    1、读取数据、清理、格式化
    2、统计爬虫spider的访问比例,输出柱状图
    3、统计http状态码的访问占比,输出饼图
    4、统计按小时、按天的PV/UV流量趋势,输出折线图

    1、读取数据并清理格式化

    import pandas as pd
    import numpy as np
    
    # Show full cell contents (e.g. long user-agent strings) instead of
    # truncating them. None is the supported "no limit" value; the legacy -1
    # was deprecated in pandas 1.0 and is rejected by pandas 2.x.
    pd.set_option('display.max_colwidth', None)
    
    from pyecharts import options as opts
    from pyecharts.charts import Bar,Pie,Line
    
    import os

    # Directory holding the raw access-log files; every file in it is read
    # and the results are stacked into a single DataFrame.
    data_dir = "./datas/crazyant/blog_access_log"

    # The log is space-separated with no header row. Lines whose field count
    # does not match are reported and skipped: on_bad_lines="warn" (pandas >= 1.3)
    # replaces error_bad_lines=False, which was removed in pandas 2.0.
    # os.listdir returns names in arbitrary order, so sort them to make the
    # row order of the combined frame reproducible.
    df_list = [
        pd.read_csv(
            os.path.join(data_dir, fname),
            sep=" ",
            header=None,
            on_bad_lines="warn",
        )
        for fname in sorted(os.listdir(data_dir))
    ]

    df = pd.concat(df_list)
    
        b'Skipping line 2245: expected 10 fields, saw 16\nSkipping line 2889: expected 10 fields, saw 14\nSkipping line 2890: expected 10 fields, saw 14\nSkipping line 2891: expected 10 fields, saw 13\nSkipping line 2892: expected 10 fields, saw 13\nSkipping line 2900: expected 10 fields, saw 11\nSkipping line 2902: expected 10 fields, saw 11\nSkipping line 3790: expected 10 fields, saw 14\nSkipping line 3791: expected 10 fields, saw 14\nSkipping line 3792: expected 10 fields, saw 13\nSkipping line 3793: expected 10 fields, saw 13\nSkipping line 3833: expected 10 fields, saw 11\nSkipping line 3835: expected 10 fields, saw 11\nSkipping line 9936: expected 10 fields, saw 16\n'
        b'Skipping line 11748: expected 10 fields, saw 11\nSkipping line 11750: expected 10 fields, saw 11\n'
        
    
    
    
    # Preview the first rows of the combined raw log table.
    df.head()
    
    .dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
    0 1 2 3 4 5 6 7 8 9
    0 106.11.153.226 - - [02/Dec/2019:22:40:18 +0800] GET /740.html?replytocom=1194 HTTP/1.0 200 13446 - YisouSpider
    1 42.156.254.60 - - [02/Dec/2019:22:40:23 +0800] POST /wp-json/wordpress-popular-posts/v1/popular-posts HTTP/1.0 201 55 http://www.crazyant.net/740.html?replytocom=1194 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
    2 106.11.159.254 - - [02/Dec/2019:22:40:27 +0800] GET /576.html HTTP/1.0 200 13461 - YisouSpider
    3 106.11.157.254 - - [02/Dec/2019:22:40:28 +0800] GET /?lwfcdw=t9n2d3&oqzohc=m5e7j1&oubyvq=iab6a3&oudmbg=6osqd3 HTTP/1.0 200 10485 - YisouSpider
    4 42.156.137.109 - - [02/Dec/2019:22:40:30 +0800] POST /wp-json/wordpress-popular-posts/v1/popular-posts HTTP/1.0 201 55 http://www.crazyant.net/576.html Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
    # Keep only the four positional columns used by the analysis (they are
    # named ip / stime / status / client in the next step); .copy() detaches
    # the result from the original frame so later column assignments are safe.
    df = df.loc[:, [0, 3, 6, 9]].copy()
    df.head()
    
    .dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
    0 3 6 9
    0 106.11.153.226 [02/Dec/2019:22:40:18 200 YisouSpider
    1 42.156.254.60 [02/Dec/2019:22:40:23 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
    2 106.11.159.254 [02/Dec/2019:22:40:27 200 YisouSpider
    3 106.11.157.254 [02/Dec/2019:22:40:28 200 YisouSpider
    4 42.156.137.109 [02/Dec/2019:22:40:30 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
    # Replace the positional column labels with descriptive names:
    # client IP, request time, HTTP status code, and client (user-agent) string.
    df.columns = ["ip", "stime", "status", "client"]
    df.head()
    
    .dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
    ip stime status client
    0 106.11.153.226 [02/Dec/2019:22:40:18 200 YisouSpider
    1 42.156.254.60 [02/Dec/2019:22:40:23 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
    2 106.11.159.254 [02/Dec/2019:22:40:27 200 YisouSpider
    3 106.11.157.254 [02/Dec/2019:22:40:28 200 YisouSpider
    4 42.156.137.109 [02/Dec/2019:22:40:30 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
    # Check the dtype pandas inferred for each column.
    df.dtypes
    
    
    
    
        ip        object
        stime     object
        status    int64 
        client    object
        dtype: object
    
    

    2、统计spider的比例

    # Flag requests whose client string mentions "spider" (case-insensitive).
    # regex=False performs a plain substring search, and na=False maps a
    # missing client value to False so the column stays strictly boolean
    # instead of carrying NaN into the counts computed from it.
    df["is_spider"] = df["client"].str.lower().str.contains("spider", regex=False, na=False)
    df.head()
    
    .dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
    ip stime status client is_spider
    0 106.11.153.226 [02/Dec/2019:22:40:18 200 YisouSpider True
    1 42.156.254.60 [02/Dec/2019:22:40:23 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
    2 106.11.159.254 [02/Dec/2019:22:40:27 200 YisouSpider True
    3 106.11.157.254 [02/Dec/2019:22:40:28 200 YisouSpider True
    4 42.156.137.109 [02/Dec/2019:22:40:30 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
    # Count how many requests are flagged as spider (True) vs. not (False).
    df_spider = df["is_spider"].value_counts()
    df_spider
    
    
    
        False    46641
        True     3637 
        Name: is_spider, dtype: int64
    
    
    
    
    # Bar chart of spider vs. non-spider request counts.
    # The x-axis labels are the stringified True/False index values and the
    # single series holds the matching counts.
    bar = Bar()
    bar.add_xaxis([str(label) for label in df_spider.index])
    bar.add_yaxis("是否Spider", df_spider.values.tolist())
    bar.set_global_opts(title_opts=opts.TitleOpts(title="爬虫访问量占比"))
    bar.render_notebook()
    

    3、访问状态码的数量对比

    # Number of requests per HTTP status code (index = status, value = row count).
    df_status = df.groupby("status").size()
    df_status
    
    
    
        status
        200    41924
        201    3432 
        206    70   
        301    2364 
        302    23   
        304    19   
        400    20   
        403    92   
        404    1474 
        405    12   
        444    846  
        500    1    
        504    1    
        dtype: int64
    
    
    # Preview the (status, count) pairs in the form passed to the pie chart.
    list(zip(df_status.index, df_status))
    
    
    
    
        [(200, 41924),
         (201, 3432),
         (206, 70),
         (301, 2364),
         (302, 23),
         (304, 19),
         (400, 20),
         (403, 92),
         (404, 1474),
         (405, 12),
         (444, 846),
         (500, 1),
         (504, 1)]
    
    
    
    # Pie chart of request counts per HTTP status code.
    # Each data item is a (status, count) pair; the label formatter uses the
    # "{b}: {c}" template from the original chart.
    pie = Pie()
    pie.add("状态码比例", list(zip(df_status.index, df_status)))
    pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    pie.render_notebook()
    

    4、实现按小时、按天粒度的流量统计

    # Look at the data again before parsing the time column.
    df.head()
    
    
    .dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
    ip stime status client is_spider
    0 106.11.153.226 [02/Dec/2019:22:40:18 200 YisouSpider True
    1 42.156.254.60 [02/Dec/2019:22:40:23 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
    2 106.11.159.254 [02/Dec/2019:22:40:27 200 YisouSpider True
    3 106.11.157.254 [02/Dec/2019:22:40:28 200 YisouSpider True
    4 42.156.137.109 [02/Dec/2019:22:40:30 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
    # Parse the request time into datetime64 values.
    # Values look like "[02/Dec/2019:22:40:18", so the first character (the
    # leading "[") is sliced off before parsing. The "+0800]" offset sits in a
    # separate raw column that was not kept, so the result is timezone-naive.
    df["stime"] = pd.to_datetime(
        df["stime"].str.slice(start=1),
        format="%d/%b/%Y:%H:%M:%S",
    )
    df.head()
    
    
    .dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
    ip stime status client is_spider
    0 106.11.153.226 2019-12-02 22:40:18 200 YisouSpider True
    1 42.156.254.60 2019-12-02 22:40:23 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
    2 106.11.159.254 2019-12-02 22:40:27 200 YisouSpider True
    3 106.11.157.254 2019-12-02 22:40:28 200 YisouSpider True
    4 42.156.137.109 2019-12-02 22:40:30 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
    # Use the parsed request time as the index and order rows chronologically;
    # a sorted DatetimeIndex is what the time-based resampling step relies on.
    df = df.set_index("stime").sort_index()
    df.head()
    
    .dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
    ip status client is_spider
    stime
    2019-12-02 22:40:18 106.11.153.226 200 YisouSpider True
    2019-12-02 22:40:23 42.156.254.60 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
    2019-12-02 22:40:27 106.11.159.254 200 YisouSpider True
    2019-12-02 22:40:28 106.11.157.254 200 YisouSpider True
    2019-12-02 22:40:30 42.156.137.109 201 Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 True
    # Confirm the index is now a DatetimeIndex.
    df.index
    
    
    
        DatetimeIndex(['2019-12-02 22:40:18', '2019-12-02 22:40:23',
                       '2019-12-02 22:40:27', '2019-12-02 22:40:28',
                       '2019-12-02 22:40:30', '2019-12-02 22:40:46',
                       '2019-12-02 22:41:52', '2019-12-02 22:41:52',
                       '2019-12-02 22:41:55', '2019-12-02 22:42:16',
                       ...
                       '2019-12-07 21:30:16', '2019-12-07 21:30:17',
                       '2019-12-07 21:30:19', '2019-12-07 21:30:20',
                       '2019-12-07 21:30:21', '2019-12-07 21:30:22',
                       '2019-12-07 21:30:23', '2019-12-07 21:30:56',
                       '2019-12-07 21:30:58', '2019-12-07 21:31:02'],
                      dtype='datetime64[ns]', name='stime', length=50278, freq=None)
    
    
    
    # Aggregate traffic per time bucket:
    #   pv = number of requests in the bucket,
    #   uv = number of distinct client IPs in the bucket.
    # The string aliases "size" / "nunique" replace np.size / pd.Series.nunique:
    # pandas >= 2.1 emits a FutureWarning when np.size is passed to agg, and
    # the aliases dispatch to the built-in grouped implementations.
    #
    # Other granularities (spell the alias "H" / "6H" on pandas < 2.2):
    #   hourly:        df.resample("h")["ip"].agg(pv="size", uv="nunique")
    #   every 6 hours: df.resample("6h")["ip"].agg(pv="size", uv="nunique")

    # Daily buckets.
    df_pvuv = df.resample("D")["ip"].agg(pv="size", uv="nunique")

    df_pvuv.head()
    
    .dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
    pv uv
    stime
    2019-12-02 288 70
    2019-12-03 10285 1180
    2019-12-04 13618 1197
    2019-12-05 10485 1152
    2019-12-06 9469 1261
    # Line chart comparing PV and UV over the resampled time buckets.
    # The x-axis is the bucket timestamps; each metric is its own series.
    line = Line()
    line.add_xaxis(df_pvuv.index.to_list())
    line.add_yaxis("PV", df_pvuv["pv"].to_list())
    line.add_yaxis("UV", df_pvuv["uv"].to_list())
    line.set_global_opts(
        title_opts=opts.TitleOpts(title="PVUV数据对比"),
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
    )
    line.render_notebook()
    

    本文使用 文章同步助手 同步

    相关文章

      网友评论

          本文标题:26 Pandas处理分析网站原始访问日志

          本文链接:https://www.haomeiwen.com/subject/tatjtdtx.html