26 Pandas处理分析网站原始访问日志

作者: Viterbi | 来源:发表于2022-11-15 19:20 被阅读0次

26 Pandas处理分析网站原始访问日志

目标：真实项目的实战，探索Pandas的数据处理与分析

实例数据来源：我自己的 WordPress 博客 http://www.crazyant.net/ 的访问日志

实现步骤：
1、读取数据、清理、格式化
2、统计爬虫spider的访问比例，输出柱状图
3、统计http状态码的访问占比，输出饼图
4、统计按小时、按天的PV/UV流量趋势，输出折线图

1、读取数据并清理格式化

import pandas as pd
import numpy as np

# Do not truncate cell contents when DataFrames are displayed (the user-agent
# strings in this log are long). None means "no limit"; the legacy value -1
# has been deprecated since pandas 1.0 and is not accepted by newer releases.
pd.set_option("display.max_colwidth", None)

from pyecharts import options as opts
from pyecharts.charts import Bar,Pie,Line

# Read every file in the log directory and merge them into one DataFrame.
import os

data_dir = "./datas/crazyant/blog_access_log"

# sorted() makes the load order deterministic (os.listdir() order is arbitrary).
# on_bad_lines="warn" skips rows whose field count does not match and reports
# them, which is what error_bad_lines=False did; that keyword was deprecated in
# pandas 1.3 and removed in pandas 2.0.
df_list = [
    pd.read_csv(
        os.path.join(data_dir, fname),
        sep=" ",
        header=None,
        on_bad_lines="warn",
    )
    for fname in sorted(os.listdir(data_dir))
]

df = pd.concat(df_list)

    b'Skipping line 2245: expected 10 fields, saw 16\nSkipping line 2889: expected 10 fields, saw 14\nSkipping line 2890: expected 10 fields, saw 14\nSkipping line 2891: expected 10 fields, saw 13\nSkipping line 2892: expected 10 fields, saw 13\nSkipping line 2900: expected 10 fields, saw 11\nSkipping line 2902: expected 10 fields, saw 11\nSkipping line 3790: expected 10 fields, saw 14\nSkipping line 3791: expected 10 fields, saw 14\nSkipping line 3792: expected 10 fields, saw 13\nSkipping line 3793: expected 10 fields, saw 13\nSkipping line 3833: expected 10 fields, saw 11\nSkipping line 3835: expected 10 fields, saw 11\nSkipping line 9936: expected 10 fields, saw 16\n'
    b'Skipping line 11748: expected 10 fields, saw 11\nSkipping line 11750: expected 10 fields, saw 11\n'
    



# Preview the first rows of the merged raw log table (columns are still 0-9).
df.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	0	1	2	3	4	5	6	7	8	9
0	106.11.153.226	-	-	[02/Dec/2019:22:40:18	+0800]	GET /740.html?replytocom=1194 HTTP/1.0	200	13446	-	YisouSpider
1	42.156.254.60	-	-	[02/Dec/2019:22:40:23	+0800]	POST /wp-json/wordpress-popular-posts/v1/popular-posts HTTP/1.0	201	55	http://www.crazyant.net/740.html?replytocom=1194	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
2	106.11.159.254	-	-	[02/Dec/2019:22:40:27	+0800]	GET /576.html HTTP/1.0	200	13461	-	YisouSpider
3	106.11.157.254	-	-	[02/Dec/2019:22:40:28	+0800]	GET /?lwfcdw=t9n2d3&oqzohc=m5e7j1&oubyvq=iab6a3&oudmbg=6osqd3 HTTP/1.0	200	10485	-	YisouSpider
4	42.156.137.109	-	-	[02/Dec/2019:22:40:30	+0800]	POST /wp-json/wordpress-popular-posts/v1/popular-posts HTTP/1.0	201	55	http://www.crazyant.net/576.html	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36

# Keep only raw columns 0, 3, 6 and 9. In the preview these hold the client
# IP, the bracketed request timestamp, the HTTP status code and the
# user-agent string. .copy() detaches the result from the original frame.
df = df.loc[:, [0, 3, 6, 9]].copy()
df.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	0	3	6	9
0	106.11.153.226	[02/Dec/2019:22:40:18	200	YisouSpider
1	42.156.254.60	[02/Dec/2019:22:40:23	201	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
2	106.11.159.254	[02/Dec/2019:22:40:27	200	YisouSpider
3	106.11.157.254	[02/Dec/2019:22:40:28	200	YisouSpider
4	42.156.137.109	[02/Dec/2019:22:40:30	201	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36

# Give the four remaining columns descriptive names (assigned by position).
df.columns = ["ip", "stime", "status", "client"]
df.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	ip	stime	status	client
0	106.11.153.226	[02/Dec/2019:22:40:18	200	YisouSpider
1	42.156.254.60	[02/Dec/2019:22:40:23	201	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36
2	106.11.159.254	[02/Dec/2019:22:40:27	200	YisouSpider
3	106.11.157.254	[02/Dec/2019:22:40:28	200	YisouSpider
4	42.156.137.109	[02/Dec/2019:22:40:30	201	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36

# Inspect the dtype pandas inferred for each column.
df.dtypes




    ip        object
    stime     object
    status    int64 
    client    object
    dtype: object

2、统计spider的比例

# Flag requests whose user-agent string contains "spider" (case-insensitive
# via lower-casing first).
# na=False maps a missing user agent to False, so the column stays purely
# boolean and those rows are not dropped by a later value_counts().
# regex=False matches "spider" as a plain substring.
df["is_spider"] = (
    df["client"].str.lower().str.contains("spider", na=False, regex=False)
)
df.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	ip	stime	status	client	is_spider
0	106.11.153.226	[02/Dec/2019:22:40:18	200	YisouSpider	True
1	42.156.254.60	[02/Dec/2019:22:40:23	201	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36	True
2	106.11.159.254	[02/Dec/2019:22:40:27	200	YisouSpider	True
3	106.11.157.254	[02/Dec/2019:22:40:28	200	YisouSpider	True
4	42.156.137.109	[02/Dec/2019:22:40:30	201	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36	True

# Count how many requests fall under each is_spider value (True / False).
df_spider = df["is_spider"].value_counts()
df_spider



    False    46641
    True     3637 
    Name: is_spider, dtype: int64




# Bar chart of request counts for spider vs. non-spider traffic.
# The original chains these calls and keeps the result as the chart, so
# configuring the chart step by step is equivalent.
bar = Bar()
# Category labels are the stringified is_spider values.
bar.add_xaxis([str(flag) for flag in df_spider.index])
bar.add_yaxis("是否Spider", df_spider.values.tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="爬虫访问量占比"))
bar.render_notebook()

3、访问状态码的数量对比

# Number of requests per HTTP status code, indexed by the status value.
df_status = df.groupby("status").size()
df_status



    status
    200    41924
    201    3432 
    206    70   
    301    2364 
    302    23   
    304    19   
    400    20   
    403    92   
    404    1474 
    405    12   
    444    846  
    500    1    
    504    1    
    dtype: int64


# Show the (status code, request count) pairs in the form the pie chart takes.
[*zip(df_status.index, df_status)]




    [(200, 41924),
     (201, 3432),
     (206, 70),
     (301, 2364),
     (302, 23),
     (304, 19),
     (400, 20),
     (403, 92),
     (404, 1474),
     (405, 12),
     (444, 846),
     (500, 1),
     (504, 1)]



# Pie chart of the share of each HTTP status code.
pie = Pie()
# Data is a list of (status code, request count) pairs.
pie.add("状态码比例", list(zip(df_status.index, df_status)))
# Slice labels use the "{b}: {c}" template (ECharts name / value placeholders).
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
pie.render_notebook()

4、实现按小时、按天粒度的流量统计

# Look at the table again; stime is still raw text such as "[02/Dec/2019:22:40:18".
df.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	ip	stime	status	client	is_spider
0	106.11.153.226	[02/Dec/2019:22:40:18	200	YisouSpider	True
1	42.156.254.60	[02/Dec/2019:22:40:23	201	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36	True
2	106.11.159.254	[02/Dec/2019:22:40:27	200	YisouSpider	True
3	106.11.157.254	[02/Dec/2019:22:40:28	200	YisouSpider	True
4	42.156.137.109	[02/Dec/2019:22:40:30	201	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36	True

# Parse the raw timestamp text into datetime values.
# .str[1:] drops the leading "[" and the explicit format matches what is
# left, e.g. "02/Dec/2019:22:40:18". The "+0800" offset sits in a separate
# raw column that was not kept, so the parsed values carry no timezone.
df["stime"] = pd.to_datetime(
    df["stime"].str[1:],
    format="%d/%b/%Y:%H:%M:%S",
)
df.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	ip	stime	status	client	is_spider
0	106.11.153.226	2019-12-02 22:40:18	200	YisouSpider	True
1	42.156.254.60	2019-12-02 22:40:23	201	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36	True
2	106.11.159.254	2019-12-02 22:40:27	200	YisouSpider	True
3	106.11.157.254	2019-12-02 22:40:28	200	YisouSpider	True
4	42.156.137.109	2019-12-02 22:40:30	201	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36	True

# Use the request time as the index and order the rows by it; the resample
# step further down operates on this datetime index.
df = df.set_index("stime").sort_index()
df.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	ip	status	client	is_spider
stime
2019-12-02 22:40:18	106.11.153.226	200	YisouSpider	True
2019-12-02 22:40:23	42.156.254.60	201	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36	True
2019-12-02 22:40:27	106.11.159.254	200	YisouSpider	True
2019-12-02 22:40:28	106.11.157.254	200	YisouSpider	True
2019-12-02 22:40:30	42.156.137.109	201	Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36	True

# Confirm that the index is now a DatetimeIndex named "stime".
df.index



    DatetimeIndex(['2019-12-02 22:40:18', '2019-12-02 22:40:23',
                   '2019-12-02 22:40:27', '2019-12-02 22:40:28',
                   '2019-12-02 22:40:30', '2019-12-02 22:40:46',
                   '2019-12-02 22:41:52', '2019-12-02 22:41:52',
                   '2019-12-02 22:41:55', '2019-12-02 22:42:16',
                   ...
                   '2019-12-07 21:30:16', '2019-12-07 21:30:17',
                   '2019-12-07 21:30:19', '2019-12-07 21:30:20',
                   '2019-12-07 21:30:21', '2019-12-07 21:30:22',
                   '2019-12-07 21:30:23', '2019-12-07 21:30:56',
                   '2019-12-07 21:30:58', '2019-12-07 21:31:02'],
                  dtype='datetime64[ns]', name='stime', length=50278, freq=None)



# PV/UV per time bucket, computed from the "ip" column:
#   pv = number of requests in the bucket,
#   uv = number of distinct IPs in the bucket.
# The aggregators are given by their pandas names ("size", "nunique").
#
# Other granularities only need a different resample rule:
#   hourly statistics:  df.resample("H")["ip"].agg(pv="size", uv="nunique")
#   every six hours:    df.resample("6H")["ip"].agg(pv="size", uv="nunique")

# Daily statistics
df_pvuv = df.resample("D")["ip"].agg(pv="size", uv="nunique")

df_pvuv.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>

	pv	uv
stime
2019-12-02	288	70
2019-12-03	10285	1180
2019-12-04	13618	1197
2019-12-05	10485	1152
2019-12-06	9469	1261

# Line chart comparing the PV and UV series over the resampled time buckets.
line = Line()
line.add_xaxis(df_pvuv.index.to_list())
line.add_yaxis("PV", df_pvuv["pv"].to_list())
line.add_yaxis("UV", df_pvuv["uv"].to_list())
line.set_global_opts(
    title_opts=opts.TitleOpts(title="PVUV数据对比"),
    # Axis-triggered tooltip with a cross-style pointer.
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
)
line.render_notebook()

本文使用文章同步助手同步

网友评论

本文标题：26 Pandas处理分析网站原始访问日志

本文链接：https://www.haomeiwen.com/subject/tatjtdtx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！

26 Pandas处理分析网站原始访问日志