26 Pandas处理分析网站原始访问日志
目标:真实项目的实战,探索Pandas的数据处理与分析
实例:
数据来源:我自己的wordpress博客http://www.crazyant.net/ 的访问日志
实现步骤:
1、读取数据、清理、格式化
2、统计爬虫spider的访问比例,输出柱状图
3、统计http状态码的访问占比,输出饼图
4、统计按小时、按天的PV/UV流量趋势,输出折线图
1、读取数据并清理格式化
import pandas as pd
import numpy as np
# Show full cell contents (e.g. long user-agent strings) instead of truncating.
# None is the supported "no limit" value; the legacy -1 sentinel has been
# deprecated since pandas 1.0 and is no longer accepted by newer releases.
pd.set_option("display.max_colwidth", None)
from pyecharts import options as opts
from pyecharts.charts import Bar,Pie,Line
import os

# Directory containing the raw blog access-log files.
data_dir = "./datas/crazyant/blog_access_log"

# Parse every file in the directory as space-separated text with no header row.
# Rows whose field count does not match are reported and skipped:
# on_bad_lines="warn" replaces the removed error_bad_lines=False flag.
df_list = [
    pd.read_csv(
        os.path.join(data_dir, fname),
        sep=" ",
        header=None,
        on_bad_lines="warn",
    )
    for fname in os.listdir(data_dir)
]

# Stack the per-file frames into a single DataFrame.
df = pd.concat(df_list)
b'Skipping line 2245: expected 10 fields, saw 16\nSkipping line 2889: expected 10 fields, saw 14\nSkipping line 2890: expected 10 fields, saw 14\nSkipping line 2891: expected 10 fields, saw 13\nSkipping line 2892: expected 10 fields, saw 13\nSkipping line 2900: expected 10 fields, saw 11\nSkipping line 2902: expected 10 fields, saw 11\nSkipping line 3790: expected 10 fields, saw 14\nSkipping line 3791: expected 10 fields, saw 14\nSkipping line 3792: expected 10 fields, saw 13\nSkipping line 3793: expected 10 fields, saw 13\nSkipping line 3833: expected 10 fields, saw 11\nSkipping line 3835: expected 10 fields, saw 11\nSkipping line 9936: expected 10 fields, saw 16\n'
b'Skipping line 11748: expected 10 fields, saw 11\nSkipping line 11750: expected 10 fields, saw 11\n'
# Preview the first rows of the combined frame; columns are still the
# positional labels 0..9 because the files were read with header=None.
df.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
<pre><code>.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</code></pre>
|
0 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
0 |
106.11.153.226 |
- |
- |
[02/Dec/2019:22:40:18 |
+0800] |
GET /740.html?replytocom=1194 HTTP/1.0 |
200 |
13446 |
- |
YisouSpider |
1 |
42.156.254.60 |
- |
- |
[02/Dec/2019:22:40:23 |
+0800] |
POST /wp-json/wordpress-popular-posts/v1/popular-posts HTTP/1.0 |
201 |
55 |
http://www.crazyant.net/740.html?replytocom=1194 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
2 |
106.11.159.254 |
- |
- |
[02/Dec/2019:22:40:27 |
+0800] |
GET /576.html HTTP/1.0 |
200 |
13461 |
- |
YisouSpider |
3 |
106.11.157.254 |
- |
- |
[02/Dec/2019:22:40:28 |
+0800] |
GET /?lwfcdw=t9n2d3&oqzohc=m5e7j1&oubyvq=iab6a3&oudmbg=6osqd3 HTTP/1.0 |
200 |
10485 |
- |
YisouSpider |
4 |
42.156.137.109 |
- |
- |
[02/Dec/2019:22:40:30 |
+0800] |
POST /wp-json/wordpress-popular-posts/v1/popular-posts HTTP/1.0 |
201 |
55 |
http://www.crazyant.net/576.html |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
# Keep only positional columns 0, 3, 6 and 9 (shown in the preview as client
# IP, timestamp, status code and user agent) and detach from the raw frame.
df = df.loc[:, [0, 3, 6, 9]].copy()
df.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
<pre><code>.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</code></pre>
|
0 |
3 |
6 |
9 |
0 |
106.11.153.226 |
[02/Dec/2019:22:40:18 |
200 |
YisouSpider |
1 |
42.156.254.60 |
[02/Dec/2019:22:40:23 |
201 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
2 |
106.11.159.254 |
[02/Dec/2019:22:40:27 |
200 |
YisouSpider |
3 |
106.11.157.254 |
[02/Dec/2019:22:40:28 |
200 |
YisouSpider |
4 |
42.156.137.109 |
[02/Dec/2019:22:40:30 |
201 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
# Give the four retained positional columns descriptive names.
df = df.rename(columns={0: "ip", 3: "stime", 6: "status", 9: "client"})
df.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
<pre><code>.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</code></pre>
|
ip |
stime |
status |
client |
0 |
106.11.153.226 |
[02/Dec/2019:22:40:18 |
200 |
YisouSpider |
1 |
42.156.254.60 |
[02/Dec/2019:22:40:23 |
201 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
2 |
106.11.159.254 |
[02/Dec/2019:22:40:27 |
200 |
YisouSpider |
3 |
106.11.157.254 |
[02/Dec/2019:22:40:28 |
200 |
YisouSpider |
4 |
42.156.137.109 |
[02/Dec/2019:22:40:30 |
201 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
# Inspect column dtypes: "stime" is still a plain object (string) column here.
df.dtypes
ip object
stime object
status int64
client object
dtype: object
2、统计spider的比例
# Flag crawler traffic: a request counts as a spider when its lower-cased
# user agent contains the literal text "spider".
# regex=False treats the pattern as a plain substring; na=False makes rows with
# a missing user agent count as non-spider instead of yielding NaN, which
# value_counts() would silently drop from the totals.
df["is_spider"] = df["client"].str.lower().str.contains("spider", regex=False, na=False)
df.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
<pre><code>.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</code></pre>
|
ip |
stime |
status |
client |
is_spider |
0 |
106.11.153.226 |
[02/Dec/2019:22:40:18 |
200 |
YisouSpider |
True |
1 |
42.156.254.60 |
[02/Dec/2019:22:40:23 |
201 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
True |
2 |
106.11.159.254 |
[02/Dec/2019:22:40:27 |
200 |
YisouSpider |
True |
3 |
106.11.157.254 |
[02/Dec/2019:22:40:28 |
200 |
YisouSpider |
True |
4 |
42.156.137.109 |
[02/Dec/2019:22:40:30 |
201 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
True |
# Number of requests per is_spider value, most frequent first.
df_spider = df.loc[:, "is_spider"].value_counts()
df_spider
False 46641
True 3637
Name: is_spider, dtype: int64
# Bar chart of spider vs. non-spider request counts.
bar = Bar()
# Stringify the True/False index values for use as x-axis categories.
bar.add_xaxis(list(map(str, df_spider.index)))
bar.add_yaxis("是否Spider", df_spider.values.tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="爬虫访问量占比"))
bar.render_notebook()
3、访问状态码的数量对比
# Request count for each HTTP status code (one row per distinct status).
df_status = df.groupby(by="status").size()
df_status
status
200 41924
201 3432
206 70
301 2364
302 23
304 19
400 20
403 92
404 1474
405 12
444 846
500 1
504 1
dtype: int64
# Show the (status code, count) pairs in the list-of-tuples shape that is
# passed to the pie chart below.
list(zip(df_status.index, df_status))
[(200, 41924),
(201, 3432),
(206, 70),
(301, 2364),
(302, 23),
(304, 19),
(400, 20),
(403, 92),
(404, 1474),
(405, 12),
(444, 846),
(500, 1),
(504, 1)]
# Pie chart of request share per HTTP status code.
pie = Pie()
pie.add("状态码比例", [(code, count) for code, count in df_status.items()])
# Label each slice as "<name>: <value>" using the ECharts {b}/{c} placeholders.
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
pie.render_notebook()
4、实现按小时、按天粒度的流量统计(下面以按天为例;把 resample("D") 换成 resample("H") 即可得到按小时的结果)
# Preview the frame again: "stime" still holds raw strings such as
# "[02/Dec/2019:22:40:18" with a leading bracket.
df.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
<pre><code>.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</code></pre>
|
ip |
stime |
status |
client |
is_spider |
0 |
106.11.153.226 |
[02/Dec/2019:22:40:18 |
200 |
YisouSpider |
True |
1 |
42.156.254.60 |
[02/Dec/2019:22:40:23 |
201 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
True |
2 |
106.11.159.254 |
[02/Dec/2019:22:40:27 |
200 |
YisouSpider |
True |
3 |
106.11.157.254 |
[02/Dec/2019:22:40:28 |
200 |
YisouSpider |
True |
4 |
42.156.137.109 |
[02/Dec/2019:22:40:30 |
201 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
True |
# Drop the first character (the leading "[") and parse the remainder as a
# day/abbreviated-month/year:hour:minute:second timestamp.
df["stime"] = pd.to_datetime(
    df["stime"].str.slice(start=1),
    format="%d/%b/%Y:%H:%M:%S",
)
df.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
<pre><code>.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</code></pre>
|
ip |
stime |
status |
client |
is_spider |
0 |
106.11.153.226 |
2019-12-02 22:40:18 |
200 |
YisouSpider |
True |
1 |
42.156.254.60 |
2019-12-02 22:40:23 |
201 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
True |
2 |
106.11.159.254 |
2019-12-02 22:40:27 |
200 |
YisouSpider |
True |
3 |
106.11.157.254 |
2019-12-02 22:40:28 |
200 |
YisouSpider |
True |
4 |
42.156.137.109 |
2019-12-02 22:40:30 |
201 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
True |
# Use the parsed timestamp as the index and order rows chronologically so the
# frame can be resampled by time.
df = df.set_index("stime").sort_index()
df.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
<pre><code>.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</code></pre>
|
ip |
status |
client |
is_spider |
stime |
|
|
|
|
2019-12-02 22:40:18 |
106.11.153.226 |
200 |
YisouSpider |
True |
2019-12-02 22:40:23 |
42.156.254.60 |
201 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
True |
2019-12-02 22:40:27 |
106.11.159.254 |
200 |
YisouSpider |
True |
2019-12-02 22:40:28 |
106.11.157.254 |
200 |
YisouSpider |
True |
2019-12-02 22:40:30 |
42.156.137.109 |
201 |
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 YisouSpider/5.0 Safari/537.36 |
True |
# Confirm the index is now a DatetimeIndex named "stime".
df.index
DatetimeIndex(['2019-12-02 22:40:18', '2019-12-02 22:40:23',
'2019-12-02 22:40:27', '2019-12-02 22:40:28',
'2019-12-02 22:40:30', '2019-12-02 22:40:46',
'2019-12-02 22:41:52', '2019-12-02 22:41:52',
'2019-12-02 22:41:55', '2019-12-02 22:42:16',
...
'2019-12-07 21:30:16', '2019-12-07 21:30:17',
'2019-12-07 21:30:19', '2019-12-07 21:30:20',
'2019-12-07 21:30:21', '2019-12-07 21:30:22',
'2019-12-07 21:30:23', '2019-12-07 21:30:56',
'2019-12-07 21:30:58', '2019-12-07 21:31:02'],
dtype='datetime64[ns]', name='stime', length=50278, freq=None)
# Bucket requests into calendar-day bins ("D") and aggregate the "ip" column:
#   pv = np.size           -> number of requests in the day (page views)
#   uv = pd.Series.nunique -> number of distinct IPs in the day (unique visitors)
df_pvuv = df.resample("D")["ip"].agg(pv=np.size, uv=pd.Series.nunique)
df_pvuv.head()
| stime | pv | uv |
|---|---|---|
| 2019-12-02 | 288 | 70 |
| 2019-12-03 | 10285 | 1180 |
| 2019-12-04 | 13618 | 1197 |
| 2019-12-05 | 10485 | 1152 |
| 2019-12-06 | 9469 | 1261 |
# Line chart with one series each for daily PV and UV, sharing the date x-axis.
line = Line()
line.add_xaxis(df_pvuv.index.to_list())
line.add_yaxis("PV", df_pvuv["pv"].to_list())
line.add_yaxis("UV", df_pvuv["uv"].to_list())
line.set_global_opts(
    title_opts=opts.TitleOpts(title="PVUV数据对比"),
    # Axis-triggered tooltip with a cross-style axis pointer.
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
)
line.render_notebook()
本文使用 文章同步助手 同步
网友评论