项目背景:
本文是基于上篇文章的延伸Adventure Works数据库自行车业务分析
为满足业务自主分析的需求,考虑将计算的结果进行可视化的动态页面展示。
具体实现:
一、mysql数据源观察利用
两个数据库:
- 主数据库(只读):公司的生产数据库,专门有ETL工程师在维护。
-
部门数据库(可增删查改):部门专门的数据库,可以通过读取公司生产数据库来加工保存到该数据库中。
主数据库
部门数据库
二、和业务沟通需要的自主分析数据指标
经和业务沟通后,整理后的数据字典部分如下图所示:
数据字典
三、根据基础数据读取,利用python进行加工
1、dw_order_by_day
(按每日汇总数据,并加上每日环比)
目的:对订单明细表(ods_sales_orders)聚合,形成dw_order_by_day
数据库:adventure_ods
读取数据表:
- ods_sales_order(订单明细表)
import pandas as pd
import random
import pymysql
pymysql.install_as_MySQLdb()
import sqlalchemy
adventure_conn_read = sqlalchemy.create_engine('mysql://用户名:密码@ip地址/adventure_ods')
sum_amount_order = pd.read_sql_query("select * from ods_sales_orders ",
con=adventure_conn_read)
- dim_date_df(每日更新的维度表)
利用py对读取的数据表进行加工
(1)sum_amount_order(按天计算订单量和交易金额)
实现思路:读取ods_sales_order(订单明细表),根据create_date聚合,求总销量/订单量/客单价
#根据create_date分组求总和单价和客户数量
sum_amount_order=sum_amount_order.groupby(by='create_date').agg(
{'unit_price':sum,'customer_key':pd.Series.nunique}).reset_index()
sum_amount_order.rename(columns={'unit_price':'sum_amount',
'customer_key':'sum_order'},
inplace=True)
#客单价
sum_amount_order['amount_div_order']=\
sum_amount_order['sum_amount']/sum_amount_order['sum_order']
sum_amount_order.head()
(2)sum_amount_order_goal(销售订单聚合表+目标值)即销售订单聚合目标表
实现思路:
按照一定规则生成目标值sum_amount_goal+sum_order_goal(目标金额+目标销量)再和销售订单聚合表(sum_amount_order)concat
#利用空列表及循环生成对应随机值
sum_amount_goal_list=[]
sum_order_goal_list=[]
#获取sum_amount_order中的create_date
create_date_list=list(sum_amount_order['create_date'])
for i in create_date_list:
#生成一个在[0.85,1.1]随机数
a=random.uniform(0.85,1.1)
b=random.uniform(0.85,1.1)
# 对应日期下生成总金额(sum_amount)*a 的列
amount_goal=list(sum_amount_order[sum_amount_order['create_date']==i]
['sum_amount'])[0]*a
# 对应日期下生成总订单数(sum_order)*b 的列
order_goal=list(sum_amount_order[sum_amount_order['create_date']==i]
['sum_order'])[0]*b
#将生成的目标值加入空列表
sum_amount_goal_list.append(amount_goal)
sum_order_goal_list.append(order_goal)
(3)date_info
实现思路:
基于dim_date_df(日期维度表)
date_sql = """
select create_date,
is_current_year,
is_last_year,
is_yesterday,
is_today,
is_current_month,
is_current_quarter
from dim_date_df
"""
date_info = pd.read_sql_query(date_sql, con=adventure_conn_tosql)
date_info
(4)融合上面的数据
"""
输入:
sum_amount_order_goal销量订单聚合目标表,
date_info日期维度表
输出:
amount_order_by_day销量订单聚合目标及日期维度表
"""
#查看create_date
sum_amount_order_goal['create_date'].iloc[1]
#转化create_date格式为标准日期格式
sum_amount_order_goal['create_date']=sum_amount_order_goal['create_date'].apply(lambda x:x.strftime('%Y-%m-%d'))
#通过主键create_date连接日期维度
amount_order_by_day=pd.merge(sum_amount_order_goal,date_info,
on='create_date',how='inner')
amount_order_by_day
(5)订单数据进行存储
"""
待存储表:amount_order_by_day
欲存储engine:adventure_conn_tosql
"""
#将amount_order_by_day数据追加到数据库dw_order_by_day(每日环比表)当中
#先计算环比,因为日期已经是升序排列了,所以直接利用pct_change()即可
# pct_change()表示当前元素与先前元素的相差百分比,默认竖向,例:前面元素x,当前元素y,公式 result = (y-x)/x
amount_order_by_day['amount_diff']=amount_order_by_day['sum_amount'].pct_change().fillna(0)
adventure_conn_tosql = sqlalchemy.create_engine('mysql+pymysql://用户名:密码@ip地址/datafrog05_adventure')
amount_order_by_day.to_sql('dw_order_by_day_zhaoyqiu', con=adventure_conn_tosql,
if_exists='append', index=False) # 追加数据至dw_order_by_day
2、dw_amount_diff(当日维度表)
目的:读取dw_order_by_day 生成dw_amount_diff(当日维度表)(按当天/昨天/当月/当季/当年的同比)
数据库:adventure_ods
数据表:dw_order_by_day
准备工作:
import datetime
from datetime import timedelta
import pandas as pd
import pymysql
pymysql.install_as_MySQLdb()
import sqlalchemy
import os
#结果保存路径
output_path='F:/some_now/pro_output'
if not os.path.exists(output_path):
os.makedirs(output_path)
# engine = create_engine('mysql+pymysql://用户名:密码@ip地址:端口号/数据库名称')
adventure_conn_read = sqlalchemy.create_engine('mysql://用户名:密码@ip地址:端口号/adventure_ods')
adventure_conn_tosql = sqlalchemy.create_engine('mysql+pymysql://用户名:密码@ip地址:端口号/datafrog05_adventure')
dw_order_by_day = pd.read_sql_query("select * from dw_order_by_day ",
con=adventure_conn_read)
先转换一下create_date的格式
#create_date转化为日期格式
dw_order_by_day['create_date']=pd.to_datetime(dw_order_by_day['create_date'])
def diff(stage,indictor):
"""
stage:日期维度的判断,如:is_today内有[0,1]
indictor:需取值字段,如:sum_amount(总金额),sum_order(总订单量)
输出:当前时间维度下总和,如年同期总和
"""
#求当前日期维度stage下的indictor总和
current_stage_indictor=dw_order_by_day[dw_order_by_day
[stage]==1][indictor].sum()
#取出当前日期维度下的前年对应日期列表
before_stage_list=list(dw_order_by_day[dw_order_by_day[stage]==1]
['create_date']+timedelta(days=-365))
#求当前日期维度下的前一年对应indictor总和
before_stage_indictor=dw_order_by_day[dw_order_by_day['create_date']
.isin(before_stage_list)][indictor].sum()
return current_stage_indictor,before_stage_indictor
#各阶段的金额(sum_amount)
today_amount, before_year_today_amount = diff('is_today', 'sum_amount')
yesterday_amount, before_year_yesterday_amount = diff('is_yesterday', 'sum_amount')
month_amount, before_year_month_amount = diff('is_current_month', 'sum_amount')
quarter_amount, before_year_quarter_amount = diff('is_current_quarter', 'sum_amount')
year_amount, before_year_year_amount = diff('is_current_year', 'sum_amount')
#各阶段的订单数(sum_order)
today_order, before_year_today_order = diff('is_today', 'sum_order')
yesterday_order, before_year_yesterday_order = diff('is_yesterday', 'sum_order')
month_order, before_year_month_order = diff('is_current_month', 'sum_order')
quarter_order, before_year_quarter_order = diff('is_current_quarter', 'sum_order')
year_order, before_year_year_order = diff('is_current_year', 'sum_order')
同比增长或同比下降(均与去年对比):总金额/订单量/客单价,当日/昨日/当月/当季/当年/
如:今天订单量110 前年今天订单量100,则110/100-1 = 0.1,增长10%
amount_dic = {'today_diff': [today_amount / before_year_today_amount - 1,
today_order / before_year_today_order - 1,
(today_amount / today_order) / (before_year_today_amount /
before_year_today_order) - 1],
'yesterday_diff': [yesterday_amount / before_year_yesterday_amount - 1,
yesterday_order / before_year_yesterday_order - 1,
(yesterday_amount / yesterday_order) / (before_year_yesterday_amount /
before_year_yesterday_order) - 1],
'month_diff': [month_amount / before_year_month_amount - 1,
month_order / before_year_month_order - 1,
(month_amount / month_order) / (before_year_month_amount /
before_year_month_order) - 1],
'quarter_diff': [quarter_amount / before_year_quarter_amount - 1,
quarter_order / before_year_quarter_order - 1,
(quarter_amount / quarter_order) / (before_year_quarter_amount /
before_year_quarter_order) - 1],
'year_diff': [year_amount / before_year_year_amount - 1,
year_order / before_year_year_order - 1,
(year_amount / year_order) / (before_year_year_amount /
before_year_year_order) - 1],
'flag': ['amount', 'order', 'avg']} # 做符号简称,横向提取数据方便
amount_diff = pd.DataFrame(amount_dic)
amount_diff
amount_diff.to_sql('dw_amount_diff_zhaoyqiu', con=adventure_conn_tosql,if_exists='append', index=False) # 存储为当日维度表
3、update_sum_data(生成时间地区产品聚合表)
目的:输出sum_customer_order(时间地区产品聚合表)
数据库:
adventure_ods
数据表:
ods_sales_orders(订单明细表)→今日数据
ods_customer(每日新增用户表)
dim_date_df(日期维度表)
py加工表
# 读取每日新增用户表ods_customer
customer_sql="""
select customer_key,
chinese_territory,
chinese_province,
chinese_city
from ods_customer"""
customer_info=pd.read_sql_query(customer_sql,con=adventure_conn_read)
def order_data(today_date, tomorrow_date, adventure_conn_read):
'''
读取今日的ods_sales_orders(订单明细表)
:param tomorrow_date:
:param today_date:
:param adventure_conn_read: 读取数据库
:return:
'''
order_sql = """select sales_order_key,
create_date,
customer_key,
english_product_name,
cpzl_zw,
cplb_zw,
unit_price
from ods_sales_orders where create_date>='{today_date}' and create_date<'{tomorrow_date}'
""".format(today_date=today_date, tomorrow_date=tomorrow_date)
order_info = pd.read_sql_query(order_sql, con=adventure_conn_read)
return order_info
def date_data(adventure_conn_to_sql):
#读取日期维度表dim_date_df
date_sql="""
select create_date,
is_current_year,
is_last_year,
is_yesterday,
is_today,
is_current_month,
is_current_quarter
from dim_date_df"""
date_info = pd.read_sql_query(date_sql, con=adventure_conn_tosql)
return date_info
def sum_data(order_info,customer_info,date_info):
"""
order_info:当天订单明细表ods_sales_orders
customer_info:每日新增用户表ods_customer
date_info:日期维度表
"""
#通过客户编号连接表
sales_customer_order=pd.merge(order_info,customer_info,left_on='customer_key',
right_on='customer_key',how='left')
# 提取订单主键/订单日期/客户编号/产品名/产品子类/产品类别/产品单价/所在区域/所在省份/所在城市
sales_customer_order=sales_customer_order[["sales_order_key", "create_date", "customer_key",
"english_product_name", "cpzl_zw", "cplb_zw", "unit_price",
"chinese_territory",
"chinese_province",
"chinese_city"]]
# 形成按照 订单日期/产品名/产品子类/产品类别/所在区域/所在省份/所在城市的逐级聚合表,获得订单总量/客户总量/销售总金额
sum_customer_order = sales_customer_order.groupby(["create_date", "english_product_name", "cpzl_zw", "cplb_zw",
"chinese_territory", "chinese_province",
"chinese_city"], as_index=False). \
agg({'sales_order_key': pd.Series.nunique, 'customer_key': pd.Series.nunique,
"unit_price": "sum"}).rename(columns={'sales_order_key': 'order_num', \
'customer_key': 'customer_num', 'unit_price': 'sum_amount', \
"english_product_name": "product_name"})
# 转化订单日期为字符型格式
sum_customer_order['create_date'] = sum_customer_order['create_date'].apply(
lambda x: x.strftime('%Y-%m-%d'))
sum_customer_order = pd.merge(sum_customer_order, date_info, on='create_date', how='inner') # 获取当日日期维度
return sum_customer_order
#获取昨日时间
start_time=datetime.date.today()+datetime.timedelta(days=-1)
#获取今日时间
end_time=datetime.date.today()
#获取天数
interval_num=(end_time-start_time).days
for i in range(1,interval_num+1):
start_date=(start_time+datetime.timedelta(days=i)).strftime('%Y-%m-%d')
end_date=(start_time+datetime.timedelta(days=i+1)).strftime('%Y-%m-%d')
#获取订单信息
order_info=order_data(start_date,end_date,adventure_conn_read)
#获取时间信息表
date_info=date_data(adventure_conn_read)
#将订单信息和商户信息进行融合并汇总
# 形成按照 订单日期/产品名/产品子类/产品类别/所在区域/所在省份/所在城市的逐级聚合表,获得订单总量/客户总量/销售总金额+当日日期维度
sum_customer_order=sum_data(order_info,customer_info,date_info)
网友评论