*https://github.com/AuroraZiling/TrackingCOVID
data:image/s3,"s3://crabby-images/8711b/8711befcb749e6a42ab2a73426f52990173bc44c" alt=""
data:image/s3,"s3://crabby-images/71ae9/71ae92b705adbb4f1d8d38b9778ac753e075ec9e" alt=""
data:image/s3,"s3://crabby-images/13f2e/13f2ea4c1a16010c641539b6b4662642ab4680c8" alt=""
一、module 安装
https://pypi.org/project/pandas-bokeh/
在 PyCharm 的包管理器中没有搜索到该库,
因此改用 pip 安装:pip install pandas-bokeh
二、主程序 main.py 调用端
"""
看似是main,其实就是个调用端
"""
from modules import updater, generator, renderer
import time
from rich.console import Console
console = Console()
upd = updater.Updater(2022)

# Menu tables: the number typed by the user maps to "internal_key|label"
# (data_aspects) or to a plain display name (the other two tables).
data_aspects = {
    1: "confirmed_new|当日新增确诊",
    2: "confirmed_current|当日已有确诊",
    3: "asymptomatic_new|当日新增无症状",
    4: "asymptomatic_current|当日已有无症状",
    5: "recoveries|当日新增痊愈",
    6: "deaths_new|当日新增死亡",
}
renderer_aspects = {1: "Pygal", 2: "Matplotlib", 3: "Panda_bokeh"}
render_types = {1: "折线图"}


def _prompt_choice(options, prompt):
    """Print a numbered menu and return the value the user picked.

    Re-prompts until the input is an integer that is a key of ``options``,
    instead of crashing with ValueError/KeyError on a typo.
    """
    for number in sorted(options):
        print(f"{number}. {options[number]}")
    while True:
        try:
            choice = int(console.input(prompt))
        except ValueError:
            continue
        if choice in options:
            return options[choice]


console.rule("[green]Tracking COVID-19")

# Keep asking until the answer is usable; any other answer used to leave
# `reply` unbound and crash with NameError further down.
data_mode = ""
while data_mode not in ("y", "n"):
    data_mode = console.input("是否更新数据?[green]Y/N[white]").strip().lower()

with console.status("正在更新疫情数据..."):
    if data_mode == "y":
        reply = upd.download_html()
    else:
        reply = upd.use_backup_html()

# use_backup_html() returns None when no backup file exists yet.
if reply is None:
    console.print("[red]没有可用的备份数据,请先联网更新数据[/]")
    raise SystemExit(1)

# reply is (html, status) or (html, status, backup_time).
status = reply[1]
if status == "online":
    console.print("[green]已成功更新疫情数据[/]")
elif status == "offline":
    console.print(f"[yellow]更新疫情数据失败,已使用近期的备份数据 | 备份时间:{reply[-1]}[/]")
elif status == "backup":
    console.print(f"[yellow]已直接使用备份的数据 | 备份时间:{reply[-1]}[/]")

year = int(console.input("请选择数据年份:"))
if year != 2022:
    raise ValueError("暂时只支持2022年的数据")

original_html = reply[0]

console.rule("[green]数据类型")
# Only the part before "|" is the key understood by the generator.
aspect = _prompt_choice(data_aspects, "请选择数据类型:").split("|")[0]

with console.status("正在生成疫情数据..."):
    gene = generator.Generator(original_html, year, "chinese")
    time.sleep(0.5)
    generated_data = gene.get_proceed_data_sequence(aspect)

console.rule("[green]渲染类型")
render_selection = _prompt_choice(renderer_aspects, "请选择渲染库:")
render_classes = {
    "Pygal": renderer.Pygal_render,
    "Matplotlib": renderer.Matplotlib_render,
    "Panda_bokeh": renderer.Pandas_render,
}
render_class = render_classes.get(render_selection)
if render_class is None:
    raise ValueError("渲染库选择错误")
render = render_class(generated_data, aspect, True, True)

console.rule("[green]图表类型")
chart_type = _prompt_choice(render_types, "请选择渲染图表类型:")
if chart_type == "折线图":
    render.output_as_line()
二、生成模块
from bs4 import BeautifulSoup
# English month name -> month number as a string without zero padding.
months_converter = {
    month_name: str(month_number)
    for month_number, month_name in enumerate(
        (
            "January", "February", "March", "April", "May", "June", "July",
            "August", "September", "October", "November", "December",
        ),
        start=1,
    )
}


def real_time_converter(data_time):
    """Turn a date like "March 26, 2022" into a key like "2022-3-26".

    The input is split on single spaces: the first token is looked up in
    ``months_converter``, the second is the day and the last is the year.
    A day token starting with "0" is reduced to its second character;
    otherwise only its commas are removed.
    """
    tokens = data_time.split(" ")
    year_text = tokens[-1]
    month_text = months_converter[tokens[0]]
    day_token = tokens[1]
    if day_token[0] == "0":
        day_text = day_token[1]
    else:
        day_text = day_token.replace(",", "")
    return f"{year_text}-{month_text}-{day_text}"
class Generator:
    """Parse the tracking page HTML into per-day records.

    ``proceed_data`` maps a date key produced by ``real_time_converter``
    (e.g. "2022-3-26") to a dict with the keys ``date``, ``confirmed_new``,
    ``confirmed_current``, ``asymptomatic_new``, ``asymptomatic_current``,
    ``recoveries`` and ``deaths_new``.
    """

    def __init__(self, original_html, year=2022, date_format="default"):
        """Build ``proceed_data`` from ``original_html``.

        Args:
            original_html: HTML text of the tracking page.
            year: Data year; only 2022 is accepted.
            date_format: "default" keeps the page's date text as-is,
                "chinese" renders it as e.g. "2022年3月26日".

        Raises:
            ValueError: If ``year`` is not 2022, or if ``date_format`` is
                unknown (checked while records are being parsed).
        """
        self.proceed_data = {}
        if year != 2022:
            raise ValueError("'year' 参数仅为 2022")

        # The first 11 <p> elements are skipped (presumably page header
        # text -- confirm against the live page if parsing breaks).
        paragraphs = BeautifulSoup(original_html, features="lxml").find_all('p')[11:]

        # Only paragraphs before the first "National Health Commission
        # Update on" marker are parsed; if the marker is missing nothing is.
        marker_index = 0
        for index, paragraph in enumerate(paragraphs):
            if "National Health Commission Update on" in paragraph.text:
                marker_index = index
                break

        texts = [p.text for p in paragraphs[:marker_index] if p.text != ""]

        # Each record is five consecutive non-empty paragraphs:
        # part[1] is the date, part[2..4] are the figure sentences.
        for start in range(0, len(texts), 5):
            part = texts[start:start + 5]
            key_time = real_time_converter(part[1])
            confirmed = part[2].split()
            asymptomatic = part[3].split()
            outcome = part[4].split()
            self.proceed_data[key_time] = {
                "date": self._format_date(part[1], date_format),
                "confirmed_new": self._to_int(confirmed[1]),
                "confirmed_current": self._to_int(confirmed[3]),
                "asymptomatic_new": self._to_int(asymptomatic[1]),
                "asymptomatic_current": self._to_int(asymptomatic[3]),
                "recoveries": self._to_int(outcome[1]),
                "deaths_new": self._to_int(outcome[4]),
            }

        # Patch: the page reports 0 for this figure on 2022-3-26.  Apply it
        # only when that day was actually parsed; an unconditional write
        # raised KeyError for data sets that do not contain the date.
        patched_day = self.proceed_data.get("2022-3-26")
        if patched_day is not None:
            patched_day["confirmed_current"] = 27312

    @staticmethod
    def _to_int(token):
        """Convert a number token such as "27,312" to an int."""
        return int(token.replace(",", ""))

    @staticmethod
    def _format_date(raw_date, date_format):
        """Return ``raw_date`` in the requested ``date_format``."""
        if date_format == "default":
            return raw_date
        if date_format == "chinese":
            pieces = raw_date.split(" ")
            month = months_converter[pieces[0]]
            day = pieces[1].replace(",", "")
            if day[0] == "0":
                day = day[1]
            return f"{pieces[2]}年{month}月{day}日"
        raise ValueError("'date_format' parameter must be 'default' or 'chinese'")

    def get_proceed_data(self):
        """Return the full mapping of date key to record."""
        return self.proceed_data

    def get_proceed_data_sequence(self, data_type):
        """Return ``[{date_key: value}, ...]`` for one field.

        The list is in reverse insertion order of ``proceed_data``.
        """
        return [
            {day: record[data_type]}
            for day, record in reversed(list(self.proceed_data.items()))
        ]
三、render模块
import os
import pygal
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pandas_bokeh as pb
# Internal data-type key -> Chinese label used in chart titles and legends.
chinese_translation = {
    "confirmed_new": "新增感染",
    "confirmed_current": "当前感染",
    "asymptomatic_new": "新增无症状",
    "asymptomatic_current": "当前无症状",
    "recoveries": "新增治愈",
    "deaths_new": "新增死亡",
}


class Pygal_render:
    """Render a data sequence as a pygal chart."""

    def __init__(self, data_sequence, data_type, open_file=False, chn_trans=False):
        """Store the data and prepare an empty pygal line chart.

        Args:
            data_sequence: List of single-entry dicts ``{date_key: value}``.
            data_type: Name of the series; a key of ``chinese_translation``
                when ``chn_trans`` is true.
            open_file: Open the rendered file after writing it.
            chn_trans: Replace ``data_type`` by its Chinese label.
        """
        self.data_type = data_type
        self.data_sequence = data_sequence
        self.open_file = open_file
        self.chn_trans = chn_trans
        # The original check was inverted: it called mkdir only when the
        # directory already existed, which raises FileExistsError.
        os.makedirs("rendered_data", exist_ok=True)
        if chn_trans:
            self.data_type = chinese_translation[self.data_type]
        self.data_sequence_values = [next(iter(each.values())) for each in data_sequence]
        self.pygal_data_view = pygal.Line()

    def output_as_line(self, filename="pygal_line.svg", window_title=None):
        """Write the line chart to ``filename`` and optionally open it.

        Args:
            filename: Output path of the SVG file.
            window_title: Chart title; a default title is used when empty.
        """
        default_title = f"有关{self.data_type}的数据(折线图)[Pygal]"
        self.pygal_data_view.title = window_title if window_title else default_title
        # Plot the numeric values.  Passing the list of {date: value} dicts
        # gives pygal no "value" entry to draw, so nothing was plotted.
        self.pygal_data_view.add(self.data_type, self.data_sequence_values)
        self.pygal_data_view.render_to_file(filename)
        if self.open_file:
            if hasattr(os, "startfile"):
                os.startfile(filename)
            else:
                # os.startfile exists only on Windows; fall back to the
                # default browser elsewhere.
                import pathlib
                import webbrowser

                webbrowser.open(pathlib.Path(filename).resolve().as_uri())
class Matplotlib_render:
    """Render a data sequence with matplotlib."""

    def __init__(self, data_sequence, data_type, open_file=False, chn_trans=False):
        """Store the data and split it into x labels and y values.

        Args:
            data_sequence: List of single-entry dicts ``{date_key: value}``.
            data_type: Name of the series; a key of ``chinese_translation``
                when ``chn_trans`` is true.
            open_file: Stored on the instance; not read by this class.
            chn_trans: Replace ``data_type`` by its Chinese label.
        """
        self.data_type = data_type
        self.data_sequence = data_sequence
        self.open_file = open_file
        self.chn_trans = chn_trans
        # The original check was inverted: it called mkdir only when the
        # directory already existed, which raises FileExistsError.
        os.makedirs("rendered_data", exist_ok=True)
        if chn_trans:
            self.data_type = chinese_translation[self.data_type]
        self.data_sequence_values = [next(iter(each.values())) for each in data_sequence]
        # Drop the first five characters of each key (the "2022-" prefix,
        # assuming keys look like "2022-3-26").
        self.data_sequence_keys = [next(iter(each.keys()))[5:] for each in data_sequence]

    def output_as_line(self, filename="matplotlib_line.png", window_title=None):
        """Plot the line chart, save it to ``filename``, then show it.

        Args:
            filename: Output path of the image.
            window_title: Plot title; a default title is used when empty.
        """
        default_title = f"有关{self.data_type}的数据(折线图)[Matplotlib]"
        # The conditional used to wrap the whole plt.title(...) call, so a
        # custom title was evaluated and discarded instead of being set.
        plt.title(window_title if window_title else default_title)
        plt.plot(self.data_sequence_keys, self.data_sequence_values)
        # Save before show(): once the window is closed the figure is gone
        # and savefig would write an empty image.
        plt.savefig(filename)
        plt.show()
class Pandas_render:
    """Render a data sequence with pandas_bokeh."""

    def __init__(self, data_sequence, data_type, open_file=False, chn_trans=False):
        """Store the data and split it into x labels and y values.

        Args:
            data_sequence: List of single-entry dicts ``{date_key: value}``.
            data_type: Name of the series; a key of ``chinese_translation``
                when ``chn_trans`` is true.
            open_file: Stored on the instance; not read by this class.
            chn_trans: Replace ``data_type`` by its Chinese label.
        """
        self.data_type = data_type
        self.data_sequence = data_sequence
        self.open_file = open_file
        self.chn_trans = chn_trans
        # The original check was inverted: it called mkdir only when the
        # directory already existed, which raises FileExistsError.
        os.makedirs("rendered_data", exist_ok=True)
        if chn_trans:
            self.data_type = chinese_translation[self.data_type]
        self.data_sequence_values = [next(iter(each.values())) for each in data_sequence]
        # Drop the first five characters of each key (the "2022-" prefix,
        # assuming keys look like "2022-3-26").
        self.data_sequence_keys = [next(iter(each.keys()))[5:] for each in data_sequence]

    def output_as_line(self, filename="panda_line.html", window_title=None, ylim=None):
        """Plot the line chart into the HTML file ``filename``.

        Args:
            filename: Output path of the HTML file.
            window_title: Plot title; a default title is used when empty.
            ylim: Optional ``(low, high)`` y-axis range.  Left to the
                plotting library when omitted; the former fixed range of
                (5000, 20000) cut off every series outside that band.
        """
        df = pd.DataFrame({"年份": self.data_sequence_keys, self.data_type: self.data_sequence_values})
        default_title = f"有关{self.data_type}的数据(折线图)[Pandas_bokeh]"
        # The output file must be registered before plotting; calling it
        # afterwards has no effect on the plot that was already produced.
        pb.output_file(filename)
        plot_options = {
            "x": '年份',
            "y": self.data_type,
            # Axis labels were swapped relative to the plotted columns.
            "xlabel": '年份',
            "ylabel": self.data_type,
            "title": window_title if window_title else default_title,
            "figsize": (800, 500),
        }
        if ylim is not None:
            plot_options["ylim"] = ylim
        df.plot_bokeh.line(**plot_options)
四、数据更新模块 update
import time
import requests
import os
class UpdaterError(Exception):
    """Error raised by Updater for an unsupported year or when no data can be obtained."""
class Updater:
    """Download the tracking page and keep a local backup copy of it."""

    def __init__(self, year):
        """Select the data URL for ``year``.

        Raises:
            UpdaterError: For any year other than 2022.
        """
        self.data_url_2022 = 'https://weekly.chinacdc.cn/news/TrackingtheEpidemic.htm'
        self.data_url_2021 = 'https://weekly.chinacdc.cn/news/TrackingtheEpidemic2021.htm'
        self.data_url_2020 = 'https://weekly.chinacdc.cn/news/TrackingtheEpidemic2020.htm'
        self.year = year
        # One of "online", "offline" or "backup"; describes where the most
        # recently returned HTML came from.
        self.connection_status = "backup"
        if self.year == 2022:
            self.url = self.data_url_2022
        elif self.year == 2021:
            # The 2021 URL is defined above but not enabled yet.
            raise UpdaterError('暂无2021年数据')
        elif self.year == 2020:
            # The 2020 URL is defined above but not enabled yet.
            raise UpdaterError('暂无2020年数据')
        else:
            raise UpdaterError('请输入正确的年份(2022)')

    def _backup_path(self):
        """Return the path of the backup file for the selected year."""
        return f"data_backup/Tracking the Epidemic ({self.year}).html"

    def _read_backup(self):
        """Return ``(html, status, backup_time)`` or None if no backup exists."""
        path = self._backup_path()
        if not os.path.exists(path):
            return None
        with open(path, "r", encoding="utf-8") as backup_file:
            html = backup_file.read()
        return html, self.connection_status, time.ctime(os.path.getmtime(path))

    def download_html(self, timeout=30):
        """Download the page, back it up, and fall back to the backup on failure.

        Args:
            timeout: Seconds to wait for the server before giving up.

        Returns:
            ``(html, "online")`` after a successful download, otherwise
            ``(html, "offline", backup_time)`` read from the backup file.

        Raises:
            UpdaterError: If the download fails and no backup exists.
        """
        try:
            response = requests.get(self.url, timeout=timeout)
            # Treat HTTP error pages as failures so they never overwrite a
            # good backup.
            response.raise_for_status()
        except (requests.exceptions.RequestException, ConnectionError) as err:
            # requests raises its own exception hierarchy, which the
            # built-in ConnectionError alone does not catch.  The status is
            # set before reading the backup so the reply reports "offline".
            self.connection_status = "offline"
            reply = self._read_backup()
            if not reply:
                raise UpdaterError('网络连接错误,且没有备份,可能是连接次数过多或网络不好导致的') from err
            return reply

        original_html = response.text
        self.connection_status = "online"
        os.makedirs("data_backup", exist_ok=True)
        with open(self._backup_path(), "w", encoding="utf-8") as backup_file:
            backup_file.write(original_html)
        return original_html, self.connection_status

    def use_backup_html(self):
        """Return ``(html, "backup", backup_time)`` from the backup file.

        Returns None when no backup file exists for the selected year.
        """
        # Reset the status so an earlier download cannot leak a stale
        # "online"/"offline" value into this reply.
        self.connection_status = "backup"
        return self._read_backup()
五、数据分析模块(待续)
未来实现按省份可视化;
热点图
回归分析等
TrackingCOVID
施工中...
实时追踪中国疫情数据
爬虫、数据分析练手项目
架构
程序入口 (main.py)
数据更新器 (Updater.py)
相关库: requests
类: Updater
传参: year
数据年份(仅支持2022)
函数 download_html()
作用: 下载原HTML文件,并存储至data_backup
目录下
返回结果: (original_html, status)
original_html
: 原HTML文件
status
: 获取数据的状态 online: 在线更新 | offline: 因为离线而使用备份
函数 use_backup_html()
作用: 使用备份的HTML文件
返回结果: (original_html, status, backup_time)
original_html
: 原HTML文件
status
: 获取数据的状态 backup: 使用了备份
backup_time
: HTML文件的备份时间
数据处理器 (Generator.py)
相关库: beautifulsoup4
类: Generator
传参:
original_html
: 原始HTML文本
year
: 数据年份 目前仅有2022年
date_format
: 日期格式 默认:default: January 1, 2022 | chinese: 2022年1月1日
函数 get_proceed_data(self)
作用: 获取处理后的数据
返回结果: proceed_data [dict]
函数 get_proceed_data_sequence(self, data_type)
作用: 筛选字典中的特定数据
data_type
: 指定数据类型 参数: confirmed_new confirmed_current asymptomatic_new asymptomatic_current recoveries deaths_new
返回结果: proceed_data_sequence [list]
数据分析器 (Analyzer.py)
摸了
数据渲染器 (Renderer.py)
函数 output_as_line(data_sequence, data_type, lib, filename, open_file, chn_trans)
作用: 将处理后的数据序列和指定的数据类型生成为折线图
data_sequence
: 处理后的数据序列
data_type
: 数据的类型
lib[optional]
: 使用的库 默认: pygal
| 支持: pygal
, matplotlib
, pandas_bokeh
filename[optional]
: 导出的文件名 默认:output.svg
| 后缀为.svg
open_file[optional]
: 是否生成数据图后打开文件 默认:False
chn_trans[optional]
: 是否翻译数据类型为中文 默认:False
目标
- 基本架构完成 基本能用 Processing
- 支持2021和2020年的数据 Planning
- 地区支持 Planning
- 多语言支持 Planning
- 日志记录 Planning
- 全GUI支持 Planning
数据更新器
- 类化 Finished
- 可直接使用备份数据 Finished
- 显示更新所需时间 Planning
数据处理器
- 类化 Processing
- 完善原数据的数据类型 Planning
数据分析器
- 趋势分析 Planning
数据渲染器
- 类化 Planning
- 多库数据渲染 pygal, matplotlib, pandas_bokeh Basically Finished
- 渲染参数设置 Processing
已知问题
- 数据网站在2022年3月26日的
confirmed_current
为0 |已解决|
替代数据来源(已不可访问): 中华人民共和国国家卫生健康委员会 截至3月25日24时新型冠状病毒肺炎疫情最新情况
网友评论