# NOTE: this script requires third-party packages: requests, beautifulsoup4 (bs4),
# lxml, and matplotlib — install them before running.
import json
import time
from urllib.parse import urlparse
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup
# Entry page: the North China ("hb") region of the national text forecast.
# Bug fix: the original line read `baseUrl-'...'` (a minus sign where `=` belongs),
# which is a syntax error.
baseUrl = 'http://www.weather.com.cn/textFC/hb.shtml'
# Accumulates {"city": ..., "min": ...} dicts across all crawled pages.
# (The "TEMPTATURE" misspelling is kept because every other function uses it.)
TEMPTATURE_LIST = []
def get_html(url):
    """
    Fetch a page via HTTP GET and return its raw body.

    :param url: absolute URL to request
    :return: response body as bytes, or None on a non-200 status
    """
    headers = {
        # Desktop Chrome UA so the site serves the normal HTML page.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
        # Bug fix: the original dict was missing commas between entries,
        # and header values must be strings, not ints.
        'Upgrade-Insecure-Requests': '1',
        'Referer': 'http://www.weather.com.cn/textFC/hb.shtml',
        'Host': 'www.weather.com.cn',
    }
    # Bug fixes: the original assigned the result to a local named `requests`
    # (shadowing the module, so a second call would crash) and then read an
    # undefined `response`. Also pass headers via the keyword argument —
    # the second positional parameter of requests.get() is `params`, not headers.
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content
    print(response)
    return None
def get_host(url):
    """
    Return the scheme+host part of *url* (e.g. 'http://www.weather.com.cn'),
    used to turn the site's relative hrefs into absolute URLs.

    :param url: any absolute URL
    :return: '<scheme>://<netloc>' with no trailing slash
    """
    parts = urlparse(url)
    # Bug fix: the original did url.split(parse.path)[0], which raises
    # ValueError("empty separator") when the URL has no path, and can split
    # in the wrong place if the path text also occurs in the host. Rebuilding
    # from the parsed components is unambiguous.
    return f"{parts.scheme}://{parts.netloc}"
def get_urls(html, url):
    """
    Collect the per-region forecast page URLs from the region tab bar,
    excluding the page we are currently on.

    :param html: raw HTML of the current page
    :param url: the URL that *html* was fetched from
    :return: list of absolute URLs for the other regions
    """
    base = get_host(url)
    document = BeautifulSoup(html, 'lxml')
    tab_bar = document.find(name='ul', attrs={"class": "lq_contentboxTab2"})
    candidates = [base + anchor.attrs["href"] for anchor in tab_bar.find_all("a")]
    # Drop the link that points back to the page we already scraped.
    return [candidate for candidate in candidates if candidate != url]
def get_temperatures(html):
    """
    Parse one region page and append a {"city": ..., "min": ...} dict for
    every city row to the global TEMPTATURE_LIST.

    :param html: raw HTML of a region page, or None (skipped with a message)
    """
    if html is None:
        print("网页内容为空!!")
        return
    soup = BeautifulSoup(html, 'lxml')
    province = None
    # Container for today's forecast tables.
    conMidtab = soup.find("div", attrs={'class': 'conMidtab'})
    # One inner div per province.
    conMidtab2_list = conMidtab.find_all('div')
    for conMidtab2 in conMidtab2_list:
        # Bug fix: city rows are <tr> table rows (their cells are read with
        # find_all('td') below); the original called find_all('div') here even
        # though the variable was already named tr_list.
        tr_list = conMidtab2.find_all('tr')
        for index, tr in enumerate(tr_list):
            td_list = tr.find_all('td')
            if index == 0:
                # The first row carries the province name in an extra leading
                # cell, so city/min-temp columns sit one position further right.
                province = td_list[0].text.replace('\n', '')
                city = province + td_list[1].text.replace("\n", "")
                min_temp = td_list[7].text.replace("\n", "")
            else:
                city = province + td_list[0].text.replace("\n", "")
                min_temp = td_list[6].text.replace("\n", "")
            TEMPTATURE_LIST.append({"city": city, "min": min_temp})
    print("一次分析结束")
def get_gat_temperatures(url):
    """
    Scrape the Hong Kong / Macau / Taiwan page. Its layout differs from the
    mainland region pages (the full table needs page JS elsewhere), so the
    five known cities are matched by name, in order, against the table rows.

    Bug fixes vs. the original listing: the name list was bound to `get_list`
    but read everywhere as `gat_list` (NameError), and the second branch used
    `else <condition>:` — a syntax error — where `elif` was intended.

    :param url: URL of the HK/Macau/Taiwan forecast page
    """
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    tr_list = soup.find_all('tr')
    gat_list = ['香港', '澳门', '台北', '高雄', '台中']
    index = 0
    province = None  # set by the first three matches; reused for Taiwan rows
    for tr in tr_list:
        # Hong Kong, Macau and Taipei rows start with a region cell
        # (the original spelled this `index < 2 or index == 2`).
        if index <= 2 and tr.text.find(gat_list[index]) > -1:
            td_list = tr.find_all('td')
            province = td_list[0].text.replace("\n", "")
            city = province + td_list[1].text.replace("\n", "")
            min_temp = td_list[7].text.replace("\n", "")
            index += 1
            TEMPTATURE_LIST.append({"city": city, "min": min_temp})
        elif 2 < index < len(gat_list) and tr.text.find(gat_list[index]) > -1:
            # Remaining Taiwan cities: no region cell, reuse the last province.
            td_list = tr.find_all("td")
            city = province + td_list[0].text.replace("\n", "")
            min_temp = td_list[6].text.replace("\n", "")
            index += 1
            TEMPTATURE_LIST.append({"city": city, "min": min_temp})
    print('港澳台分析结束')
def spide_temperature():
    """
    Crawl every region page, collect temperatures into TEMPTATURE_LIST,
    then persist the list to temprature.json.
    """
    html = get_html(baseUrl)
    if html is None:
        print('请求失败')
        return
    get_temperatures(html)
    urls = get_urls(html, baseUrl)
    # The last URL is the HK/Macau/Taiwan page, which needs special parsing.
    # Bug fix: the original indexed the undefined singular name `url` here.
    get_gat_temperatures(urls[-1])
    # NOTE(review): this skips the last TWO urls even though only the last one
    # is handled specially above — confirm whether urls[:-1] was intended.
    for url in urls[:-2]:
        time.sleep(2)  # throttle so we don't hammer the server
        content = get_html(url)
        get_temperatures(content)
    # Bug fixes: `while open(...)` -> `with open(...)`, and the dumped name was
    # misspelled `EMPTATURE_LIST`. ensure_ascii=False keeps the Chinese city
    # names human-readable in the JSON file.
    with open("temprature.json", 'w', encoding="utf-8") as fp:
        json.dump(TEMPTATURE_LIST, fp, ensure_ascii=False)
def show_temperature():
    """
    Load the saved temperatures and draw a bar chart of 20 randomly chosen
    cities' minimum temperatures.
    """
    # Bug fix: json.load() lost its `encoding` parameter in Python 3.9
    # (it was ignored/deprecated since 3.1); declare the encoding on open().
    with open("temprature.json", "r", encoding="utf-8") as fp:
        temperature_list = json.load(fp)
    city_list = []  # x-axis labels
    temp_list = []  # minimum temperature per sampled city
    for _ in range(20):
        entry = temperature_list[np.random.randint(0, len(temperature_list))]
        city_list.append(entry["city"])
        temp_list.append(int(entry["min"]))
    ind = np.arange(len(temp_list))
    print(ind)
    print(city_list)
    print(temp_list)
    # Use a CJK font so the Chinese labels render instead of empty boxes.
    # Bug fix: raw string for the Windows path — in a plain literal, sequences
    # like '\W' only work by accident and break if the path changes.
    zhfont1 = matplotlib.font_manager.FontProperties(fname=r'C:\Windows\Fonts\simsun.ttc')
    fig, ax = plt.subplots()
    plt.bar(ind, temp_list)
    plt.xticks(ind, city_list, fontproperties=zhfont1, rotation=60)
    plt.ylabel(u'温度', fontproperties=zhfont1)
    plt.title(u'今日随机20个城市的温度', fontproperties=zhfont1)
    plt.show()
def main():
    """Crawl the temperature data, then visualize a random sample of it."""
    spide_temperature()
    show_temperature()


if __name__ == "__main__":
    main()
# (End of script — trailing "网友评论" text was residue from the blog page this was copied from.)