Overall approach: first fetch the list of provinces, then for each province collect the URLs of its city detail pages; on each detail page, extract the rainfall figure using dynamic-page parsing; finally aggregate the figures by province and sort them for the follow-up analysis.
Dependencies
Selenium is used to render and parse the dynamic pages. Before running the program below, the driver for your browser must be installed on the local machine; Chrome is recommended.
Installation
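The original does not spell out the install commands; a minimal setup, assuming pip and a local Chrome install, would be:

pip install selenium requests beautifulsoup4 matplotlib numpy

You also need a chromedriver binary that matches your Chrome version; its path is passed to webdriver.Chrome via executable_path in crawlerData below.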
import requests
# BeautifulSoup is used to parse the HTML
from bs4 import BeautifulSoup as bf
import time
from selenium import webdriver
from matplotlib import pyplot as plt
# numpy is a common toolkit for data processing and scientific computing
import numpy as np
# use the SimHei font so Chinese labels are not garbled in matplotlib
plt.rcParams['font.sans-serif'] = ['SimHei']
def getAllProviceUrl(url):
    url_dic = dict()
    result = requests.get(url)
    # print(result.content)
    soup = bf(result.content, 'html.parser')
    # BeautifulSoup's element-locating methods are worth practising when you have time
    div_conMidTab = soup.find("div", attrs={'class': 'conMidtab'})
    div_conMidTab2 = div_conMidTab.find_all("div", attrs={'class': 'conMidtab2'})
    for x in div_conMidTab2:
        a_ls = x.find_all('a', attrs={'target': '_blank'})
        province = a_ls[0].text
        # url_dic.setdefault(province, [])
        # keep only the links whose text is '详情' (the city detail pages)
        url_dic[province] = [v['href'] for v in a_ls if v.text == '详情']
        time.sleep(3)  # pause after each chunk so we don't put pressure on the target server
    return url_dic
def getRainNum(driver, url):
    driver.get(url)
    html_text = driver.page_source
    soup = bf(html_text, 'html.parser')
    # in Chrome, right-click > Inspect (or press F12) to locate the element to scrape
    text = soup.select('div.split > p.rain')[0].get_text()
    num = float(text.split(':')[1].split('mm')[0])
    print(num, url)
    return num
def show(province_rain_ls):
    plt.figure(dpi=800)
    # set tick labels and rotation in one call; a later plt.xticks call would reset the rotation
    plt.xticks(range(len(province_rain_ls)), [k[0] for k in province_rain_ls], rotation=270)
    plt.bar(range(len(province_rain_ls)), [k[1] for k in province_rain_ls])
    plt.show()
def sort(province_rain, reverse=True):
    return sorted(province_rain, key=lambda x: x[1], reverse=reverse)
def printDetail(province_rain_ls):
    # province with the most rain, province with the least, plus the average and median
    pro_max = sort(province_rain_ls)[0]
    pro_min = sort(province_rain_ls)[-1]
    avg = np.average([k[1] for k in province_rain_ls])
    med = np.median([k[1] for k in province_rain_ls])
    print(pro_max, pro_min, avg, med)
def crawlerData():
    result = requests.get('http://www.weather.com.cn/textFC/hb.shtml')
    obj = bf(result.content, 'html.parser')
    # find and find_all both locate elements by their position in the HTML;
    # find returns a single element, find_all returns a list
    li_list = obj.find("ul", attrs={'class': 'lq_contentboxTab2'}).find_all('li')[0:-1]
    # initialise the Selenium options
    options = webdriver.ChromeOptions()
    options.add_argument('headless')  # headless mode, no browser window pops up
    options.add_experimental_option("excludeSwitches", ['enable-automation'])
    province_rain = dict()
    # walk every region tab and collect the data
    for li in li_list:
        url = 'http://www.weather.com.cn/' + li.find('a')['href']
        url_data = getAllProviceUrl(url)  # city detail-page URLs for every province in this region
        # print(ls)
        for province, urls in url_data.items():
            ls = []
            for city_url in urls:
                # the rainfall page is rendered dynamically, so Selenium is needed to parse it;
                # for notes on installing Selenium see https://www.jianshu.com/p/39716ea15d99?utm_source=oschina-app
                try:
                    driver = webdriver.Chrome(chrome_options=options, executable_path="C:/Users/bigdata/Downloads/chromedriver_win32/chromedriver.exe")
                    ls.append(getRainNum(driver, city_url))
                    driver.quit()
                except BaseException as e:
                    print('error:', city_url, e)
            province_rain[province] = sum(ls)
        print(province_rain)
    return province_rain
# run the crawl, sort provinces by rainfall, plot the bar chart, and print summary statistics
province_rain = crawlerData()
province_rain = sort(province_rain.items())
show(province_rain)
printDetail(province_rain)