美文网首页
python 爬取中国天气网降水量数据(动态网页)

python 爬取中国天气网降水量数据(动态网页)

作者: 郭彦超 | 来源:发表于2021-04-14 13:17 被阅读0次

    整体思路: 先获取省份列表,根据省份获取城市详情页地址,在详情页通过动态网页解析技术拿到降水量,将数据汇总到省份 并进行排序等后续操作

    依赖

    selenium用来解析动态网页,运行下面程序需要提前在本机上安装对应浏览器的驱动,建议使用chrome
    安装方法

     
    import requests
    #BeautifulSoup 用来解析html
    from bs4 import  BeautifulSoup as bf
    import time
    from selenium import webdriver
    from matplotlib import pyplot as plt
    #numpy 是数据处理与科学计算常用工具包
    import numpy as np
    #解决中文乱码问题
    plt.rcParams['font.sans-serif']=['SimHei']
    
    def getAllProviceUrl(url):
        """Fetch a region overview page and map each province to its city URLs.

        Parameters
        ----------
        url : str
            URL of a region text-forecast page on www.weather.com.cn.

        Returns
        -------
        dict[str, list[str]]
            Province name -> list of city detail-page URLs.
        """
        url_dic = dict()
        result = requests.get(url)
        soup = bf(result.content, 'html.parser')
        # Provinces are grouped under div.conMidtab > div.conMidtab2 on this page.
        div_conMidTab = soup.find("div", attrs={'class': 'conMidtab'})
        div_conMidTab2 = div_conMidTab.find_all("div", attrs={'class': 'conMidtab2'})
        for x in div_conMidTab2:
            a_ls = x.find_all('a', attrs={'target': '_blank'})
            if not a_ls:
                # Guard against a malformed/empty group: a_ls[0] below would
                # otherwise raise IndexError and abort the whole crawl.
                continue
            # The first link holds the province name; "详情" links point to
            # the per-city detail pages.
            province = a_ls[0].text
            url_dic[province] = [v['href'] for v in a_ls if v.text=='详情']
        time.sleep(3)  # pause after each region to be polite to the server
        return url_dic
    
    
    def getRainNum(driver, url):
        """Load a city detail page with *driver* and return its rainfall in mm.

        The detail page renders its data with JavaScript, so it must be
        fetched through a selenium driver rather than a plain HTTP request.
        """
        driver.get(url)
        page = bf(driver.page_source, 'html.parser')
        # CSS selector located with the browser devtools (F12 / inspect).
        rain_text = page.select('div.split > p.rain')[0].get_text()
        # Text has the shape "...: <value>mm" -- extract the numeric part.
        value = float(rain_text.split(':')[1].split('mm')[0])
        print(value, url)
        return value
    
    def show(province_rain_ls):
        """Render a bar chart of rainfall per province.

        Parameters
        ----------
        province_rain_ls : list[tuple[str, float]]
            (province, rainfall) pairs, typically already sorted.
        """
        plt.figure(dpi=800)
        positions = range(len(province_rain_ls))
        labels = [k[0] for k in province_rain_ls]
        # Set ticks, labels and rotation in a single call: a second bare
        # plt.xticks(...) call would discard the rotation applied earlier.
        plt.xticks(positions, labels, rotation=270)
        plt.bar(positions, [k[1] for k in province_rain_ls])
        plt.show()
        
    def sort(province_rain, reverse=True):
        """Return (province, rainfall) pairs ordered by rainfall.

        By default the wettest province comes first; pass ``reverse=False``
        for ascending order.
        """
        def rainfall(pair):
            return pair[1]

        return sorted(province_rain, key=rainfall, reverse=reverse)
    
     
    def printDetail(province_rain_ls):
        """Print summary stats: wettest, driest, mean and median rainfall."""
        ranked = sort(province_rain_ls)          # descending by rainfall
        amounts = [pair[1] for pair in province_rain_ls]
        # Wettest is the first ranked entry, driest the last; mean and
        # median are taken over all provinces.
        print(ranked[0], ranked[-1], np.average(amounts), np.median(amounts))
    
    def crawlerData():
        """Crawl www.weather.com.cn and return total rainfall per province.

        Returns
        -------
        dict[str, float]
            Province name -> sum of rainfall (mm) over its cities.
        """
        result = requests.get('http://www.weather.com.cn/textFC/hb.shtml')
        obj = bf(result.content, 'html.parser')
        # The tab list links to one overview page per region; the final
        # entry is not a region link, so drop it.
        li_list = obj.find("ul", attrs={'class': 'lq_contentboxTab2'}).find_all('li')[0:-1]

        # Configure a headless Chrome instance for the dynamic detail pages.
        options = webdriver.ChromeOptions()
        options.add_argument('headless')  # do not open a visible browser window
        options.add_experimental_option("excludeSwitches", ['enable-automation'])

        province_rain = dict()
        # Reuse a single driver for every page: launching a fresh Chrome per
        # city URL is very slow and, when a page raises before quit() is
        # reached, leaks a browser process.
        driver = webdriver.Chrome(chrome_options=options, executable_path="C:/Users/bigdata/Downloads/chromedriver_win32/chromedriver.exe")
        try:
            for li in li_list:
                url = 'http://www.weather.com.cn/' + li.find('a')['href']
                url_data = getAllProviceUrl(url)  # province -> city detail URLs
                for province, urls in url_data.items():
                    ls = []
                    for city_url in urls:
                        try:
                            ls.append(getRainNum(driver, city_url))
                        except Exception as e:
                            # Log and continue with the next city; catching
                            # BaseException would also swallow
                            # KeyboardInterrupt/SystemExit.
                            print('error:', city_url, e)
                    province_rain[province] = sum(ls)
                print(province_rain)
        finally:
            driver.quit()  # always release the browser process
        return province_rain
    
    def main():
        """Crawl rainfall data, then chart and print summary statistics."""
        province_rain = crawlerData()
        ranked = sort(province_rain.items())
        show(ranked)
        printDetail(ranked)

    # Guard the entry point so importing this module does not start a crawl.
    if __name__ == "__main__":
        main()
    

    相关文章

      网友评论

          本文标题:python 爬取中国天气网降水量数据(动态网页)

          本文链接:https://www.haomeiwen.com/subject/rkbklltx.html