用 Python 爬取中国天气网数据

作者: 顾四秋 | 来源:发表于2017-12-15 17:22 被阅读0次

注意:本文用到 requests、bs4、json 等几个库,请先安装好再运行。

import json
import time
from urllib.parse import urlparse

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup


# Entry page (North China region, 华北) of the text-forecast site; crawling
# starts here.  The original line used `-` instead of `=` — a SyntaxError.
baseUrl = 'http://www.weather.com.cn/textFC/hb.shtml'
# Accumulates {"city": ..., "min": ...} records scraped from every region page.
TEMPTATURE_LIST = []

def get_html(url):
    """
    Fetch a page via HTTP GET.

    :param url: absolute URL to request
    :return: raw response body (bytes) on HTTP 200, otherwise None
    """
    headers = {
        # Pretend to be a desktop Chrome browser so the site serves the page.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
        # Header values must be strings, not ints (the original used int 1);
        # the original dict was also missing the commas between entries.
        'Upgrade-Insecure-Requests': '1',
        'Referer': 'http://www.weather.com.cn/textFC/hb.shtml',
        'Host': 'www.weather.com.cn'
    }
    # The original assigned the result to a local named `requests`, shadowing
    # the module, and passed `headers` positionally (that slot is `params`).
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content
    print(response)
    return None

def get_host(url):
    """
    Return the scheme + host part of *url* (e.g. 'http://www.weather.com.cn'),
    used to turn relative hrefs into absolute request URLs.

    :param url: full URL
    :return: '<scheme>://<netloc>' with no trailing slash
    """
    parsed = urlparse(url)
    # Rebuild from components instead of `url.split(parsed.path)[0]`:
    # splitting on an empty path raises ValueError, and a path substring that
    # also occurs earlier in the URL would split at the wrong place.
    return f'{parsed.scheme}://{parsed.netloc}'

def get_urls(html, url):
    """
    Collect the per-region forecast URLs from the region tab bar.

    :param html: raw HTML of a forecast page
    :param url: URL the HTML was fetched from (excluded from the result)
    :return: list of absolute URLs for the other regions
    """
    host = get_host(url)

    soup = BeautifulSoup(html, 'lxml')
    tab_bar = soup.find(name='ul', attrs={"class": "lq_contentboxTab2"})
    # Resolve every tab link against the host, keeping all but the page itself.
    candidates = (host + anchor.attrs["href"] for anchor in tab_bar.find_all("a"))
    return [link for link in candidates if link != url]

def get_temperatures(html):
    """
    Parse one region page and append {"city", "min"} records to TEMPTATURE_LIST.

    :param html: raw HTML of a region forecast page (may be None on fetch failure)
    """
    if html is None:
        print("网页内容为空!!")
        return
    soup = BeautifulSoup(html, 'lxml')
    province = None
    # 'conMidtab' holds today's forecast block.
    conMidtab = soup.find("div", attrs={'class': 'conMidtab'})
    # One inner div per province.
    conMidtab2_list = conMidtab.find_all('div')
    for conMidtab2 in conMidtab2_list:
        # City rows are <tr> elements — the original searched for 'div' again,
        # so `tr.find_all('td')` never matched any table cells.
        tr_list = conMidtab2.find_all('tr')
        for index, tr in enumerate(tr_list):
            td_list = tr.find_all('td')
            if index == 0:
                # A province's first row carries the province name in column 0,
                # shifting the city/min-temperature columns right by one.
                province = td_list[0].text.replace('\n', '')
                city = province + td_list[1].text.replace("\n", "")
                min_temp = td_list[7].text.replace("\n", "")
            else:
                city = province + td_list[0].text.replace("\n", "")
                min_temp = td_list[6].text.replace("\n", "")
            # Record every row — the original appended only inside the else
            # branch, silently dropping each province's first city.
            TEMPTATURE_LIST.append({"city": city, "min": min_temp})
    print("一次分析结束")


def get_gat_temperatures(url):
    """
    Scrape Hong Kong / Macao / Taiwan temperatures.

    The 港澳台 page builds part of its table with JavaScript, so the generic
    parser cannot handle it; instead the known city rows are picked out of the
    static <tr> elements directly.

    :param url: URL of the 港澳台 region page
    """
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    tr_list = soup.find_all('tr')

    # Cities to extract, in page order.  The original named this `get_list`
    # but referenced `gat_list` everywhere, raising NameError.
    gat_list = ['香港', '澳门', '台北', '高雄', '台中']

    index = 0
    province = None
    for tr in tr_list:
        # Hong Kong, Macao, Taipei (index 0..2): the row starts with the
        # region name, shifting the columns right by one.
        if index < 3 and tr.text.find(gat_list[index]) > -1:
            td_list = tr.find_all('td')
            province = td_list[0].text.replace("\n", "")
            city = province + td_list[1].text.replace("\n", "")
            min_temp = td_list[7].text.replace("\n", "")
            index += 1
            TEMPTATURE_LIST.append({"city": city, "min": min_temp})
        # Remaining Taiwan cities: province column omitted, reuse the last one.
        # The original wrote `else <condition>:` — a SyntaxError; `elif` is
        # what was intended.
        elif 2 < index < len(gat_list) and tr.text.find(gat_list[index]) > -1:
            td_list = tr.find_all("td")
            city = province + td_list[0].text.replace("\n", "")
            min_temp = td_list[6].text.replace("\n", "")
            index += 1
            TEMPTATURE_LIST.append({"city": city, "min": min_temp})
    print('港澳台分析结束')

def spide_temperature():
    """
    Crawl every region page, collect all temperatures into TEMPTATURE_LIST,
    and persist the result to temprature.json.
    """
    html = get_html(baseUrl)
    if html is None:
        print('请求失败')
        return

    get_temperatures(html)

    urls = get_urls(html, baseUrl)

    # The last tab is 港澳台, which needs its dedicated parser.  The original
    # passed `url[-1]` — an undefined name — instead of `urls[-1]`.
    get_gat_temperatures(urls[-1])

    # All regular regions: everything except the final 港澳台 entry (the
    # original sliced `[:-2]`, skipping one regular region entirely).
    for url in urls[:-1]:
        # Be polite to the server between requests.
        time.sleep(2)
        content = get_html(url)
        get_temperatures(content)

    # `with`, not `while` — the original was a SyntaxError; the dumped name
    # was also misspelled (`EMPTATURE_LIST`).
    with open("temprature.json", 'w', encoding="utf-8") as fp:
        json.dump(TEMPTATURE_LIST, fp)

def show_temperature():
    """
    Load the scraped temperatures from temprature.json and plot a bar chart
    of 20 randomly sampled cities' minimum temperatures.
    """
    # json.load() no longer accepts an `encoding` argument (removed in
    # Python 3.9); declare the file encoding on open() instead.
    with open("temprature.json", "r", encoding="utf-8") as fp:
        temperatures = json.load(fp)

    city_list = []  # sampled city names
    temp_list = []  # their minimum temperatures (the scraper stores "min")
    for _ in range(20):
        pick = temperatures[np.random.randint(0, len(temperatures))]
        city_list.append(pick["city"])
        temp_list.append(int(pick["min"]))
    ind = np.arange(len(temp_list))
    print(ind)
    print(city_list)
    print(temp_list)

    # Use a CJK font so Chinese labels render instead of boxes.  Raw string so
    # the backslashes in the Windows path are not treated as escapes.
    zhfont1 = matplotlib.font_manager.FontProperties(fname=r'C:\Windows\Fonts\simsun.ttc')

    fig, ax = plt.subplots()
    plt.bar(ind, temp_list)
    plt.xticks(ind, city_list, fontproperties=zhfont1, rotation=60)
    plt.ylabel(u'温度', fontproperties=zhfont1)
    plt.title(u'今日随机20个城市的温度', fontproperties=zhfont1)

    # show the figure, but do not block
    plt.show()

def main():
    """Crawl all region pages, then visualize a random sample of cities."""
    spide_temperature()
    show_temperature()


if __name__ == "__main__":
    main()

相关文章

网友评论

    本文标题:python 用python爬取中国天气

    本文链接:https://www.haomeiwen.com/subject/womdwxtx.html