# NOTE: this script requires third-party packages: requests, beautifulsoup4 (bs4),
# lxml, and matplotlib — install them before running.
import json
import time
from urllib.parse import urlparse
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup
# Entry page: the North China ("hb") region of the national text forecast.
# Bug fix: the original line read `baseUrl-'...'` (a minus sign where `=` belongs),
# which is a syntax error.
baseUrl = 'http://www.weather.com.cn/textFC/hb.shtml'
# Accumulates {"city": ..., "min": ...} dicts across all crawled pages.
# (The "TEMPTATURE" misspelling is kept because every other function uses it.)
TEMPTATURE_LIST = []
def get_html(url):
    """
    Fetch a page via HTTP GET and return its raw body.

    :param url: absolute URL to request
    :return: response body as bytes, or None on a non-200 status
    """
    headers = {
        # Desktop Chrome UA so the site serves the normal HTML page.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
        # Bug fix: the original dict was missing commas between entries,
        # and header values must be strings, not ints.
        'Upgrade-Insecure-Requests': '1',
        'Referer': 'http://www.weather.com.cn/textFC/hb.shtml',
        'Host': 'www.weather.com.cn',
    }
    # Bug fixes: the original assigned the result to a local named `requests`
    # (shadowing the module, so a second call would crash) and then read an
    # undefined `response`. Also pass headers via the keyword argument —
    # the second positional parameter of requests.get() is `params`, not headers.
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content
    print(response)
    return None
def get_host(url):
    """
    Return the scheme+host part of *url* (e.g. 'http://www.weather.com.cn'),
    used to turn the site's relative hrefs into absolute URLs.

    :param url: any absolute URL
    :return: '<scheme>://<netloc>' with no trailing slash
    """
    parts = urlparse(url)
    # Bug fix: the original did url.split(parse.path)[0], which raises
    # ValueError("empty separator") when the URL has no path, and can split
    # in the wrong place if the path text also occurs in the host. Rebuilding
    # from the parsed components is unambiguous.
    return f"{parts.scheme}://{parts.netloc}"
def get_urls(html, url):
    """
    Collect the per-region forecast page URLs from the region tab bar,
    excluding the page we are currently on.

    :param html: raw HTML of the current page
    :param url: the URL that *html* was fetched from
    :return: list of absolute URLs for the other regions
    """
    base = get_host(url)
    document = BeautifulSoup(html, 'lxml')
    tab_bar = document.find(name='ul', attrs={"class": "lq_contentboxTab2"})
    candidates = [base + anchor.attrs["href"] for anchor in tab_bar.find_all("a")]
    # Drop the link that points back to the page we already scraped.
    return [candidate for candidate in candidates if candidate != url]
def get_temperatures(html):
    """
    Parse one region page and append a {"city": ..., "min": ...} dict for
    every city row to the global TEMPTATURE_LIST.

    :param html: raw HTML of a region page, or None (skipped with a message)
    """
    if html is None:
        print("网页内容为空!!")
        return
    soup = BeautifulSoup(html, 'lxml')
    province = None
    # Container for today's forecast tables.
    conMidtab = soup.find("div", attrs={'class': 'conMidtab'})
    # One inner div per province.
    conMidtab2_list = conMidtab.find_all('div')
    for conMidtab2 in conMidtab2_list:
        # Bug fix: city rows are <tr> table rows (their cells are read with
        # find_all('td') below); the original called find_all('div') here even
        # though the variable was already named tr_list.
        tr_list = conMidtab2.find_all('tr')
        for index, tr in enumerate(tr_list):
            td_list = tr.find_all('td')
            if index == 0:
                # The first row carries the province name in an extra leading
                # cell, so city/min-temp columns sit one position further right.
                province = td_list[0].text.replace('\n', '')
                city = province + td_list[1].text.replace("\n", "")
                min_temp = td_list[7].text.replace("\n", "")
            else:
                city = province + td_list[0].text.replace("\n", "")
                min_temp = td_list[6].text.replace("\n", "")
            TEMPTATURE_LIST.append({"city": city, "min": min_temp})
    print("一次分析结束")
def get_gat_temperatures(url):
    """
    Scrape the Hong Kong / Macau / Taiwan page. Its layout differs from the
    mainland region pages (the full table needs page JS elsewhere), so the
    five known cities are matched by name, in order, against the table rows.

    Bug fixes vs. the original listing: the name list was bound to `get_list`
    but read everywhere as `gat_list` (NameError), and the second branch used
    `else <condition>:` — a syntax error — where `elif` was intended.

    :param url: URL of the HK/Macau/Taiwan forecast page
    """
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    tr_list = soup.find_all('tr')
    gat_list = ['香港', '澳门', '台北', '高雄', '台中']
    index = 0
    province = None  # set by the first three matches; reused for Taiwan rows
    for tr in tr_list:
        # Hong Kong, Macau and Taipei rows start with a region cell
        # (the original spelled this `index < 2 or index == 2`).
        if index <= 2 and tr.text.find(gat_list[index]) > -1:
            td_list = tr.find_all('td')
            province = td_list[0].text.replace("\n", "")
            city = province + td_list[1].text.replace("\n", "")
            min_temp = td_list[7].text.replace("\n", "")
            index += 1
            TEMPTATURE_LIST.append({"city": city, "min": min_temp})
        elif 2 < index < len(gat_list) and tr.text.find(gat_list[index]) > -1:
            # Remaining Taiwan cities: no region cell, reuse the last province.
            td_list = tr.find_all("td")
            city = province + td_list[0].text.replace("\n", "")
            min_temp = td_list[6].text.replace("\n", "")
            index += 1
            TEMPTATURE_LIST.append({"city": city, "min": min_temp})
    print('港澳台分析结束')
def spide_temperature():
    """
    Crawl every region page, collect temperatures into TEMPTATURE_LIST,
    then persist the list to temprature.json.
    """
    html = get_html(baseUrl)
    if html is None:
        print('请求失败')
        return
    get_temperatures(html)
    urls = get_urls(html, baseUrl)
    # The last URL is the HK/Macau/Taiwan page, which needs special parsing.
    # Bug fix: the original indexed the undefined singular name `url` here.
    get_gat_temperatures(urls[-1])
    # NOTE(review): this skips the last TWO urls even though only the last one
    # is handled specially above — confirm whether urls[:-1] was intended.
    for url in urls[:-2]:
        time.sleep(2)  # throttle so we don't hammer the server
        content = get_html(url)
        get_temperatures(content)
    # Bug fixes: `while open(...)` -> `with open(...)`, and the dumped name was
    # misspelled `EMPTATURE_LIST`. ensure_ascii=False keeps the Chinese city
    # names human-readable in the JSON file.
    with open("temprature.json", 'w', encoding="utf-8") as fp:
        json.dump(TEMPTATURE_LIST, fp, ensure_ascii=False)
def show_temperature():
    """
    Load the saved temperatures and draw a bar chart of 20 randomly chosen
    cities' minimum temperatures.
    """
    # Bug fix: json.load() lost its `encoding` parameter in Python 3.9
    # (it was ignored/deprecated since 3.1); declare the encoding on open().
    with open("temprature.json", "r", encoding="utf-8") as fp:
        temperature_list = json.load(fp)
    city_list = []  # x-axis labels
    temp_list = []  # minimum temperature per sampled city
    for _ in range(20):
        entry = temperature_list[np.random.randint(0, len(temperature_list))]
        city_list.append(entry["city"])
        temp_list.append(int(entry["min"]))
    ind = np.arange(len(temp_list))
    print(ind)
    print(city_list)
    print(temp_list)
    # Use a CJK font so the Chinese labels render instead of empty boxes.
    # Bug fix: raw string for the Windows path — in a plain literal, sequences
    # like '\W' only work by accident and break if the path changes.
    zhfont1 = matplotlib.font_manager.FontProperties(fname=r'C:\Windows\Fonts\simsun.ttc')
    fig, ax = plt.subplots()
    plt.bar(ind, temp_list)
    plt.xticks(ind, city_list, fontproperties=zhfont1, rotation=60)
    plt.ylabel(u'温度', fontproperties=zhfont1)
    plt.title(u'今日随机20个城市的温度', fontproperties=zhfont1)
    plt.show()
def main():
    """Crawl the temperature data, then visualize a random sample of it."""
    spide_temperature()
    show_temperature()


if __name__ == "__main__":
    main()
# (End of script — trailing "网友评论" text was residue from the blog page this was copied from.)