美文网首页
下载wyoming大学的探空数据

下载wyoming大学的探空数据

作者: 沐辰老爹 | 来源:发表于2018-07-09 20:56 被阅读0次

    水平有限,欢迎指正交流,共同进步!

    怀俄明大学(University of Wyoming)将全球探空站的数据共享使用,并通过计算提供了很多变量指数,非常有用.近期有这方面的需求,暂时写了一段未经优化的代码来下载.
    如果要大量下载,可以尝试代理池1或者代理池2的方式,通过代理和各种规避反爬措施来抓取.

    
    
    import os
    import datetime
    import requests
    from io import StringIO
    import numpy as np
    import pandas as pd
    from bs4 import BeautifulSoup
    import calendar
    import random
    # url = http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2018&MONTH=07&FROM=0312&TO=0312&STNM=72558
    # http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2018&MONTH=07&FROM=0312&TO=0312&STNM=54511
    
    # Pool of browser User-Agent strings; one is picked at random per request.
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    ]
    
    
    def main(start_date, end_date, sta_id=54511):
        """Download and convert soundings month by month for one station.

        One request is issued for every month whose last day lies within
        [start_date, end_date]; a month that ends after ``end_date`` is not
        included.

        Args:
            start_date: First date of the range (anything ``pd.date_range``
                accepts, e.g. ``'2000-01-01'``).
            end_date: Last date of the range.
            sta_id: Station number forwarded to ``handle_html``.
        """
        # An offset object is used instead of the 'm' alias string: it has the
        # same month-end meaning, and the string alias is deprecated in recent
        # pandas releases.
        month_ends = pd.date_range(
            start_date, end_date, freq=pd.offsets.MonthEnd())
        for month_end in month_ends:
            # Forward the requested station; previously the default station
            # was always downloaded regardless of ``sta_id``.
            handle_html(month_end.year, month_end.month, sta_id)
            print('搞定这个时间了:--->>>', month_end)
    
    
    def set_url(date, sta_id, region='seasia'):
        """Build the sounding query URL covering the whole month of ``date``.

        The query runs from day 01 00Z to the last day of the month 12Z.

        Args:
            date: Object with ``year`` and ``month`` attributes
                (e.g. ``datetime.date``); the day is ignored.
            sta_id: Station number, inserted as the ``STNM`` parameter.
            region: Value of the ``region`` query parameter.

        Returns:
            The request URL as a string.
        """
        last_day = calendar.monthrange(date.year, date.month)[1]
        # Zero-pad to two digits. The former ``{month:4s}`` spec padded the
        # month with trailing spaces, which ended up inside the query string.
        return (
            'http://weather.uwyo.edu/cgi-bin/sounding?region={region}'
            '&TYPE=TEXT%3ALIST&YEAR={year}&MONTH={month:02d}'
            '&FROM=0100&TO={day_end:02d}12&STNM={sta_id}'
        ).format(
            region=region,
            year=date.year,
            month=date.month,
            day_end=last_day,
            sta_id=sta_id,
        )
    
    
    def handle_html(year, month, sta_id=54511):
        """Fetch one month of soundings for a station and hand the page on.

        Args:
            year: Four-digit year.
            month: Month number (1-12).
            sta_id: Station number.

        A failed request (connection error, timeout, HTTP error status) is
        reported on stdout and the month is skipped, so a long multi-month
        run is not aborted by a single bad response.
        """
        now_date = datetime.date(year, month, 1)
        url = set_url(now_date, sta_id)
        header = {'User-Agent': random.choice(USER_AGENTS)}
        try:
            # Without a timeout requests waits indefinitely on a stalled
            # connection.
            response = requests.get(url, headers=header, timeout=60)
            response.raise_for_status()
        except requests.RequestException as err:
            print('request failed:', url, err)
            return
        soup_html(response.content)
    
    
    def soup_html(html_page):
        """Split a result page into individual soundings and convert each.

        Every ``<h2>`` heading is paired, in document order, with two
        ``<pre>`` blocks: the even-indexed ones are passed on as the first
        data block and the odd-indexed ones as the second.

        Args:
            html_page: Raw HTML (bytes or str) of the response.
        """
        # Name the parser explicitly so the parse tree does not depend on
        # which optional parser libraries happen to be installed.
        soup = BeautifulSoup(html_page, 'html.parser')
        titles = soup.find_all('h2')
        pre_blocks = soup.find_all('pre')
        # zip stops at the shortest sequence, so a truncated page converts the
        # complete soundings instead of raising IndexError.
        for title, first_pre, second_pre in zip(
                titles, pre_blocks[::2], pre_blocks[1::2]):
            convert_one_time(title, first_pre, second_pre)
    
    
    def convert_one_time(h1,  d1, d2, filedir=None):
        """Write one sounding to two CSV files (level table and indices).

        Args:
            h1: Heading element; ``h1.string`` must end with the observation
                time as four tokens, e.g. ``"... 12Z 03 Jul 2018"``. Its second
                token is used in the output file names.
            d1: Element whose ``.string`` is the fixed-width level table:
                a row of column names, a row of units, then data rows, with
                dashed separator lines in between.
            d2: Element whose ``.string`` holds ``name: value`` lines.
            filedir: Output directory. Defaults to
                ``<script dir>/data/<YYYYMM>``; created if missing.

        Raises:
            ValueError: If the level table lacks the name and unit rows.
        """
        h1_list = h1.string.split()
        obser_time = pd.to_datetime(
            ' '.join(h1_list[-4:]), format='%HZ %d %b %Y')
        if filedir is None:
            # Resolve the script directory here instead of reading a global
            # that is only defined when the file is executed as a script.
            script_dir = os.path.dirname(os.path.realpath(__file__))
            filedir = os.path.join(
                script_dir, 'data', obser_time.strftime('%Y%m'))
        os.makedirs(filedir, exist_ok=True)

        stamp = obser_time.strftime('%Y%m%d%H')
        filename1 = 'sounding_details_{}_{}.csv'.format(h1_list[1], stamp)
        filename2 = 'sounding_indices_{}_{}.csv'.format(h1_list[1], stamp)

        n_cols, width = 11, 7
        # Drop blank lines and dashed separator lines only. Removing every
        # '-' from the text (as before) also stripped the sign of negative
        # values and shifted the fixed-width columns of those rows.
        table_lines = [
            line for line in d1.string.splitlines()
            if line.strip().strip('-')
        ]
        if len(table_lines) < 2:
            raise ValueError('sounding table has no name/unit header rows')
        name_row, unit_row = table_lines[0], table_lines[1]
        columns = [
            '{}_{}'.format(
                name_row[i * width:(i + 1) * width].strip(),
                unit_row[i * width:(i + 1) * width].strip())
            for i in range(n_cols)
        ]
        data_rows = table_lines[2:]
        if data_rows:
            # header=None: the header rows were consumed above, so the result
            # no longer depends on how the parser treats a leading blank line.
            # dtype=str keeps the values exactly as printed on the page.
            detail_data = pd.read_fwf(
                StringIO('\n'.join(data_rows)),
                widths=[width] * n_cols, header=None, dtype=str)
            detail_data.columns = columns
        else:
            detail_data = pd.DataFrame(columns=columns)
        detail_data.to_csv(os.path.join(filedir, filename1), index=False)

        # Indices block: one "name: value" pair per line, transposed so that
        # the names become the CSV header and the values a single row.
        index_data = pd.read_csv(
            StringIO(d2.string), delimiter=':', names=['variable', 'value']).T
        index_data.columns = index_data.iloc[0]
        index_data.drop(['variable'], axis=0, inplace=True)
        index_data.to_csv(os.path.join(filedir, filename2), index=False)
    
    if __name__ == '__main__':
        # Directory containing this script. Kept as a module-level name
        # because convert_one_time reads ``homedir`` when no output
        # directory is passed to it.
        homedir = os.path.dirname(os.path.realpath(__file__))
        print(homedir)
        datadir = os.path.join(homedir, 'data')
        # exist_ok makes the call idempotent without hiding real failures
        # (e.g. permission errors) the way a bare ``except: pass`` did.
        os.makedirs(datadir, exist_ok=True)
        # Example invocation:
        # main('2000-01-01', '2018-07-01')
    
    

    相关文章

      网友评论

          本文标题:下载wyoming大学的探空数据

          本文链接:https://www.haomeiwen.com/subject/rtlsuftx.html