为了方便进行各种统计,从网页上爬取数据很常见,今天我们就来看看爬取数据的简单脚本编写方法,为大家之后的需求扩充思路。
首先,我们先明确本次实验的需求:
我希望每天能够获取包括当天在内的未来七天北京的天气情况,并将其入库,为之后的数据统计积累数据基础。
OK,需求有了,接下来就该考虑从哪里获取数据了。这里我们把目光锁定了中央气象台网站,毕竟这是全中国最权威的天气预报网页了。打开后我们发现了我们想要的:
未来7天预报
既然网页上有数据,那我们能够爬取这些数据的可能性就是有的。打开开发者工具,找到打开页面时的各种请求,看看有没有获取这些数据的接口:
获取数据的网址
果然,所有的数据都是从以下这个接口中获取的,这里我们附上接口请求和返回值:
GET http://www.nmc.cn/rest/weather?stationid=54511&_=1585531728560
{
"msg": "success",
"code": 0,
"data": {
"real": {
"station": {
"code": "54511",
"province": "北京市",
"city": "北京",
"url": "/publish/forecast/ABJ/beijing.html"
},
"publish_time": "2020-03-30 08:45",
"weather": {
"temperature": 9.5,
"temperatureDiff": 1.2,
"airpressure": 1015,
"humidity": 55,
"rain": 0,
"rcomfort": 47,
"icomfort": -2,
"info": "晴",
"img": "0",
"feelst": 8.9
},
"wind": {
"direct": "西南风",
"power": "微风",
"speed": ""
},
"warn": {
"alert": "9999",
"pic": "9999",
"province": "9999",
"city": "9999",
"url": "9999",
"issuecontent": "9999",
"fmeans": "9999",
"signaltype": "9999",
"signallevel": "9999",
"pic2": "9999"
}
},
"predict": {
"station": {
"code": "54511",
"province": "北京市",
"city": "北京",
"url": "/publish/forecast/ABJ/beijing.html"
},
"publish_time": "2020-03-30 08:00",
"detail": [
{
"date": "2020-03-30",
"pt": "2020-03-30 08:00",
"day": {
"weather": {
"info": "多云",
"img": "1",
"temperature": "18"
},
"wind": {
"direct": "西南风",
"power": "3~4级"
}
},
"night": {
"weather": {
"info": "多云",
"img": "1",
"temperature": "7"
},
"wind": {
"direct": "南风",
"power": "3~4级"
}
}
},
{
"date": "2020-03-31",
"pt": "2020-03-30 08:00",
"day": {
"weather": {
"info": "多云",
"img": "1",
"temperature": "21"
},
"wind": {
"direct": "东北风",
"power": "3~4级"
}
},
"night": {
"weather": {
"info": "多云",
"img": "1",
"temperature": "7"
},
"wind": {
"direct": "北风",
"power": "3~4级"
}
}
},
{
"date": "2020-04-01",
"pt": "2020-03-30 08:00",
"day": {
"weather": {
"info": "晴",
"img": "0",
"temperature": "17"
},
"wind": {
"direct": "北风",
"power": "微风"
}
},
"night": {
"weather": {
"info": "晴",
"img": "0",
"temperature": "4"
},
"wind": {
"direct": "西南风",
"power": "微风"
}
}
},
{
"date": "2020-04-02",
"pt": "2020-03-30 08:00",
"day": {
"weather": {
"info": "晴",
"img": "0",
"temperature": "21"
},
"wind": {
"direct": "西南风",
"power": "微风"
}
},
"night": {
"weather": {
"info": "晴",
"img": "0",
"temperature": "7"
},
"wind": {
"direct": "北风",
"power": "微风"
}
}
},
{
"date": "2020-04-03",
"pt": "2020-03-30 08:00",
"day": {
"weather": {
"info": "晴",
"img": "0",
"temperature": "24"
},
"wind": {
"direct": "北风",
"power": "微风"
}
},
"night": {
"weather": {
"info": "晴",
"img": "0",
"temperature": "8"
},
"wind": {
"direct": "北风",
"power": "微风"
}
}
},
{
"date": "2020-04-04",
"pt": "2020-03-30 08:00",
"day": {
"weather": {
"info": "晴",
"img": "0",
"temperature": "21"
},
"wind": {
"direct": "南风",
"power": "微风"
}
},
"night": {
"weather": {
"info": "晴",
"img": "0",
"temperature": "6"
},
"wind": {
"direct": "南风",
"power": "微风"
}
}
},
{
"date": "2020-04-05",
"pt": "2020-03-30 08:00",
"day": {
"weather": {
"info": "多云",
"img": "1",
"temperature": "20"
},
"wind": {
"direct": "南风",
"power": "3~4级"
}
},
"night": {
"weather": {
"info": "多云",
"img": "1",
"temperature": "8"
},
"wind": {
"direct": "南风",
"power": "微风"
}
}
}
]
},
"air": {
"forecasttime": "2020-03-30 08:00",
"aqi": 80,
"aq": 2,
"text": "良",
"aqiCode": "99006;99008;99009;99010;99011;99012;99013;99014;99015;99016;99017"
},
"tempchart": [
{
"time": "2020/03/23",
"max_temp": 22,
"min_temp": 4.5,
"day_img": "9999",
"day_text": "9999",
"night_img": "9999",
"night_text": "9999"
},
{
"time": "2020/03/24",
"max_temp": 18.4,
"min_temp": 6.4,
"day_img": "9999",
"day_text": "9999",
"night_img": "9999",
"night_text": "9999"
},
{
"time": "2020/03/25",
"max_temp": 21.1,
"min_temp": 10.9,
"day_img": "9999",
"day_text": "9999",
"night_img": "9999",
"night_text": "9999"
},
{
"time": "2020/03/26",
"max_temp": 15.3,
"min_temp": 5.4,
"day_img": "9999",
"day_text": "9999",
"night_img": "9999",
"night_text": "9999"
},
{
"time": "2020/03/27",
"max_temp": 13.4,
"min_temp": 4,
"day_img": "9999",
"day_text": "9999",
"night_img": "9999",
"night_text": "9999"
},
{
"time": "2020/03/28",
"max_temp": 13.7,
"min_temp": -0.8,
"day_img": "9999",
"day_text": "9999",
"night_img": "9999",
"night_text": "9999"
},
{
"time": "2020/03/29",
"max_temp": 16.7,
"min_temp": 2.7,
"day_img": "9999",
"day_text": "9999",
"night_img": "9999",
"night_text": "9999"
},
{
"time": "2020/03/30",
"max_temp": 18,
"min_temp": 7,
"day_img": "1",
"day_text": "多云",
"night_img": "1",
"night_text": "多云"
},
{
"time": "2020/03/31",
"max_temp": 21,
"min_temp": 7,
"day_img": "1",
"day_text": "多云",
"night_img": "1",
"night_text": "多云"
},
{
"time": "2020/04/01",
"max_temp": 17,
"min_temp": 4,
"day_img": "0",
"day_text": "晴",
"night_img": "0",
"night_text": "晴"
},
{
"time": "2020/04/02",
"max_temp": 21,
"min_temp": 7,
"day_img": "0",
"day_text": "晴",
"night_img": "0",
"night_text": "晴"
},
{
"time": "2020/04/03",
"max_temp": 24,
"min_temp": 8,
"day_img": "0",
"day_text": "晴",
"night_img": "0",
"night_text": "晴"
},
{
"time": "2020/04/04",
"max_temp": 21,
"min_temp": 6,
"day_img": "0",
"day_text": "晴",
"night_img": "0",
"night_text": "晴"
},
{
"time": "2020/04/05",
"max_temp": 20,
"min_temp": 8,
"day_img": "1",
"day_text": "多云",
"night_img": "1",
"night_text": "多云"
}
],
"passedchart": [
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 8.2,
"tempDiff": "",
"humidity": 60,
"pressure": 1014,
"windDirection": 200,
"windSpeed": 1.6,
"time": "2020-03-30 08:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 6.8,
"tempDiff": "",
"humidity": 64,
"pressure": 1014,
"windDirection": 205,
"windSpeed": 1.5,
"time": "2020-03-30 07:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 6.4,
"tempDiff": "",
"humidity": 62,
"pressure": 1014,
"windDirection": 188,
"windSpeed": 1.8,
"time": "2020-03-30 06:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 6.8,
"tempDiff": "",
"humidity": 60,
"pressure": 1013,
"windDirection": 231,
"windSpeed": 1.5,
"time": "2020-03-30 05:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 7.4,
"tempDiff": "",
"humidity": 58,
"pressure": 1013,
"windDirection": 214,
"windSpeed": 1.9,
"time": "2020-03-30 04:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 8.4,
"tempDiff": "",
"humidity": 57,
"pressure": 1013,
"windDirection": 205,
"windSpeed": 2.4,
"time": "2020-03-30 03:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 9.6,
"tempDiff": "",
"humidity": 51,
"pressure": 1013,
"windDirection": 180,
"windSpeed": 4.5,
"time": "2020-03-30 02:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 9.9,
"tempDiff": "",
"humidity": 48,
"pressure": 1013,
"windDirection": 200,
"windSpeed": 2.6,
"time": "2020-03-30 01:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 10.4,
"tempDiff": "",
"humidity": 44,
"pressure": 1013,
"windDirection": 200,
"windSpeed": 3.3,
"time": "2020-03-30 00:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 11.2,
"tempDiff": "",
"humidity": 40,
"pressure": 1013,
"windDirection": 203,
"windSpeed": 1.6,
"time": "2020-03-29 23:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 11.5,
"tempDiff": "",
"humidity": 40,
"pressure": 1013,
"windDirection": 177,
"windSpeed": 1.4,
"time": "2020-03-29 22:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 12.1,
"tempDiff": "",
"humidity": 38,
"pressure": 1013,
"windDirection": 211,
"windSpeed": 2.2,
"time": "2020-03-29 21:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 12.9,
"tempDiff": "",
"humidity": 35,
"pressure": 1012,
"windDirection": 214,
"windSpeed": 2.3,
"time": "2020-03-29 20:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 14.1,
"tempDiff": "",
"humidity": 31,
"pressure": 1012,
"windDirection": 211,
"windSpeed": 3.2,
"time": "2020-03-29 19:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 15.5,
"tempDiff": "",
"humidity": 27,
"pressure": 1011,
"windDirection": 211,
"windSpeed": 3.1,
"time": "2020-03-29 18:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 16.3,
"tempDiff": "",
"humidity": 24,
"pressure": 1011,
"windDirection": 214,
"windSpeed": 4.6,
"time": "2020-03-29 17:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 16.7,
"tempDiff": "",
"humidity": 22,
"pressure": 1012,
"windDirection": 174,
"windSpeed": 2.4,
"time": "2020-03-29 16:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 16.5,
"tempDiff": "",
"humidity": 22,
"pressure": 1013,
"windDirection": 177,
"windSpeed": 3.7,
"time": "2020-03-29 15:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 15.8,
"tempDiff": "",
"humidity": 23,
"pressure": 1014,
"windDirection": 256,
"windSpeed": 6,
"time": "2020-03-29 14:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 15,
"tempDiff": "",
"humidity": 24,
"pressure": 1015,
"windDirection": 211,
"windSpeed": 5.6,
"time": "2020-03-29 13:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 13.8,
"tempDiff": "",
"humidity": 24,
"pressure": 1016,
"windDirection": 194,
"windSpeed": 3.5,
"time": "2020-03-29 12:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 12.5,
"tempDiff": "",
"humidity": 27,
"pressure": 1017,
"windDirection": 236,
"windSpeed": 3,
"time": "2020-03-29 11:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 10.9,
"tempDiff": "",
"humidity": 30,
"pressure": 1018,
"windDirection": 239,
"windSpeed": 5.2,
"time": "2020-03-29 10:00"
},
{
"rain1h": 0,
"rain24h": 9999,
"rain12h": 9999,
"rain6h": 9999,
"temperature": 8.8,
"tempDiff": "",
"humidity": 36,
"pressure": 1018,
"windDirection": 242,
"windSpeed": 3.2,
"time": "2020-03-29 09:00"
}
],
"climate": {
"time": "1981年-2010年",
"month": [
{
"month": 1,
"maxTemp": 1.8,
"minTemp": -7.3,
"precipitation": 2.8
},
{
"month": 2,
"maxTemp": 6.2,
"minTemp": -4.1,
"precipitation": 4.4
},
{
"month": 3,
"maxTemp": 12.8,
"minTemp": 1.8,
"precipitation": 9.9
},
{
"month": 4,
"maxTemp": 20.6,
"minTemp": 8.9,
"precipitation": 23.7
},
{
"month": 5,
"maxTemp": 27,
"minTemp": 15,
"precipitation": 37.6
},
{
"month": 6,
"maxTemp": 30.7,
"minTemp": 19.9,
"precipitation": 70.5
},
{
"month": 7,
"maxTemp": 32.1,
"minTemp": 23.1,
"precipitation": 159.6
},
{
"month": 8,
"maxTemp": 30.6,
"minTemp": 21.8,
"precipitation": 139.4
},
{
"month": 9,
"maxTemp": 26.6,
"minTemp": 16.3,
"precipitation": 48.7
},
{
"month": 10,
"maxTemp": 19.4,
"minTemp": 8.9,
"precipitation": 23.9
},
{
"month": 11,
"maxTemp": 10.2,
"minTemp": 0.6,
"precipitation": 9.6
},
{
"month": 12,
"maxTemp": 3.5,
"minTemp": -5.1,
"precipitation": 2
}
]
},
"radar": {
"title": "华北",
"image": "/product/2020/03/30/RDCP/SEVP_AOC_RDCP_SLDAS_EBREF_ANCN_L88_PI_20200330010000001.PNG?v=1585530510509",
"url": "/publish/radar/huabei.html"
}
}
}
Emmm... 从接口的返回值来看,data.predict.detail数组里面的数据应该就是我们需要的。而且,通过实验,发现接口query参数中的时间戳“_”参数不传也能够返回请求时间的数据,所以我们就简化一下接口请求,只带着stationid参数即可。
好,万事俱备,开始写脚本:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 27 10:44:22 2020
@author: xingshulin
"""
import requests
import pymysql
# MySQL connection settings.
# NOTE(review): credentials are hard-coded as in the original tutorial; move
# them to environment variables or a config file before real use.
DB_CONFIG = {
    'host': '127.0.0.1',
    'port': 33306,
    'user': 'test',
    'password': '123456',
    'db': 'test',
    'charset': 'utf8',
}

# Request parameters.
WEATHER_URL = "http://www.nmc.cn/rest/weather?stationid=54511"  # API endpoint
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Mobile Safari/537.36'
}
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

INSERT_SQL = 'insert into weather(date, day, night) values(%s,%s,%s)'


def extract_forecast_detail(payload):
    """Return the ``data.predict.detail`` list from a decoded API response.

    Args:
        payload: The decoded JSON body of the weather endpoint.

    Returns:
        The list of per-day forecast dicts, or an empty list when the
        payload is not a dict or any level of the path is missing or null.
    """
    if not isinstance(payload, dict):
        return []
    data = payload.get('data') or {}
    predict = data.get('predict') or {}
    return predict.get('detail') or []


def build_rows(detail):
    """Turn forecast entries into ``(date, day, night)`` insert tuples.

    ``day`` and ``night`` are stored as ``str()`` of the nested dicts, which
    is the format the original script wrote to the table.
    NOTE(review): that is a Python repr, not JSON; consider ``json.dumps``
    if the column is ever to be parsed by non-Python tools.
    """
    return [
        (entry.get('date'), str(entry.get('day')), str(entry.get('night')))
        for entry in detail
    ]


def fetch_forecast_detail(url=WEATHER_URL, headers=REQUEST_HEADERS):
    """Request the weather endpoint and return its forecast detail list.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    # verify=False was dropped: it has no effect on an http:// URL and would
    # disable certificate checks if the endpoint ever moves to https.
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return extract_forecast_detail(response.json())


def main():
    """Fetch the forecast and insert one row per forecast day into MySQL."""
    # Connect to the database.
    print('连接到mysql服务器...')
    conn = pymysql.connect(**DB_CONFIG)
    print('连接上了!')
    try:
        # Fetch and unpack the response.
        weather_predict_detail = fetch_forecast_detail()
        print(weather_predict_detail)
        # Write to the database; the cursor is closed when the block exits.
        with conn.cursor() as cs1:
            for row in build_rows(weather_predict_detail):
                cs1.execute(INSERT_SQL, row)
        # Commit all inserts at once; nothing is committed if any step fails.
        conn.commit()
    finally:
        # Always release the connection, even when the request or insert fails.
        conn.close()


if __name__ == '__main__':
    main()
运行,看结果,数据已经被成功写入数据库中:
数据库数据
这个例子是针对GET接口的,对于POST等接口,可能需要payload数据,接口请求代码编写模式如下:
# Request parameters for a POST endpoint that takes a JSON body.
url = "https://www.XXX.com/api/data/list"  # API endpoint ("XXX" is a placeholder host)
headers = {
    'Content-Type': 'application/json;charset=UTF-8',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Mobile Safari/537.36',
    # NOTE(review): a session cookie is a credential; do not publish or commit
    # a real one. Load it from the environment or a secrets store instead.
    'Cookie': 'rememberMe=eGluZ3NodWxpbjo4NjQ3MjMxOjk5YjcyNWJjZDIxNGJjOTNlMzdhOTk5YzQwYzdkYTdm; JSESSIONID=F713491FE9EF3361CF8F5328A4690E14',
}
payload = {"pageNo": 1, "pageSize": 500}
# verify=False was dropped so the https certificate is checked (the requests
# default); a timeout keeps the call from blocking indefinitely.
r = requests.post(url, json=payload, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx response instead of processing an error body.
r.raise_for_status()
网友评论