美文网首页我爱编程
Python入门(2)-第一次爬虫

Python入门(2)-第一次爬虫

作者: 两只cows | 来源:发表于2018-05-27 11:58 被阅读0次

一.爬虫目的

爬取webservice上提供的json格式的气象数据,解析后存入SQLServer数据库。

二.开发环境

Python2.7+Spyder+SQLServer2008

三.注意事项

1.webservice所提供的气象数据是定时刷新的,所以要制作定时器,定时爬取

2.在模拟登陆验证身份信息的时候,要注意把自己的sessionID带上

四.定时爬虫程序代码

# -*- coding: utf-8 -*-

"""

Created on Sat Nov 04 22:30:37 2017

@author: Administrator

"""

import requests

import json

from apscheduler.schedulers.blocking import BlockingScheduler

from apscheduler.executors.pool import ThreadPoolExecutor, ProcessPoolExecutor

import pymssql

class MSSQL:
    """Fetch weather JSON from the citygrid webservice and persist it
    into a SQL Server table via pymssql.

    The scraped original lost all indentation and had HTML-mangled
    string literals (stray ``";`` after URLs); both are repaired here.
    The hand-formatted 40-value INSERT string was replaced with a
    parameterized query to avoid SQL injection and quoting bugs.
    """

    # Keys read from each JSON record, in [dbo].[wea] column order.
    # Centralizing them replaces the original 40-item '%s' format string.
    _FIELDS = (
        u'id', u'Time', u'Version', u'Status', u'Battery', u'Temperature',
        u'Humidity', u'Light', u'Noise', u'Pressure', u'PM2p5', u'PM10',
        u'WindSpeed', u'WindDirection', u'CO2', u'CO', u'VOC', u'SO2',
        u'NO2', u'O3', u'HumanFlow', u'HumanFlow2', u'HumanFlow3',
        u'HumanFlow4', u'CarFlow', u'CarFlow2', u'CarFlow3', u'CarFlow4',
        u'CarSpeed', u'CarSpeed2', u'CarSpeed3', u'CarSpeed4',
        u'GPSLongDeg', u'GPSLongMin', u'GPSLongSec', u'GPSLatDeg',
        u'GPSLatMin', u'GPSLatSec', u'UV', u'HCHO',
    )

    def __init__(self, host='*****', user='***', pwd='****', db='****'):
        """Store connection parameters; no connection is opened yet.

        Credentials are masked placeholders in the published article.
        """
        self.host = host
        self.user = user
        self.pwd = pwd
        self.db = db

    def connect(self):
        """Open a fresh pymssql connection and return a cursor.

        The connection is kept on ``self.conn`` so callers can commit
        and close after executing.  Raises NameError (kept for
        backward compatibility with the original) if no cursor is
        obtained.
        """
        self.conn = pymssql.connect(host=self.host, user=self.user,
                                    password=self.pwd, database=self.db)
        cur = self.conn.cursor()
        if not cur:
            # Original used the Py2-only tuple form `raise(NameError, ...)`;
            # the exception type is preserved so existing handlers still match.
            raise NameError("连接数据库失败")
        return cur

    def login(self):
        """Authenticate against the webservice login endpoint.

        NOTE(review): the session cookie set by this response is NOT
        reused by accept(), which sends a hardcoded sessionid instead —
        presumably copied from a browser; confirm whether login() is
        actually required.
        """
        url = 'http://www.citygrid.net.cn/sensor/api/ApiLogin/'
        payload = {"userid": "gttdzzzx", "pwd": "guotu123"}
        r = requests.get(url, params=payload)
        print(r.text)

    def accept(self):
        """Fetch the latest sensor reading and insert it into [dbo].[wea].

        Called once at startup and then every 60 seconds by the
        scheduler in main().
        """
        results = requests.Session().get(
            "http://www.citygrid.net.cn/sensor/api/GetApiLatestvaluebydevicecode/"
            "?devicecode=*******&format=json",
            headers={
                'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64; '
                               'rv:56.0) Gecko/20100101 Firefox/56.0'),
                "Host": "www.citygrid.net.cn",
                "Referer": ("http://www.citygrid.net.cn/sensor/api/"
                            "GetApiLatestvaluebydevicecode"),
                # Masked in the article; a real session id must be supplied.
                "Cookie": "sessionid=******",
            })
        print(results.text)

        datalist = []
        t = json.loads(results.text)
        datalist.append(t)

        # Parameterized INSERT: pymssql substitutes %s placeholders safely,
        # unlike the original string-formatted query.
        placeholders = ", ".join(["%s"] * len(self._FIELDS))
        sql = "insert into [dbo].[wea] values (%s);" % placeholders

        con = self.connect()
        try:
            for ele in datalist:
                print(ele)
                con.execute(sql, tuple(ele[key] for key in self._FIELDS))
            self.conn.commit()
        finally:
            # Always release the connection, even if the insert fails.
            self.conn.close()

def main():
    """Log in, take one immediate reading, then poll every 60 seconds.

    Fix vs. original: the extra ``obj.connect()`` before ``accept()``
    was removed — accept() opens (and closes) its own connection, so
    the earlier one leaked: its cursor was discarded and ``self.conn``
    was overwritten without being closed.
    """
    obj = MSSQL(host='****8', user='***', pwd='***', db='*****')
    obj.login()
    obj.accept()

    # BlockingScheduler.start() never returns; it re-runs accept()
    # every 60 seconds until the process is interrupted.
    sched = BlockingScheduler()
    sched.add_job(obj.accept, 'interval', seconds=60)
    sched.start()


if __name__ == '__main__':
    main()

相关文章

网友评论

    本文标题:Python入门(2)-第一次爬虫

    本文链接:https://www.haomeiwen.com/subject/whtqvxtx.html