10_数据入库/01_MySQL查找.py:
"""
创建UTF-8的数据库
CREATE DATABASE 数据库名字 DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;
CREATE DATABASE spider DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;
"""
import pymysql
host = "localhost"
port = 8001
db = "spider"
user = "admin"
password = "qwe123"
conn = pymysql.connect(host=host, port=port, db=db, user=user, password=password)
# print(conn)  # prints <pymysql.connections.Connection object at 0x0000019F12DA1550>
# cursor = conn.cursor()  # get a cursor
cursor = conn.cursor(pymysql.cursors.DictCursor)  # without pymysql.cursors.DictCursor rows come back as tuples, with it as dicts
cursor.execute("SELECT * FROM Students")  # execute the statement
print(cursor.fetchone())  # fetch one row
# print(cursor.fetchall())  # fetch all rows
cursor.close()  # close the cursor first
conn.close()  # then close the connection
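A quick sketch of what DictCursor changes, assuming the same spider database and a Students table with name and age columns: the default cursor returns each row as a tuple, while DictCursor returns a dict keyed by column name.

import pymysql
from pymysql.cursors import DictCursor

# Connection parameters copied from the script above; adjust to your own setup.
conn = pymysql.connect(host="localhost", port=8001, db="spider",
                       user="admin", password="qwe123")

with conn.cursor() as cursor:              # default cursor: rows as tuples
    cursor.execute("SELECT * FROM Students")
    print(cursor.fetchall())               # e.g. (('贾克斯', 40), ...)

with conn.cursor(DictCursor) as cursor:    # DictCursor: rows as dicts
    cursor.execute("SELECT * FROM Students")
    for row in cursor.fetchall():
        print(row)                         # e.g. {'name': '贾克斯', 'age': 40}

conn.close()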
10_数据入库/02_MySQL插入数据.py:
"""
插入一条:insert into Students (name,age) values ('贾克斯',40);
插入多条:insert into Students (name,age) values ('贾克斯',40),('贾克斯',40),('贾克斯',40);
"""
import pymysql
host = "localhost"
port = 8001
db = "spider"
user = "admin"
password = "qwe123"
conn = pymysql.connect(host=host, port=port, db=db, user=user, password=password)
cursor = conn.cursor()  # get a cursor
cursor.execute("insert into Students (name,age) values ('贾克斯',40);")  # execute the statement
conn.commit()  # commit, otherwise the insert is not saved!
cursor.close()  # close the cursor first
conn.close()  # then close the connection
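When several rows need to go in, as in the multi-row insert statement shown in the docstring, executemany() with %s placeholders is a convenient alternative to writing the VALUES list by hand; a minimal sketch against the same Students table (the extra names are made-up sample data):

import pymysql

conn = pymysql.connect(host="localhost", port=8001, db="spider",
                       user="admin", password="qwe123")
cursor = conn.cursor()

rows = [('贾克斯', 40), ('盖伦', 25), ('拉克丝', 18)]  # sample data (made up)
# pymysql fills each %s placeholder and escapes the values itself.
cursor.executemany("insert into Students (name, age) values (%s, %s)", rows)

conn.commit()   # still required: inserts are invisible until committed
cursor.close()
conn.close()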
10_数据入库/03_MongoDB插入数据.py:
"""
pip install pymongo
mongodb 默认端口 27017
在 ubuntu 下安装以及开启远程访问:
1. sudo vi /etc/mongodb.conf
将 bind_ip = 127.0.0.1 修改为 bind_ip = 0.0.0.0
2. /etc/init.d/mongodb restart 重启服务
"""
from pymongo import MongoClient
conn = MongoClient('localhost', 8881)
db = conn.students  # created automatically if the database doesn't exist yet
my_set = db.words   # created automatically if the collection (table) doesn't exist yet
# data = [{'name':'雷霆嘎巴2','age':18},{'name':'马尔扎哈2','age':18}]
data = [{'name':'雷霆嘎巴2','age':[1,2,3,4]}]
my_set.insert_many(data)  # insert the documents
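For a single document there is also insert_one(), which takes a dict directly and returns the generated _id; a small sketch using the same students.words collection (the non-default port 8881 is just this setup's configuration):

from pymongo import MongoClient

conn = MongoClient('localhost', 8881)
my_set = conn.students.words

# insert_one takes one dict; insert_many takes a list of dicts.
result = my_set.insert_one({'name': '雷霆嘎巴2', 'age': 18})
print(result.inserted_id)  # MongoDB assigns an ObjectId automatically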
10_数据入库/04_MongoDB查找数据.py:
from pymongo import MongoClient
conn = MongoClient('localhost',8881)
db = conn.students  # created automatically if the database doesn't exist yet
my_set = db.words   # created automatically if the collection (table) doesn't exist yet
# print(my_set.find())  # prints <pymongo.cursor.Cursor object at 0x000001D4924A2908>
for data in my_set.find():
    print(data)
    print(data['age'])
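find() also accepts a filter document, and find_one() returns only the first match; a small sketch against the same collection, filtering on the name field inserted earlier:

from pymongo import MongoClient

conn = MongoClient('localhost', 8881)
my_set = conn.students.words

for doc in my_set.find({'name': '雷霆嘎巴2'}):  # only matching documents
    print(doc)

print(my_set.find_one({'name': '雷霆嘎巴2'}))   # first match, or None
print(my_set.count_documents({}))              # how many documents in total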
10_数据入库/05_爬取双色球历史数据.py:
import pymysql
from pymongo import MongoClient
from requests_html import HTMLSession
class Spider:
    def __init__(self):
        self.url = "https://datachart.500.com/ssq/history/newinc/history.php?start=19000&end=21018"
        self.session = HTMLSession()
        # Connect only once; never put the connections inside the loop!
        # MongoDB connection
        conn = MongoClient('localhost', 8881)
        db = conn['dual_colored_ball']  # created automatically if the database doesn't exist yet
        self.my_set = db['words']       # created automatically if the collection (table) doesn't exist yet
        # MySQL connection
        host = "localhost"
        port = 8001
        db = "spider"
        user = "admin"
        password = "qwe123"
        self.conn = pymysql.connect(host=host, port=port, db=db, user=user, password=password)
        self.cursor = self.conn.cursor()  # get a cursor
    def parse(self):
        response = self.session.get(url=self.url)
        for tr in response.html.xpath('//tbody[@id="tdata"]/tr'):
            number = tr.xpath('//td[1]/text()')[0]  # issue number
            red = tr.xpath('//td[2]/text()|//td[3]/text()|//td[4]/text()|//td[5]/text()|//td[6]/text()|//td[7]/text()')  # red balls
            blue = tr.xpath('//td[8]/text()')[0]  # blue ball
            prizePool = tr.xpath('//td[10]/text()')[0]  # prize pool (yuan)
            FirstPrize = tr.xpath('//td[11]/text()|//td[12]/text()')  # first prize
            SecondPrize = tr.xpath('//td[13]/text()|//td[14]/text()')  # second prize
            Total_bet = tr.xpath('//td[15]/text()')[0]  # total bet amount
            Date = tr.xpath('//td[16]/text()')[0]  # draw date
            data = (number, red, blue, prizePool, FirstPrize, SecondPrize, Total_bet, Date)
            # self.saveMongoDB(data)
            self.saveMySQL(data)
            print(data)  # e.g. ['19077'] ['09', '11', '13', '18', '21', '22'] ['15'] ['928,983,242'] ['1', '10,000,000'] ['118', '221,011'] ['331,156,004'] ['2019-07-04']
    def saveMySQL(self, data):
        self.cursor.execute("insert into dual_colored_ball values ('%s','%s','%s','%s','%s','%s','%s','%s');" % (
            int(data[0]),
            '-'.join(data[1]),  # join the list into one string
            data[2],
            data[3],
            '-'.join(data[4]),
            '-'.join(data[5]),
            data[6],
            data[7],
        ))
        self.conn.commit()  # commit, otherwise the insert is not saved!
    def saveMongoDB(self, data):
        # insert_many takes a list, so wrap the dict in []
        self.my_set.insert_many([{
            "number": data[0],
            "red": data[1],
            "blue": data[2],
            "prizePool": data[3],
            "FirstPrize": data[4],
            "SecondPrize": data[5],
            "Total_bet": data[6],
            "Date": data[7],
        }])  # insert the document
    def run(self):
        self.parse()
        self.cursor.close()
        self.conn.close()

if __name__ == '__main__':
    spider = Spider()
    spider.run()
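saveMySQL above builds the SQL with %-string formatting, which means quoting the values by hand and breaks if a value ever contains a quote character. A safer variant passes the values separately and lets pymysql do the escaping; this is only a sketch of the idea, written as a standalone function, and it assumes the dual_colored_ball table has exactly the eight columns inserted above.

def save_mysql(cursor, conn, data):
    """Parameterized variant of Spider.saveMySQL (a sketch, not the original code)."""
    sql = "insert into dual_colored_ball values (%s, %s, %s, %s, %s, %s, %s, %s)"
    cursor.execute(sql, (
        int(data[0]),
        '-'.join(data[1]),   # red balls joined as e.g. '09-11-13-18-21-22'
        data[2],
        data[3],
        '-'.join(data[4]),
        '-'.join(data[5]),
        data[6],
        data[7],
    ))
    conn.commit()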
10_数据入库/06_链家网.py:
from requests_html import HTMLSession
import pymysql
from pymongo import MongoClient
import re
import csv
class Spider:
    def __init__(self):
        self.url = "https://cs.lianjia.com/ershoufang/"
        self.session = HTMLSession()
        # Connect only once; never put the connections inside the loop!
        # MongoDB connection
        conn = MongoClient('localhost', 8881)
        db = conn['HOME_LINK_net']
        self.my_set = db['house_datas']
        # MySQL connection
        host = "localhost"
        port = 8001
        db = "spider"
        user = "admin"
        password = "qwe123"
        self.conn = pymysql.connect(host=host, port=port, db=db, user=user, password=password)
        self.cursor = self.conn.cursor()  # get a cursor
    def parse(self):
        response = self.session.get(url=self.url)
        for div in response.html.xpath('//div[@class="info clear"]'):
            title = div.xpath('//div[@class="title"]/a/text()')[0]  # listing title
            position_Small = div.xpath('//div[@class="positionInfo"]/a[1]/text()')[0].strip()  # e.g. 和美星城
            position_Big = div.xpath('//div[@class="positionInfo"]/a[2]/text()')[0]
            position = '{}-{}'.format(position_Small, position_Big)  # e.g. 和美星城-暮云
            house = div.xpath('//div[@class="houseInfo"]/text()')[0]
            follow = div.xpath('//div[@class="followInfo"]/text()')[0]  # e.g. 0人关注 / 7天以前发布
            followinfo = follow.split('/')  # e.g. ['0人关注 ', ' 7天以前发布']
            amount_of_attention = followinfo[0]
            release_time = followinfo[1]
            """
            Tricky part to scrape:
            <div class="followInfo">
            <span class="starIcon"></span >
            "0人关注 / 7天以前发布"
            </div >
            """
            house_price = div.xpath('//div[@class="totalPrice totalPrice2"]/span/text()|//div[@class="totalPrice totalPrice2"]/i[2]/text()')  # e.g. ['121', '万']
            house_price = house_price[0] + house_price[1]  # e.g. 121万
            per_yuan = div.xpath('//div[@class="unitPrice"]/span/text()')[0]
            data = (title, position, house, amount_of_attention, release_time, house_price, per_yuan)
            # CSV write
            # (a: open for appending, write-only; a+: open for appending, read and write)
            with open('房优选择.csv', 'a+', encoding='utf-8', newline='') as fp:
                writer = csv.writer(fp)
                writer.writerow(data)
            # # MongoDB write
            # self.saveMongoDB(data)
            # # MySQL write
            # self.saveMySQL(data)
            print(data)
    def saveMySQL(self, data):
        # Remember to create the table first
        self.cursor.execute("insert into house_datas values ('%s','%s','%s','%s','%s','%s','%s');" % (
            data[0],
            data[1],
            data[2],
            data[3],
            data[4],
            data[5],
            data[6],
        ))
        self.conn.commit()  # commit, otherwise the insert is not saved!
    def saveMongoDB(self, data):
        # insert_many takes a list, so wrap the dict in []
        self.my_set.insert_many([{
            "title": data[0],
            "position": data[1],
            "house": data[2],
            "amount_of_attention": data[3],
            "release_time": data[4],
            "house_price": ''.join(data[5]),
            "per_yuan": data[6],
        }])  # insert the document
    def run(self):
        self.parse()
        self.cursor.close()
        self.conn.close()

if __name__ == '__main__':
    spider = Spider()
    headers = ('文章标题','地点','房貌','关注量','发表时间','总房价','每平价')
    with open('房优选择.csv', 'w', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(headers)
    spider.run()
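The comment in saveMySQL says to create the table first, but the listing never shows the DDL; a hypothetical schema with one VARCHAR column per field written by saveMySQL could look like the sketch below (column names and sizes are assumptions, adjust to your data).

import pymysql

# Hypothetical schema: one column per field in the data tuple.
CREATE_SQL = """
CREATE TABLE IF NOT EXISTS house_datas (
    title VARCHAR(100),
    position VARCHAR(100),
    house VARCHAR(200),
    amount_of_attention VARCHAR(50),
    release_time VARCHAR(50),
    house_price VARCHAR(20),
    per_yuan VARCHAR(50)
) DEFAULT CHARSET=utf8;
"""

conn = pymysql.connect(host="localhost", port=8001, db="spider",
                       user="admin", password="qwe123")
with conn.cursor() as cursor:
    cursor.execute(CREATE_SQL)
conn.commit()
conn.close()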
That's all for this article! I hope you'll keep supporting this Python series! Six months to learn Python with me; message me privately with any questions about this article! New articles will be published every day, so give a follow if you like them! A new youth keeping you company while you learn Python! No matter how busy things get, the updates will keep coming. Let's keep it up together!
Editor:Lonelyroots