Continued from the previous post.
1. Overview
Step 1: scrape the web tables and save them as local xls files
# get_xls.py
import time

import requests
from bs4 import BeautifulSoup

i = 0
time_start = time.time()

# Collect article links from the list pages in [startpage, endpage).
def linkGet(startpage, endpage):
    global i
    c = []
    for j in range(startpage, endpage):
        url = ('http://www.sxmwr.gov.cn/List.action?classfyid=61&currentPage=' + str(j) +
               '&turnHref=a0909d3e2591f4bad185dcc317ed1305ecd1ce6e561ebba08b16ce55831d5ab8a287dfa4a19beda0c6bc579659449f995714cf0f1d6601099d111aa8b2a942c122565fccc10321a12fa3875b48a46949d5c36fb26f106d16e54a688e17199bd5c4e6b68a622d3b2792ba2c781a2d4e17fffe1f9e8c4d6cdf6348d9a80dbcf0bdaea67d6bcc745b348c230d59c63a6576131bcee30514c0527ad244d7662c1922')
        res = requests.get(url)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        for link in soup.select('.articlelist'):
            for a in link.select('a'):
                c.append([a['href']])
                i = i + 1
    return c

# Fetch one article page and return its title and content node.
def xlsGet(url):
    result = {}
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#mainNewsTitles')[0].text.strip()
    result['artical'] = soup.select('#mainNewsContent')[0]
    return result

# Save each article's content (an HTML table) as <index>.xls.
def xlsWrite(c):
    for k in range(0, len(c)):
        dic = xlsGet('http://www.sxmwr.gov.cn/' + str(c[k][0]))
        s = str(dic['artical'])
        with open(str(k) + '.xls', 'w+', encoding='utf-8') as f:
            f.write(s)

c1 = linkGet(0, 20)  # pages holding the 2014 data
c2 = linkGet(20, 40)
c3 = linkGet(40, 60)
c4 = linkGet(60, 62)
xlsWrite(c1)
xlsWrite(c2)
xlsWrite(c3)
xlsWrite(c4)
time_end = time.time()
print('Done, total time: ' + str(time_end - time_start) + ' s')
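Note that what xlsWrite saves is not a real Excel workbook but the raw HTML of the #mainNewsContent node, written with an .xls extension; Excel opens it anyway by sniffing the content. A quick sanity check, assuming a file named 0.xls was produced by the run above:

# Sanity check: the saved ".xls" is really an HTML fragment, not a binary workbook.
with open('0.xls', encoding='utf-8') as f:
    print(f.read(200))  # expect something like '<div id="mainNewsContent">...<table>...'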
Step 2: merge the xls files into one multi-sheet workbook (VBA)
Sub CombineWorkbooks()
    Dim FilesToOpen, wk
    Dim x As Integer
    Application.ScreenUpdating = False
    On Error GoTo errhandler
    FilesToOpen = Application.GetOpenFilename _
        (FileFilter:="Microsoft Excel files (*.xls), *.xls", _
        MultiSelect:=True, Title:="Files to merge")
    ' GetOpenFilename returns False (a Boolean) when the user cancels
    If TypeName(FilesToOpen) = "Boolean" Then
        MsgBox "No files selected"
        Exit Sub
    End If
    x = 1
    While x <= UBound(FilesToOpen)
        Set wk = Workbooks.Open(Filename:=FilesToOpen(x))
        ' Move every sheet of the opened workbook to the end of this one
        wk.Sheets().Move after:=ThisWorkbook.Sheets _
            (ThisWorkbook.Sheets.Count)
        x = x + 1
    Wend
    MsgBox "Merge completed!"
errhandler:
    Application.ScreenUpdating = True
    'MsgBox Err.Description
End Sub
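If you would rather stay in Python for this step, here is a minimal sketch of the same merge. It is an assumption-laden alternative, not the method used above: it assumes pandas (with lxml for HTML parsing and openpyxl for writing) is installed, and the output name merged.xlsx is arbitrary. Because the scraped ".xls" files are HTML tables, pd.read_html can parse them directly:

import glob
import pandas as pd

# Write every parsed table into one multi-sheet workbook (hypothetical file names).
with pd.ExcelWriter('merged.xlsx') as writer:
    for path in sorted(glob.glob('*.xls')):
        tables = pd.read_html(path)  # each scraped ".xls" is really an HTML <table>
        if tables:
            sheet = path.rsplit('.', 1)[0][:31]  # Excel caps sheet names at 31 chars
            tables[0].to_excel(writer, sheet_name=sheet, index=False)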
Step 3: extract the data for the specified stations
# xlsWrite.py
import xlrd
import xlwt
from xlutils3.copy import copy

data = xlrd.open_workbook('1.xls')  # open the merged workbook
count = len(data.sheets())          # number of sheets
stats = ['枣园', '安塞', '杏河', '延安', '甘谷驿']

# One workbook per station, each starting from a single summary sheet
wb = xlwt.Workbook()
ws = wb.add_sheet('汇总')

# Find the given station; arguments are the sheet being scanned and the station index
def findStation(table, i):
    rows = table.nrows
    for row in range(0, rows):
        cell_Col1 = table.col(1)[row].value  # value in column 1
        if cell_Col1 == stats[i]:
            station = [cell_Col1, table.col(0)[1].value, table.col(2)[row].value,
                       table.col(3)[row].value, table.col(4)[row].value, table.col(5)[row].value,
                       table.col(6)[row].value]
            return station

# Append one row of data to the station's workbook
def appendFile(j, k):
    rexcel = xlrd.open_workbook(stats[j] + '.xls')  # read the existing file with xlrd
    excel = copy(rexcel)           # convert the xlrd object to an xlwt one via xlutils' copy
    table1 = excel.get_sheet(0)    # sheet to write into
    cols = findStation(table, j)   # `table` is the sheet currently being scanned (set below)
    if cols is None:
        table1.write(k, 0, stats[j])  # write(row, column, value)
        table1.write(k, 1, 'none')
        table1.write(k, 2, 'none')
    else:
        for cnums in range(0, len(cols)):
            table1.write(k, cnums, cols[cnums])
    excel.save(stats[j] + '.xls')  # saving overwrites the original file

for j in range(0, len(stats)):
    wb.save(stats[j] + '.xls')     # initialise each station workbook

for num in range(0, count):
    table = data.sheet_by_index(num)
    k = num + 1                    # one output row per source sheet
    for j in range(0, len(stats)):
        appendFile(j, k)
print('end')
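To spot-check the result, one of the generated per-station files can be read back with xlrd ('安塞.xls' is produced by the script above):

import xlrd

book = xlrd.open_workbook('安塞.xls')
sheet = book.sheet_by_index(0)
for r in range(sheet.nrows):
    print(sheet.row_values(r))  # one appended row per source sheet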
2. Exploration before implementing step 3
Reading a single cell from one sheet
import xlrd

data = xlrd.open_workbook('1.xls')
table = data.sheet_by_index(0)
cell_A1 = table.cell(3, 1).value  # cell at row 3, column 1 (0-based)
if cell_A1 == '河曲':
    print('yes')
print(cell_A1)
Finding a target value by traversal
import xlrd

data = xlrd.open_workbook('1.xls')  # open the workbook
table = data.sheet_by_index(0)      # get the sheet
rows = table.nrows                  # number of rows
for row in range(0, rows):
    cell_Col1 = table.col(1)[row].value  # value in column 1
    if cell_Col1 == '安塞':
        ansai = table.col(4)[row].value
        print(ansai)
print('end')
Traversing all stations
import xlrd

data = xlrd.open_workbook('1.xls')  # open the workbook
table = data.sheet_by_index(0)      # get the sheet
stats = ['枣园', '安塞', '杏河', '延安', '甘谷驿']

def findStation(table, i):
    rows = table.nrows
    for row in range(0, rows):
        cell_Col1 = table.col(1)[row].value  # value in column 1
        if cell_Col1 == stats[i]:
            station = [cell_Col1, table.col(2)[row].value, table.col(4)[row].value]
            return station

zaoyuan = findStation(table, 0)
ansai = findStation(table, 1)
xinghe = findStation(table, 2)
yanan = findStation(table, 3)
ganguyi = findStation(table, 4)
print(zaoyuan, ansai, xinghe, yanan, ganguyi)
print('end')
Writing one file per station
import xlrd
import csv
from datetime import datetime

data = xlrd.open_workbook('1.xls')  # open the workbook
table = data.sheet_by_index(0)      # get the sheet
stats = ['枣园', '安塞', '杏河', '延安', '甘谷驿']

def findStation(table, i):
    rows = table.nrows
    for row in range(0, rows):
        cell_Col1 = table.col(1)[row].value  # value in column 1
        if cell_Col1 == stats[i]:
            # Convert the Excel serial date into a readable timestamp
            time_value = xlrd.xldate_as_tuple(table.col(2)[row].value, data.datemode)
            date_value = datetime(*time_value).strftime('%Y-%m-%d %H:%M')
            station = [cell_Col1, date_value, table.col(4)[row].value]
            return station

def csvFile(j):
    station = findStation(table, j)
    if station is None:
        with open(stats[j] + '.csv', 'w') as f:
            f.write('none')
    else:
        with open(stats[j] + '.csv', 'w') as csvfile:
            fieldnames = ['station', 'time', 'flows']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            dic = {'station': station[0], 'time': station[1], 'flows': station[2]}
            writer.writerow(dic)

for j in range(0, len(stats)):
    csvFile(j)
print('end')
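The xldate_as_tuple call above is what turns Excel's serial date numbers into something readable. A standalone check of the conversion, using 43059.5 as an arbitrary example value (noon on 2017-11-20 in the default 1900-based date system):

import xlrd
from datetime import datetime

t = xlrd.xldate_as_tuple(43059.5, 0)  # datemode 0 = 1900-based dates
print(datetime(*t).strftime('%Y-%m-%d %H:%M'))  # -> 2017-11-20 12:00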
Appending rows
import xlrd
import xlwt
from xlutils3.copy import copy
from datetime import datetime

data = xlrd.open_workbook('1.xls')  # open the workbook
stats = ['枣园', '安塞', '杏河', '延安', '甘谷驿']

# Create the header row
wb = xlwt.Workbook()
ws = wb.add_sheet('汇总')
ws.write(0, 0, '站点')
ws.write(0, 1, '时间')
ws.write(0, 2, '径流')

# Find the given station; arguments are the sheet being scanned and the station index
def findStation(table, i):
    rows = table.nrows
    for row in range(0, rows):
        cell_Col1 = table.col(1)[row].value  # value in column 1
        if cell_Col1 == stats[i]:
            time_value = xlrd.xldate_as_tuple(table.col(2)[row].value, data.datemode)
            date_value = datetime(*time_value).strftime('%Y-%m-%d %H:%M')
            station = [cell_Col1, date_value, table.col(4)[row].value]
            return station

# Append one row of data
def appendFile(j, k):
    style1 = xlwt.easyxf(num_format_str='YYYY/M/D H:MM')  # date display format
    rexcel = xlrd.open_workbook(stats[j] + '.xls')  # read the existing file with xlrd
    excel = copy(rexcel)         # convert the xlrd object to an xlwt one via xlutils' copy
    table1 = excel.get_sheet(0)  # sheet to write into
    station = findStation(table, j)
    if station is None:
        table1.write(k, 0, stats[j])  # write(row, column, value)
        table1.write(k, 1, 'none')
        table1.write(k, 2, 'none')
    else:
        table1.write(k, 0, stats[j])
        table1.write(k, 1, station[1], style1)
        table1.write(k, 2, station[2])
    excel.save(stats[j] + '.xls')  # saving overwrites the original file

num = 0
table = data.sheet_by_index(num)
k = 1
for j in range(0, len(stats)):
    wb.save(stats[j] + '.xls')  # initialise each station workbook with the header
    appendFile(j, k)
print('end')
Because the source data format is not entirely consistent, the final compromise is the version shown in the overview.
Update (Nov 20)
Added access through proxy IPs
# get_xls.py
import os
import random
import time

import requests
from bs4 import BeautifulSoup

i = 0
time_start = time.time()
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/'
                  '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'}

# Fetch one article page through a random proxy and return its title and content node.
def xlsGet(url, proxys):
    result = {}
    res = requests.get(url, headers=headers, proxies=random.choice(proxys))
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#mainNewsTitles')[0].text.strip()
    result['artical'] = soup.select('#mainNewsContent')[0]
    return result

# Build a pool of at least `num` proxy entries from host.txt (one "ip<TAB>port" per line).
def proxypool(num):
    n = 1
    os.chdir(r'G:/code_py/proxyIP')
    fp = open('host.txt', 'r')
    proxys = list()
    ips = fp.readlines()
    fp.close()
    while n < num:
        for p in ips:
            ip = p.strip('\n').split('\t')
            proxy = 'http://' + ip[0] + ':' + ip[1]  # 'http:\\' in the original is not a valid scheme
            proxys.append({'http': proxy})  # requests keys the proxies dict by URL scheme
            n += 1
    return proxys

# Collect article links from the list pages in [startpage, endpage).
def linkGet(startpage, endpage, proxys):
    global i
    c = []
    for j in range(startpage, endpage):
        url = 'http://www.sxmwr.gov.cn/List.action?classfyid=61&currentPage=' + str(j) + '&turnHref=a0909d3e2591f4bad185dcc317ed1305ecd1ce6e561ebba08b16ce55831d5ab8a287dfa4a19beda0c6bc579659449f995714cf0f1d6601099d111aa8b2a942c122565fccc10321a12fa3875b48a46949d5c36fb26f106d16e54a688e17199bd5c4e6b68a622d3b2792ba2c781a2d4e17fffe1f9e8c4d6cdf6348d9a80dbcf0bdaea67d6bcc745b348c230d59c63a6576131bcee30514c0527ad244d7662c1922'
        res = requests.get(url, headers=headers, proxies=random.choice(proxys))
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        for link in soup.select('.articlelist'):
            for a in link.select('a'):
                c.append([a['href']])
                i = i + 1
    return c

# Save each article's content as <index>.xls, retrying failed pages with another proxy.
def xlsWrite(c, proxys):
    for k in range(0, len(c)):
        url = 'http://www.sxmwr.gov.cn/' + str(c[k][0])
        try:
            res = requests.get(url, headers=headers, proxies=random.choice(proxys))
            while res.status_code != 200:  # retry until the page loads
                res = requests.get(url, headers=headers, proxies=random.choice(proxys))
            dic = xlsGet(url, proxys)
        except requests.RequestException:
            continue  # give up on this link and move on
        s = str(dic['artical'])
        os.chdir(r'G:/code_py/xls')
        with open(str(k) + '.xls', 'w+', encoding='utf-8') as f:
            f.write(s)
        print(k)

proxyPool = proxypool(50)
c = linkGet(0, 62, proxyPool)
xlsWrite(c, proxyPool)
time_end = time.time()
print('Done, total time: ' + str(time_end - time_start) + ' s')
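For reference, the proxies mapping that requests expects is keyed by URL scheme. A minimal standalone check, where the proxy address is a placeholder:

import requests

proxies = {'http': 'http://127.0.0.1:8080'}  # placeholder proxy address
r = requests.get('http://www.sxmwr.gov.cn/', proxies=proxies, timeout=10)
print(r.status_code)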