
PythonLog171118

Author: 迟客 | Published 2017-11-18 10:43


    1. Overview

    Step 1: Scrape the web tables and save them as local .xls files

    # get_xls.py
    import time

    import requests
    from bs4 import BeautifulSoup

    i = 0  # global running index so that every saved file gets a unique name
    time_start = time.time()


    def linkGet(startpage, endpage):
        """Collect article links (each with a unique index) from the list pages."""
        global i
        c = []
        for j in range(startpage, endpage):
            url = 'http://www.sxmwr.gov.cn/List.action?classfyid=61&currentPage=' + str(
                j) + '&turnHref=a0909d3e2591f4bad185dcc317ed1305ecd1ce6e561ebba08b16ce55831d5ab8a287dfa4a19beda0c6bc579659449f995714cf0f1d6601099d111aa8b2a942c122565fccc10321a12fa3875b48a46949d5c36fb26f106d16e54a688e17199bd5c4e6b68a622d3b2792ba2c781a2d4e17fffe1f9e8c4d6cdf6348d9a80dbcf0bdaea67d6bcc745b348c230d59c63a6576131bcee30514c0527ad244d7662c1922'
            res = requests.get(url)
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, 'html.parser')
            # every .articlelist block holds the <a> tags pointing at article pages
            for link in soup.select('.articlelist'):
                for a in link.select('a'):
                    c.append((a['href'], i))
                    i = i + 1
        return c


    def xlsGet(url):
        """Fetch one article page and return its title and content node."""
        result = {}
        res = requests.get(url)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        result['title'] = soup.select('#mainNewsTitles')[0].text.strip()
        result['artical'] = soup.select('#mainNewsContent')[0]
        return result


    def xlsWrite(c):
        """Save each article's HTML content as <index>.xls (Excel opens HTML tables)."""
        for href, idx in c:
            dic = xlsGet('http://www.sxmwr.gov.cn/' + href)
            # naming by the global index keeps later batches from overwriting earlier files
            with open(str(idx) + '.xls', 'w', encoding='utf-8') as f:
                f.write(str(dic['artical']))


    c1 = linkGet(0, 20)  # pages holding the 2014 data
    c2 = linkGet(20, 40)
    c3 = linkGet(40, 60)
    c4 = linkGet(60, 62)
    xlsWrite(c1)
    xlsWrite(c2)
    xlsWrite(c3)
    xlsWrite(c4)
    time_end = time.time()
    print('Done, total time ' + str(time_end - time_start) + ' s')
    
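    Note that xlsGet saves the raw HTML of the #mainNewsContent node under an .xls extension; Excel opens HTML tables transparently, which is what step 2 relies on. To sanity-check a downloaded file before merging, the table can be parsed back into a DataFrame. A minimal sketch, assuming pandas and lxml are installed and that the script has produced a 0.xls:

    # check_one.py: verify that a saved "xls" (really HTML) parses as a table
    import pandas as pd

    # pandas.read_html extracts every <table> found in the HTML fragment
    tables = pd.read_html('0.xls', encoding='utf-8')
    print(len(tables), 'table(s) found')
    print(tables[0].head())  # first rows of the first table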

    Step 2: Merge the individual .xls files into one multi-sheet workbook (implemented in VBA)

    Sub CombineWorkbooks()
        Dim FilesToOpen
        Dim x As Integer
        Application.ScreenUpdating = False
        On Error GoTo errhandler
        FilesToOpen = Application.GetOpenFilename _
            (FileFilter:="Microsoft Excel files (*.xls), *.xls", _
            MultiSelect:=True, Title:="Files to merge")
        ' GetOpenFilename returns False (a Boolean) when the dialog is cancelled;
        ' the comparison is case-sensitive, so it must be "Boolean", not "boolean"
        If TypeName(FilesToOpen) = "Boolean" Then
            MsgBox "No files selected"
            Exit Sub
        End If
        x = 1
        While x <= UBound(FilesToOpen)
            Set wk = Workbooks.Open(Filename:=FilesToOpen(x))
            ' move every sheet of the opened workbook to the end of this one
            wk.Sheets.Move after:=ThisWorkbook.Sheets _
                (ThisWorkbook.Sheets.Count)
            x = x + 1
        Wend
        MsgBox "Merge completed!"
        Exit Sub
    errhandler:
        MsgBox Err.Description
    End Sub
    
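    For those who prefer to stay in Python, roughly the same merge can be sketched with pandas. This is an assumption on my part rather than the method used in this post: it needs pandas with lxml and openpyxl, relies on the files being the HTML tables produced in step 1, and writes an .xlsx instead of an .xls:

    # merge_sketch.py: hedged Python alternative to the VBA macro
    import glob

    import pandas as pd

    with pd.ExcelWriter('combined.xlsx') as writer:
        for path in sorted(glob.glob('[0-9]*.xls')):
            # each "xls" is an HTML fragment; read_html recovers its table
            df = pd.read_html(path, encoding='utf-8')[0]
            # sheet names must be unique and at most 31 characters long
            df.to_excel(writer, sheet_name=path[:-4][:31], index=False)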

    Step 3: Extract the records for the target stations

    # xlsWrite.py
    import xlrd
    import xlwt
    from xlutils3.copy import copy  # xlutils3: Python 3 port of xlutils

    data = xlrd.open_workbook('1.xls')  # the merged workbook
    count = len(data.sheets())  # number of sheets
    stats = ['枣园', '安塞',
             '杏河', '延安', '甘谷驿']
    # seed workbook: every station file starts out as a copy of this empty sheet
    wb = xlwt.Workbook()
    ws = wb.add_sheet('汇总')


    # look up a station row; arguments are the sheet being read and the station index
    def findStation(table, i):
        rows = table.nrows
        for row in range(rows):
            cell_Col1 = table.col(1)[row].value  # station name column
            if cell_Col1 == stats[i]:
                # note: col(0)[1] reads a fixed cell (the sheet's date), not the current row
                station = [cell_Col1, table.col(0)[1].value, table.col(2)[row].value,
                           table.col(3)[row].value, table.col(4)[row].value,
                           table.col(5)[row].value, table.col(6)[row].value]
                return station


    # append one row of data to a station file
    def appendFile(j, k):
        rexcel = xlrd.open_workbook(stats[j] + '.xls')  # read the station file with xlrd
        excel = copy(rexcel)  # turn the xlrd object into a writable xlwt object
        table1 = excel.get_sheet(0)  # the sheet to write into
        cols = findStation(table, j)  # look the station up once
        if cols is None:
            table1.write(k, 0, stats[j])  # arguments: row, column, value
            table1.write(k, 1, 'none')
            table1.write(k, 2, 'none')
        else:
            for cnums in range(len(cols)):
                table1.write(k, cnums, cols[cnums])
        excel.save(stats[j] + '.xls')  # saving overwrites the original file


    # seed one file per station
    for j in range(len(stats)):
        wb.save(stats[j] + '.xls')

    # walk every sheet of the merged workbook, appending one row per station
    for num in range(count):
        table = data.sheet_by_index(num)
        k = num + 1  # row 0 is left free for a header
        for j in range(len(stats)):
            appendFile(j, k)

    print('end')
    
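    After the run, each station file (枣园.xls, 安塞.xls, and so on) holds a single 汇总 sheet with one appended row per source sheet. A quick hedged check of one output file with xlrd:

    # peek.py: print the rows written into one station file
    import xlrd

    book = xlrd.open_workbook('枣园.xls')
    sheet = book.sheet_by_index(0)
    for r in range(sheet.nrows):
        print(sheet.row_values(r))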

    2. Exploration before implementing step 3

    How to read the value of a single cell

    import xlrd

    data = xlrd.open_workbook('1.xls')
    table = data.sheet_by_index(0)
    cell_B4 = table.cell(3, 1).value  # row index 3, column index 1, i.e. cell B4
    if cell_B4 == '河曲':
        print('yes')
    print(cell_B4)
    

    How to find a target value by iterating over the rows

    import xlrd

    data = xlrd.open_workbook('1.xls')  # open the workbook
    table = data.sheet_by_index(0)  # get the sheet
    rows = table.nrows  # number of rows
    for row in range(rows):
        cell_Col1 = table.col(1)[row].value  # value in column 1 of this row
        if cell_Col1 == '安塞':
            ansai = table.col(4)[row].value
            print(ansai)
    print('end')
    

    Iterating over all stations

    import xlrd

    data = xlrd.open_workbook('1.xls')  # open the workbook
    table = data.sheet_by_index(0)  # get the sheet
    stats = ['枣园', '安塞',
             '杏河', '延安', '甘谷驿']


    def findStation(table, i):
        rows = table.nrows
        for row in range(rows):
            cell_Col1 = table.col(1)[row].value  # station name column
            if cell_Col1 == stats[i]:
                station = [cell_Col1, table.col(2)[row].value, table.col(4)[row].value]
                return station


    zaoyuan = findStation(table, 0)
    ansai = findStation(table, 1)
    xinghe = findStation(table, 2)
    yanan = findStation(table, 3)
    ganguyi = findStation(table, 4)

    print(zaoyuan, ansai, xinghe, yanan, ganguyi)
    print('end')
    

    Writing one file per station

    import csv
    from datetime import datetime

    import xlrd

    data = xlrd.open_workbook('1.xls')  # open the workbook
    table = data.sheet_by_index(0)  # get the sheet
    stats = ['枣园', '安塞',
             '杏河', '延安', '甘谷驿']


    def findStation(table, i):
        rows = table.nrows
        for row in range(rows):
            cell_Col1 = table.col(1)[row].value  # station name column
            if cell_Col1 == stats[i]:
                # convert the Excel serial date into a formatted string
                time_value = xlrd.xldate_as_tuple(table.col(2)[row].value, data.datemode)
                date_value = datetime(*time_value).strftime('%Y-%m-%d %H:%M')
                station = [cell_Col1, date_value, table.col(4)[row].value]
                return station


    def csvFile(j):
        station = findStation(table, j)  # look the station up once
        if station is None:
            with open(stats[j] + '.csv', 'w') as f:
                f.write('none')
        else:
            with open(stats[j] + '.csv', 'w', newline='') as csvfile:
                fieldnames = ['station', 'time', 'flows']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerow({'station': station[0], 'time': station[1],
                                 'flows': station[2]})


    for j in range(len(stats)):
        csvFile(j)

    print('end')
    

    Appending rows

    import xlrd
    import xlwt
    from xlutils3.copy import copy  # xlutils3: Python 3 port of xlutils
    from datetime import datetime

    data = xlrd.open_workbook('1.xls')  # open the merged workbook
    stats = ['枣园', '安塞',
             '杏河', '延安', '甘谷驿']
    # create the header row
    wb = xlwt.Workbook()
    ws = wb.add_sheet('汇总')
    ws.write(0, 0, '站点')
    ws.write(0, 1, '时间')
    ws.write(0, 2, '径流')


    # look up a station row; arguments are the sheet being read and the station index
    def findStation(table, i):
        rows = table.nrows
        for row in range(rows):
            cell_Col1 = table.col(1)[row].value  # station name column
            if cell_Col1 == stats[i]:
                time_value = xlrd.xldate_as_tuple(table.col(2)[row].value, data.datemode)
                date_value = datetime(*time_value).strftime('%Y-%m-%d %H:%M')
                station = [cell_Col1, date_value, table.col(4)[row].value]
                return station


    # append one row of data
    def appendFile(j, k):
        style1 = xlwt.easyxf(num_format_str='YYYY/M/D H:MM')  # date display format
        rexcel = xlrd.open_workbook(stats[j] + '.xls')  # read the station file with xlrd
        excel = copy(rexcel)  # turn the xlrd object into a writable xlwt object
        table1 = excel.get_sheet(0)  # the sheet to write into
        station = findStation(table, j)  # look the station up once
        if station is None:
            table1.write(k, 0, stats[j])  # arguments: row, column, value
            table1.write(k, 1, 'none')
            table1.write(k, 2, 'none')
        else:
            table1.write(k, 0, stats[j])
            table1.write(k, 1, station[1], style1)
            table1.write(k, 2, station[2])
        excel.save(stats[j] + '.xls')  # saving overwrites the original file


    num = 0
    table = data.sheet_by_index(num)
    k = 1
    for j in range(len(stats)):
        wb.save(stats[j] + '.xls')  # seed the station file with the header sheet
        appendFile(j, k)

    print('end')
    
    

    Because the source data formats are not entirely consistent, the final compromise is the approach shown in the overview.



    Update (Nov 20)

    Added proxy IP access

    # get_xls.py
    import os
    import random
    import time

    import requests
    from bs4 import BeautifulSoup

    time_start = time.time()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/'
                      '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'}


    def xlsGet(url, proxys):
        """Fetch one article page through a random proxy; return title and content node."""
        result = {}
        res = requests.get(url, headers=headers, proxies=random.choice(proxys))
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        result['title'] = soup.select('#mainNewsTitles')[0].text.strip()
        result['artical'] = soup.select('#mainNewsContent')[0]
        return result


    def proxypool(num):
        """Build a pool of num proxy dicts from host.txt (one IP<TAB>port per line)."""
        n = 1
        os.chdir(r'G:/code_py/proxyIP')
        proxys = list()
        with open('host.txt', 'r') as fp:
            ips = fp.readlines()
        # cycles through the list again if host.txt has fewer than num entries
        while n < num:
            for p in ips:
                ip = p.strip('\n').split('\t')
                proxy = 'http://' + ip[0] + ':' + ip[1]  # was 'http:\\', an invalid scheme
                proxys.append({'http': proxy})  # requests expects the scheme as the key
                n += 1
                if n >= num:
                    break
        return proxys


    def linkGet(startpage, endpage, proxys):
        c = []
        for j in range(startpage, endpage):
            url = 'http://www.sxmwr.gov.cn/List.action?classfyid=61&currentPage=' + str(j) + '&turnHref=a0909d3e2591f4bad185dcc317ed1305ecd1ce6e561ebba08b16ce55831d5ab8a287dfa4a19beda0c6bc579659449f995714cf0f1d6601099d111aa8b2a942c122565fccc10321a12fa3875b48a46949d5c36fb26f106d16e54a688e17199bd5c4e6b68a622d3b2792ba2c781a2d4e17fffe1f9e8c4d6cdf6348d9a80dbcf0bdaea67d6bcc745b348c230d59c63a6576131bcee30514c0527ad244d7662c1922'
            res = requests.get(url, headers=headers, proxies=random.choice(proxys))
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, 'html.parser')
            for link in soup.select('.articlelist'):
                for a in link.select('a'):
                    c.append(a['href'])
        return c


    def xlsWrite(c, proxys):
        os.chdir(r'G:/code_py/xls')
        for k in range(len(c)):
            url = 'http://www.sxmwr.gov.cn/' + str(c[k])
            try:
                res = requests.get(url, headers=headers, proxies=random.choice(proxys))
                while res.status_code != 200:  # retry through another proxy until the page loads
                    res = requests.get(url, headers=headers, proxies=random.choice(proxys))
                dic = xlsGet(url, proxys)
            except requests.RequestException:
                continue
            with open(str(k) + '.xls', 'w', encoding='utf-8') as f:
                f.write(str(dic['artical']))
            print(k)


    proxyPool = proxypool(50)
    c = linkGet(0, 62, proxyPool)
    xlsWrite(c, proxyPool)
    time_end = time.time()
    print('Done, total time ' + str(time_end - time_start) + ' s')
    
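    proxypool assumes that host.txt holds one proxy per line as a tab-separated IP and port. Free proxies die quickly, so it can help to filter the pool before crawling. A minimal sketch under the same assumptions, reusing proxypool() from the script above:

    # filter_pool.py: keep only proxies that can actually reach the target site
    import requests


    def alive(proxys, timeout=5):
        """Return the subset of proxy dicts that successfully fetch the front page."""
        good = []
        for p in proxys:
            try:
                r = requests.get('http://www.sxmwr.gov.cn/', proxies=p, timeout=timeout)
                if r.status_code == 200:
                    good.append(p)
            except requests.RequestException:
                pass
        return good


    proxyPool = alive(proxypool(50))  # drop dead entries before linkGet/xlsWrite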
