
PythonLog171118

Author: 迟客 | Published 2017-11-18 10:43


    1. Overview

    Step 1: Scrape the web tables and save them as local .xls files

    # get_xls.py
    import time

    import requests
    from bs4 import BeautifulSoup

    i = 0  # global running index so that every saved file gets a unique name
    time_start = time.time()


    def linkGet(startpage, endpage):
        """Collect article links (each with a unique index) from the list pages."""
        global i
        c = []
        for j in range(startpage, endpage):
            url = 'http://www.sxmwr.gov.cn/List.action?classfyid=61&currentPage=' + str(
                j) + '&turnHref=a0909d3e2591f4bad185dcc317ed1305ecd1ce6e561ebba08b16ce55831d5ab8a287dfa4a19beda0c6bc579659449f995714cf0f1d6601099d111aa8b2a942c122565fccc10321a12fa3875b48a46949d5c36fb26f106d16e54a688e17199bd5c4e6b68a622d3b2792ba2c781a2d4e17fffe1f9e8c4d6cdf6348d9a80dbcf0bdaea67d6bcc745b348c230d59c63a6576131bcee30514c0527ad244d7662c1922'
            res = requests.get(url)
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, 'html.parser')
            # every .articlelist block holds the <a> tags pointing at article pages
            for link in soup.select('.articlelist'):
                for a in link.select('a'):
                    c.append((a['href'], i))
                    i = i + 1
        return c


    def xlsGet(url):
        """Fetch one article page and return its title and content node."""
        result = {}
        res = requests.get(url)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        result['title'] = soup.select('#mainNewsTitles')[0].text.strip()
        result['artical'] = soup.select('#mainNewsContent')[0]
        return result


    def xlsWrite(c):
        """Save each article's HTML content as <index>.xls (Excel opens HTML tables)."""
        for href, idx in c:
            dic = xlsGet('http://www.sxmwr.gov.cn/' + href)
            # naming by the global index keeps later batches from overwriting earlier files
            with open(str(idx) + '.xls', 'w', encoding='utf-8') as f:
                f.write(str(dic['artical']))


    c1 = linkGet(0, 20)  # pages holding the 2014 data
    c2 = linkGet(20, 40)
    c3 = linkGet(40, 60)
    c4 = linkGet(60, 62)
    xlsWrite(c1)
    xlsWrite(c2)
    xlsWrite(c3)
    xlsWrite(c4)
    time_end = time.time()
    print('Done, total time ' + str(time_end - time_start) + ' s')
    
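    Note that xlsGet saves the raw HTML of the #mainNewsContent node under an .xls extension; Excel opens HTML tables transparently, which is what step 2 relies on. To sanity-check a downloaded file before merging, the table can be parsed back into a DataFrame. A minimal sketch, assuming pandas and lxml are installed and that the script has produced a 0.xls:

    # check_one.py: verify that a saved "xls" (really HTML) parses as a table
    import pandas as pd

    # pandas.read_html extracts every <table> found in the HTML fragment
    tables = pd.read_html('0.xls', encoding='utf-8')
    print(len(tables), 'table(s) found')
    print(tables[0].head())  # first rows of the first table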

    Step 2: Merge the individual .xls files into one multi-sheet workbook (implemented in VBA)

    Sub CombineWorkbooks()
        Dim FilesToOpen
        Dim x As Integer
        Application.ScreenUpdating = False
        On Error GoTo errhandler
        FilesToOpen = Application.GetOpenFilename _
            (FileFilter:="Microsoft Excel files (*.xls), *.xls", _
            MultiSelect:=True, Title:="Files to merge")
        ' GetOpenFilename returns False (a Boolean) when the dialog is cancelled;
        ' the comparison is case-sensitive, so it must be "Boolean", not "boolean"
        If TypeName(FilesToOpen) = "Boolean" Then
            MsgBox "No files selected"
            Exit Sub
        End If
        x = 1
        While x <= UBound(FilesToOpen)
            Set wk = Workbooks.Open(Filename:=FilesToOpen(x))
            ' move every sheet of the opened workbook to the end of this one
            wk.Sheets.Move after:=ThisWorkbook.Sheets _
                (ThisWorkbook.Sheets.Count)
            x = x + 1
        Wend
        MsgBox "Merge completed!"
        Exit Sub
    errhandler:
        MsgBox Err.Description
    End Sub
    
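    For those who prefer to stay in Python, roughly the same merge can be sketched with pandas. This is an assumption on my part rather than the method used in this post: it needs pandas with lxml and openpyxl, relies on the files being the HTML tables produced in step 1, and writes an .xlsx instead of an .xls:

    # merge_sketch.py: hedged Python alternative to the VBA macro
    import glob

    import pandas as pd

    with pd.ExcelWriter('combined.xlsx') as writer:
        for path in sorted(glob.glob('[0-9]*.xls')):
            # each "xls" is an HTML fragment; read_html recovers its table
            df = pd.read_html(path, encoding='utf-8')[0]
            # sheet names must be unique and at most 31 characters long
            df.to_excel(writer, sheet_name=path[:-4][:31], index=False)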

    Step 3: Extract the records for the target stations

    # xlsWrite.py
    import xlrd
    import xlwt
    from xlutils3.copy import copy  # xlutils3: Python 3 port of xlutils

    data = xlrd.open_workbook('1.xls')  # the merged workbook
    count = len(data.sheets())  # number of sheets
    stats = ['枣园', '安塞',
             '杏河', '延安', '甘谷驿']
    # seed workbook: every station file starts out as a copy of this empty sheet
    wb = xlwt.Workbook()
    ws = wb.add_sheet('汇总')


    # look up a station row; arguments are the sheet being read and the station index
    def findStation(table, i):
        rows = table.nrows
        for row in range(rows):
            cell_Col1 = table.col(1)[row].value  # station name column
            if cell_Col1 == stats[i]:
                # note: col(0)[1] reads a fixed cell (the sheet's date), not the current row
                station = [cell_Col1, table.col(0)[1].value, table.col(2)[row].value,
                           table.col(3)[row].value, table.col(4)[row].value,
                           table.col(5)[row].value, table.col(6)[row].value]
                return station


    # append one row of data to a station file
    def appendFile(j, k):
        rexcel = xlrd.open_workbook(stats[j] + '.xls')  # read the station file with xlrd
        excel = copy(rexcel)  # turn the xlrd object into a writable xlwt object
        table1 = excel.get_sheet(0)  # the sheet to write into
        cols = findStation(table, j)  # look the station up once
        if cols is None:
            table1.write(k, 0, stats[j])  # arguments: row, column, value
            table1.write(k, 1, 'none')
            table1.write(k, 2, 'none')
        else:
            for cnums in range(len(cols)):
                table1.write(k, cnums, cols[cnums])
        excel.save(stats[j] + '.xls')  # saving overwrites the original file


    # seed one file per station
    for j in range(len(stats)):
        wb.save(stats[j] + '.xls')

    # walk every sheet of the merged workbook, appending one row per station
    for num in range(count):
        table = data.sheet_by_index(num)
        k = num + 1  # row 0 is left free for a header
        for j in range(len(stats)):
            appendFile(j, k)

    print('end')
    
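    After the run, each station file (枣园.xls, 安塞.xls, and so on) holds a single 汇总 sheet with one appended row per source sheet. A quick hedged check of one output file with xlrd:

    # peek.py: print the rows written into one station file
    import xlrd

    book = xlrd.open_workbook('枣园.xls')
    sheet = book.sheet_by_index(0)
    for r in range(sheet.nrows):
        print(sheet.row_values(r))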

    2. Exploration before implementing step 3

    How to read the value of a single cell

    import xlrd

    data = xlrd.open_workbook('1.xls')
    table = data.sheet_by_index(0)
    cell_B4 = table.cell(3, 1).value  # row index 3, column index 1, i.e. cell B4
    if cell_B4 == '河曲':
        print('yes')
    print(cell_B4)
    

    How to find a target value by iterating over the rows

    import xlrd

    data = xlrd.open_workbook('1.xls')  # open the workbook
    table = data.sheet_by_index(0)  # get the sheet
    rows = table.nrows  # number of rows
    for row in range(rows):
        cell_Col1 = table.col(1)[row].value  # value in column 1 of this row
        if cell_Col1 == '安塞':
            ansai = table.col(4)[row].value
            print(ansai)
    print('end')
    

    Iterating over all stations

    import xlrd

    data = xlrd.open_workbook('1.xls')  # open the workbook
    table = data.sheet_by_index(0)  # get the sheet
    stats = ['枣园', '安塞',
             '杏河', '延安', '甘谷驿']


    def findStation(table, i):
        rows = table.nrows
        for row in range(rows):
            cell_Col1 = table.col(1)[row].value  # station name column
            if cell_Col1 == stats[i]:
                station = [cell_Col1, table.col(2)[row].value, table.col(4)[row].value]
                return station


    zaoyuan = findStation(table, 0)
    ansai = findStation(table, 1)
    xinghe = findStation(table, 2)
    yanan = findStation(table, 3)
    ganguyi = findStation(table, 4)

    print(zaoyuan, ansai, xinghe, yanan, ganguyi)
    print('end')
    

    Writing one file per station

    import csv
    from datetime import datetime

    import xlrd

    data = xlrd.open_workbook('1.xls')  # open the workbook
    table = data.sheet_by_index(0)  # get the sheet
    stats = ['枣园', '安塞',
             '杏河', '延安', '甘谷驿']


    def findStation(table, i):
        rows = table.nrows
        for row in range(rows):
            cell_Col1 = table.col(1)[row].value  # station name column
            if cell_Col1 == stats[i]:
                # convert the Excel serial date into a formatted string
                time_value = xlrd.xldate_as_tuple(table.col(2)[row].value, data.datemode)
                date_value = datetime(*time_value).strftime('%Y-%m-%d %H:%M')
                station = [cell_Col1, date_value, table.col(4)[row].value]
                return station


    def csvFile(j):
        station = findStation(table, j)  # look the station up once
        if station is None:
            with open(stats[j] + '.csv', 'w') as f:
                f.write('none')
        else:
            with open(stats[j] + '.csv', 'w', newline='') as csvfile:
                fieldnames = ['station', 'time', 'flows']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerow({'station': station[0], 'time': station[1],
                                 'flows': station[2]})


    for j in range(len(stats)):
        csvFile(j)

    print('end')
    

    Appending rows

    import xlrd
    import xlwt
    from xlutils3.copy import copy  # xlutils3: Python 3 port of xlutils
    from datetime import datetime

    data = xlrd.open_workbook('1.xls')  # open the merged workbook
    stats = ['枣园', '安塞',
             '杏河', '延安', '甘谷驿']
    # create the header row
    wb = xlwt.Workbook()
    ws = wb.add_sheet('汇总')
    ws.write(0, 0, '站点')
    ws.write(0, 1, '时间')
    ws.write(0, 2, '径流')


    # look up a station row; arguments are the sheet being read and the station index
    def findStation(table, i):
        rows = table.nrows
        for row in range(rows):
            cell_Col1 = table.col(1)[row].value  # station name column
            if cell_Col1 == stats[i]:
                time_value = xlrd.xldate_as_tuple(table.col(2)[row].value, data.datemode)
                date_value = datetime(*time_value).strftime('%Y-%m-%d %H:%M')
                station = [cell_Col1, date_value, table.col(4)[row].value]
                return station


    # append one row of data
    def appendFile(j, k):
        style1 = xlwt.easyxf(num_format_str='YYYY/M/D H:MM')  # date display format
        rexcel = xlrd.open_workbook(stats[j] + '.xls')  # read the station file with xlrd
        excel = copy(rexcel)  # turn the xlrd object into a writable xlwt object
        table1 = excel.get_sheet(0)  # the sheet to write into
        station = findStation(table, j)  # look the station up once
        if station is None:
            table1.write(k, 0, stats[j])  # arguments: row, column, value
            table1.write(k, 1, 'none')
            table1.write(k, 2, 'none')
        else:
            table1.write(k, 0, stats[j])
            table1.write(k, 1, station[1], style1)
            table1.write(k, 2, station[2])
        excel.save(stats[j] + '.xls')  # saving overwrites the original file


    num = 0
    table = data.sheet_by_index(num)
    k = 1
    for j in range(len(stats)):
        wb.save(stats[j] + '.xls')  # seed the station file with the header sheet
        appendFile(j, k)

    print('end')
    
    

    Because the source data formats are not entirely consistent, the final compromise is the approach shown in the overview.



    Update (Nov 20)

    Added proxy IP access

    # get_xls.py
    import os
    import random
    import time

    import requests
    from bs4 import BeautifulSoup

    time_start = time.time()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/'
                      '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'}


    def xlsGet(url, proxys):
        """Fetch one article page through a random proxy; return title and content node."""
        result = {}
        res = requests.get(url, headers=headers, proxies=random.choice(proxys))
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        result['title'] = soup.select('#mainNewsTitles')[0].text.strip()
        result['artical'] = soup.select('#mainNewsContent')[0]
        return result


    def proxypool(num):
        """Build a pool of num proxy dicts from host.txt (one IP<TAB>port per line)."""
        n = 1
        os.chdir(r'G:/code_py/proxyIP')
        proxys = list()
        with open('host.txt', 'r') as fp:
            ips = fp.readlines()
        # cycles through the list again if host.txt has fewer than num entries
        while n < num:
            for p in ips:
                ip = p.strip('\n').split('\t')
                proxy = 'http://' + ip[0] + ':' + ip[1]  # was 'http:\\', an invalid scheme
                proxys.append({'http': proxy})  # requests expects the scheme as the key
                n += 1
                if n >= num:
                    break
        return proxys


    def linkGet(startpage, endpage, proxys):
        c = []
        for j in range(startpage, endpage):
            url = 'http://www.sxmwr.gov.cn/List.action?classfyid=61&currentPage=' + str(j) + '&turnHref=a0909d3e2591f4bad185dcc317ed1305ecd1ce6e561ebba08b16ce55831d5ab8a287dfa4a19beda0c6bc579659449f995714cf0f1d6601099d111aa8b2a942c122565fccc10321a12fa3875b48a46949d5c36fb26f106d16e54a688e17199bd5c4e6b68a622d3b2792ba2c781a2d4e17fffe1f9e8c4d6cdf6348d9a80dbcf0bdaea67d6bcc745b348c230d59c63a6576131bcee30514c0527ad244d7662c1922'
            res = requests.get(url, headers=headers, proxies=random.choice(proxys))
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, 'html.parser')
            for link in soup.select('.articlelist'):
                for a in link.select('a'):
                    c.append(a['href'])
        return c


    def xlsWrite(c, proxys):
        os.chdir(r'G:/code_py/xls')
        for k in range(len(c)):
            url = 'http://www.sxmwr.gov.cn/' + str(c[k])
            try:
                res = requests.get(url, headers=headers, proxies=random.choice(proxys))
                while res.status_code != 200:  # retry through another proxy until the page loads
                    res = requests.get(url, headers=headers, proxies=random.choice(proxys))
                dic = xlsGet(url, proxys)
            except requests.RequestException:
                continue
            with open(str(k) + '.xls', 'w', encoding='utf-8') as f:
                f.write(str(dic['artical']))
            print(k)


    proxyPool = proxypool(50)
    c = linkGet(0, 62, proxyPool)
    xlsWrite(c, proxyPool)
    time_end = time.time()
    print('Done, total time ' + str(time_end - time_start) + ' s')
    
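    proxypool assumes that host.txt holds one proxy per line as a tab-separated IP and port. Free proxies die quickly, so it can help to filter the pool before crawling. A minimal sketch under the same assumptions, reusing proxypool() from the script above:

    # filter_pool.py: keep only proxies that can actually reach the target site
    import requests


    def alive(proxys, timeout=5):
        """Return the subset of proxy dicts that successfully fetch the front page."""
        good = []
        for p in proxys:
            try:
                r = requests.get('http://www.sxmwr.gov.cn/', proxies=p, timeout=timeout)
                if r.status_code == 200:
                    good.append(p)
            except requests.RequestException:
                pass
        return good


    proxyPool = alive(proxypool(50))  # drop dead entries before linkGet/xlsWrite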
