PythonLog171125

Author: 迟客 | Published 2017-11-25 12:32

Continued from:
http://www.jianshu.com/p/49ca8ab54075

Fetch web tables through proxy IPs and save them as CSV files.

# csvGet.py
import requests
import os
import time
import random
import csv
from bs4 import BeautifulSoup


i = 0
time_start = time.time()
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/'
                  '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'}

'''
#fetch_proxy(num): grab proxy data from the first num pages of the
#domestic high-anonymity IP listing
def fetch_proxy(num):
    #change the current working directory
    os.chdir(r'G:/code_py/proxyIP')
    api = 'http://www.xicidaili.com/nn/{}'
    fp = open('host.txt', 'a+', encoding='utf-8')
    for i in range(num+1):
        url = api.format(i+1)  # build a fresh URL each pass; reassigning api broke later iterations
        respones = requests.get(url=url, headers=headers)  # original passed undefined 'header'
        soup = BeautifulSoup(respones.text, 'lxml')
        container = soup.find_all(name='tr',attrs={'class':'odd'})
        for tag in container:
            try:
                con_soup = BeautifulSoup(str(tag), 'lxml')
                td_list = con_soup.find_all('td')
                ip = td_list[1].get_text()    # cleaner than slicing '<td>...</td>' by character count
                port = td_list[2].get_text()
                IPport = ip + '\t' + port + '\n'
                fp.write(IPport)
            except Exception as e:
                print('No IP!')
        time.sleep(1)
    fp.close()
fetch_proxy(10)
'''
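If the commented-out fetch_proxy above is run first, host.txt ends up holding one tab-separated ip/port pair per line, something like this (addresses invented for illustration):

#host.txt (example contents; made-up addresses)
61.135.217.7	80
122.72.32.134	808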


#proxy IP pool
def proxypool(num):
    n = 1
    os.chdir(r'G:/code_py/proxyIP')
    fp = open('host.txt', 'r')
    ips = fp.readlines()
    fp.close()
    proxys = list()
    while n < num:
        for p in ips:
            ip = p.strip('\n').split('\t')
            proxy = 'http://' + ip[0] + ':' + ip[1]  # original 'http:\\' is not a valid scheme
            proxies = {'http': proxy}  # requests keys proxies by scheme; {'proxy': ...} is silently ignored
            proxys.append(proxies)
            n += 1
    return proxys
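Free proxies from such lists die quickly, so it can pay to filter the pool before using it. A minimal liveness check might look like this (a sketch, not part of the original script; httpbin.org is just one convenient echo service, and the 5-second timeout is a guess):

#proxycheck.py -- optional sketch
import requests

def alive(proxies, timeout=5):
    # a proxy counts as alive if it answers a trivial request in time
    try:
        r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

#usage: pool = [p for p in proxypool(50) if alive(p)]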


#collect links to the pages that hold the table data
def linkGet(startpage, endpage, proxys):
    global i
    c = []
    for j in range(startpage, endpage):
        url = 'http://www.sxmwr.gov.cn/List.action?classfyid=61&currentPage=' + str(j) + '&turnHref=a0909d3e2591f4bad185dcc317ed1305ecd1ce6e561ebba08b16ce55831d5ab8a287dfa4a19beda0c6bc579659449f995714cf0f1d6601099d111aa8b2a942c122565fccc10321a12fa3875b48a46949d5c36fb26f106d16e54a688e17199bd5c4e6b68a622d3b2792ba2c781a2d4e17fffe1f9e8c4d6cdf6348d9a80dbcf0bdaea67d6bcc745b348c230d59c63a6576131bcee30514c0527ad244d7662c1922'

        res = requests.get(url, headers=headers, proxies = random.choice(proxys))
        res.encoding = 'utf-8'
        html_sample = res.text
        soup = BeautifulSoup(html_sample, 'html.parser')
        for link in soup.select('.articlelist'):
            alinks = link.select('a')  # Tag objects support select() directly; no need to re-parse
            for a in alinks:
                c.append([a['href']])  # keep the one-element-list shape that csvWrite expects
                i = i + 1              # global link counter
    return c


#fetch the table element from one page
def tbGet(url, proxys):
    res = requests.get(url, headers=headers, proxies = random.choice(proxys))
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    table = soup.select('#mainNewsContent')[0]
    return table


#write each page's table to a local csv file
def csvWrite(c, proxys):
    for k in range(0, len(c)):
        url = 'http://www.sxmwr.gov.cn/' + str(c[k][0])
        try:
            res = requests.get(url, headers=headers, proxies=random.choice(proxys))
            while res.status_code != 200:
                # retry with a different proxy; the original loop never refreshed
                # res, so a non-200 status would spin forever
                res = requests.get(url, headers=headers, proxies=random.choice(proxys))
            tb = tbGet(url, proxys)
        except:
            continue
        rows = tb.findAll("tr")
        os.chdir(r'G:/code_py/csv')  #output folder
        csvFile = open(str(k) + '.csv', 'wt', encoding='utf-8', newline='')  # newline='' avoids blank rows on Windows
        writer = csv.writer(csvFile)
        try:  
            for row in rows:  
                csvRow = []  
                for cell in row.findAll('td'):  
                    csvRow.append(cell.get_text(strip=True))
                writer.writerow(csvRow)  
        finally:  
            csvFile.close()
        print(k)  #progress counter


proxyPool = proxypool(50)
c = linkGet(0, 62, proxyPool)
csvWrite(c, proxyPool)
time_end = time.time()
print('Done, total time: ' + str(time_end - time_start) + ' s')

Results

[screenshot: run output]

PS: I just discovered that Excel displays UTF-8 CSVs as mojibake.
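Excel needs a BOM to recognize UTF-8, so if Excel-friendly output is the goal, writing with encoding='utf-8-sig' in csvWrite's open() call usually fixes it:

# Excel-friendly variant of the open() call in csvWrite
csvFile = open(str(k) + '.csv', 'wt', encoding='utf-8-sig', newline='')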

Sorting the downloaded CSV data by file size

The site's data is somewhat messy; by inspection it falls into roughly four categories.


[figure: the four categories]
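The sorting step itself isn't shown in this log; a rough sketch of moving files into folders 1-4 by file size might look like this (the byte thresholds below are invented and would need tuning against the real files):

#sortbysize.py -- hypothetical sketch of the sort-by-size step
import glob
import os
import shutil

os.chdir(r'G:/code_py/csv')
for f in glob.glob('*.csv'):
    size = os.path.getsize(f)
    # bucket boundaries are guesses; adjust after inspecting the files
    if size < 1000:
        folder = '1'
    elif size < 5000:
        folder = '2'
    elif size < 20000:
        folder = '3'
    else:
        folder = '4'
    os.makedirs(folder, exist_ok=True)
    shutil.move(f, os.path.join(folder, f))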
The second category has the most files:
#p2.py
#-*- coding=utf-8 -*-

import pandas as pd
import numpy as np
import glob
import csv
import os


#keep three decimal places and render the 流量 column as strings
def num2String(stdf, name):
    myformat = lambda x: '%.3f' % x
    Tpart1 = stdf.iloc[:, 0]
    Tpart2 = stdf.iloc[:, 1]
    str1 = pd.DataFrame(Tpart2, columns=['流量'])
    str2 = str1.to_string(float_format=myformat, index=False, header=False)  #strip column name and index name
    str3 = str2.split('\n')
    Tpart2 = pd.DataFrame(str3, columns=['流量'], index=[name]*len(str3))  #back to a DataFrame; one index entry per row (a bare [name] fails on multi-row slices)
    newstdf = pd.concat([Tpart1, Tpart2], axis=1)  #merge
    return newstdf


def fomatCsv(csvdata,idname):
    datavalue = csvdata.values
    data = pd.DataFrame(datavalue[1:],columns=datavalue[0])
    data.set_index(idname,inplace=True)
    return data


def csvContent(name,lstin,skiprow,idname):
    list1 = []
    list2 = []
    csvRow = []
    stdf = pd.DataFrame()
    for i in range(len(lstin)):
        try:
            csvdata = pd.read_csv(lstin[i],header = None, sep=',',skiprows = skiprow,
                usecols=[0, 1, 2, 3, 4, 5, 6, 7],encoding='utf-8' )
            data = fomatCsv(csvdata,idname)
            select_index = data[data.index == name]
            if select_index.empty:
                lstin1 = lstin[i]
                list1.append(lstin1)
            else:
                stdf = data.loc[[name],['时间', '流量']]
                stdf = num2String(stdf,name)
                print(lstin[i])  # was print(lst[i]), which referenced the global list instead of the parameter
            csvRow.append(stdf)
        except:
            lstin1 = lstin[i]
            list2.append(lstin1)
            continue
    list2 = list1 + list2
    if len(csvRow) == 0:
        return list2
    else:
        newstdf = pd.concat(csvRow,axis = 0)
        os.chdir(r'G:/code_py/csv/result')  #output folder
        newstdf.to_csv(name + '.csv', mode='a', sep='\t', index=False, encoding='utf-8')  # use the name parameter rather than the global stat
        return list2


stats = ['枣园', '安塞', '杏河', '延安', '甘谷驿']
for idname in ['测站名称','站名']:
    for skiprow in range(5):
        for stat in stats:
            os.chdir(r'G:/code_py/csv/2')
            lst = glob.glob("*.csv")
            list2 = csvContent(stat,lst,skiprow,idname)
print('end')

The skiprows parameter is brute-forced (0 through 4) until a header row matching idname is found.
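An alternative to brute-forcing skiprows would be to sniff which row holds the station-name header. A sketch (find_header_row is a hypothetical helper, and it assumes the first few rows of the file parse cleanly):

#sniffheader.py -- hypothetical alternative to looping over skiprow
import pandas as pd

def find_header_row(path, probes=('测站名称', '站名'), maxrows=6):
    # returns (row number to use as skiprows, matching column name) or (None, None)
    raw = pd.read_csv(path, header=None, nrows=maxrows, encoding='utf-8')
    for r in range(len(raw)):
        row = raw.iloc[r].astype(str)
        for probe in probes:
            if (row == probe).any():
                return r, probe
    return None, None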

The other two categories:

#p3.py
#-*- coding=utf-8 -*-

import pandas as pd
import numpy as np
import glob
import csv
import os


#keep three decimal places and render the 流量 column as strings
def num2String(stdf, name):
    myformat = lambda x: '%.3f' % x
    Tpart1 = stdf.iloc[:, 0]
    Tpart2 = stdf.iloc[:, 1]
    str1 = pd.DataFrame(Tpart2, columns=['流量'])
    str2 = str1.to_string(float_format=myformat, index=False, header=False)  #strip column name and index name
    str3 = str2.split('\n')
    Tpart2 = pd.DataFrame(str3, columns=['流量'], index=[name]*len(str3))  #back to a DataFrame
    newstdf = pd.concat([Tpart1, Tpart2], axis=1)  #merge
    return newstdf


def fomatCsv(csvdata):
    str1 = '河流名称 测站名称 时间 水位 流量 水势 警戒流量 保证流量'
    str2 = str1.split(' ')
    colname = list(str2)
    csvdata.columns = colname
    data = csvdata
    data.set_index('测站名称',inplace=True)
    return data


def csvContent(name):
    os.chdir(r'G:/code_py/csv/3')
    lst = glob.glob("*.csv")
    stdf = pd.DataFrame()
    for i in range(len(lst)):
        try:
            csvdata = pd.read_csv(lst[i],header = None, sep=',',skiprows = 3,
                usecols=[0, 1, 2, 3, 4, 5, 6, 7],encoding='utf-8' )
            data = fomatCsv(csvdata)
            select_index = data[data.index == name]
            if select_index.empty:
                print('None')
            else:
                stdf = data.loc[[name], ['时间', '流量']]
                stdf = num2String(stdf, name)
                print(lst[i])
                csvRow.append(stdf)  # append only when data was actually found
        except:
            csvdata = pd.read_csv(lst[i], header=None, sep=',', skiprows=2,
                usecols=[0, 1, 2, 3, 4, 5, 6, 7], encoding='utf-8')
            data = fomatCsv(csvdata)
            select_index = data[data.index == name]
            if select_index.empty:
                print('None')
            else:
                stdf = data.loc[[name], ['时间', '流量']]
                stdf = num2String(stdf, name)
                print('except', lst[i])
                csvRow.append(stdf)  # the original 'continue' skipped this append, silently dropping data
    return csvRow


stats = ['枣园', '安塞', '杏河', '延安', '甘谷驿']
for stat in stats:
    csvRow = []  #clear data from the previous iteration
    csvRow = csvContent(stat)
    newstdf = pd.concat(csvRow,axis = 0)
    os.chdir(r'G:/code_py/csv/result')  #output folder
    newstdf.to_csv(stat+'.csv',mode='a',sep='\t',index=False,encoding='utf-8')

p4.py works on much the same principle.

#p4.py
#-*- coding=utf-8 -*-

import pandas as pd
import numpy as np
import glob
import csv
import os


#keep three decimal places and render the 流量 column as strings
def num2String(stdf, name):
    myformat = lambda x: '%.3f' % x
    Tpart1 = stdf.iloc[:, 0]
    Tpart2 = stdf.iloc[:, 1]
    str1 = pd.DataFrame(Tpart2, columns=['流量'])
    str2 = str1.to_string(float_format=myformat, index=False, header=False)  #strip column name and index name
    str3 = str2.split('\n')
    Tpart2 = pd.DataFrame(str3, columns=['流量'], index=[name]*len(str3))  #back to a DataFrame
    newstdf = pd.concat([Tpart1, Tpart2], axis=1)  #merge
    return newstdf


def fomatCsv(csvdata):
    str1 = '流域,河流,站名,时间,水位,流量,水势,警戒水位,警戒流量,站码'
    str2 = str1.split(',')
    colname = list(str2)
    csvdata.columns = colname
    data = csvdata
    data.set_index('站名',inplace=True)
    return data


def csvContent(name):
    os.chdir(r'G:/code_py/csv/4')
    lst = glob.glob("*.csv")
    stdf = pd.DataFrame()
    for i in range(len(lst)):
        try:
            csvdata = pd.read_csv(lst[i],header = None, sep=',',skiprows = 3,
                usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],encoding='utf-8' )
            data = fomatCsv(csvdata)
            select_index = data[data.index == name]
            if select_index.empty:
                print('None')
            else:
                stdf = data.loc[[name], ['时间', '流量']]
                stdf = num2String(stdf, name)
                print(lst[i])
                csvRow.append(stdf)  # append only when data was actually found
        except:
            csvdata = pd.read_csv(lst[i], header=None, sep=',', skiprows=6,
                usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], encoding='utf-8')
            data = fomatCsv(csvdata)
            select_index = data[data.index == name]
            if select_index.empty:
                print('None')
            else:
                stdf = data.loc[[name], ['时间', '流量']]
                stdf = num2String(stdf, name)
                print('except', lst[i])
                csvRow.append(stdf)  # the original 'continue' skipped this append, silently dropping data
    return csvRow


stats = ['枣园', '安塞', '杏河', '延安', '甘谷驿']
for stat in stats:
    csvRow = []  #clear data from the previous iteration
    csvRow = csvContent(stat)
    newstdf = pd.concat(csvRow,axis = 0)
    os.chdir(r'G:/code_py/csv/result')  #output folder
    newstdf.to_csv(stat+'.csv',mode='a',sep='\t',index=False,encoding='utf-8')

Tidying up the generated data

#resultcsv.py
#-*- coding=utf-8 -*-


import pandas as pd
import numpy as np
import glob
import csv
import os


def csvData(idx):  # renamed from id, which shadows the builtin
    os.chdir(r'G:/code_py/csv/result')
    csvdata = pd.read_csv(lst[idx], header=None, sep='\t', skiprows=0, encoding='utf-8')  # read_csv takes a path directly; the old open() handle was never closed
    str1 = '时间 流量'
    str2 = str1.split(' ')
    colname = list(str2)
    csvdata.columns = colname
    csvdata.set_index('时间', inplace = True)
    csvdata.drop('时间',axis=0, inplace=True)
    for i in range(len(csvdata.index)):
        dates = csvdata.index[i]
        try:
            datetime = pd.to_datetime(dates, format = '%Y-%m-%d %H:%M')
            csvdata.index.values[i] = datetime
        except ValueError:
            continue
    os.chdir(r'G:/code_py/csv/resultcsv')  #output folder
    csvdata = csvdata.drop_duplicates()  #de-duplicate
    csvdata.to_csv(lst[idx], mode='a', sep='\t', index=True, encoding='utf-8')

os.chdir(r'G:/code_py/csv/result')
lst = glob.glob("*.csv")


for i in range(len(lst)):
    csvData(i)
print('end')
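As an aside, the per-row date loop in csvData can be collapsed into one vectorized call; a sketch (errors='coerce' yields NaT for unparseable entries, which are then swapped back for the original strings, mirroring what the loop does):

# sketch: vectorized replacement for the per-row loop in csvData
parsed = pd.to_datetime(csvdata.index, format='%Y-%m-%d %H:%M', errors='coerce')
# keep the original string wherever parsing failed
csvdata.index = [p if p is not pd.NaT else o for p, o in zip(parsed, csvdata.index)]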

PS:

1. In the end I noticed that some data seems to have been lost somewhere along the way.
2. From a simple crawler through xlrd, xlwt, and csv to proxy IPs and pandas: it feels like this site could keep me busy for a year.