Continued from:
http://www.jianshu.com/p/49ca8ab54075
Scraping web tables through proxy IPs and saving them as CSV files.
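A quick recap of the mechanism before the script: requests routes traffic through a proxy via a proxies dict keyed by URL scheme. A minimal sketch (the proxy address below is made up):

# Minimal sketch: one request through an HTTP proxy (the address is a placeholder).
import requests

proxies = {'http': 'http://123.45.67.89:8080'}   # scheme -> proxy URL
res = requests.get('http://www.sxmwr.gov.cn/', proxies=proxies, timeout=10)
print(res.status_code)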
# csvGet.py
import requests
import os
import time
import random
import csv
from bs4 import BeautifulSoup

i = 0
time_start = time.time()
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/'
                  '537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'}
'''
# Scrape the proxy data from the first num pages of the xicidaili high-anonymity list
def fetch_proxy(num):
    # change the working directory
    os.chdir(r'G:/code_py/proxyIP')
    api = 'http://www.xicidaili.com/nn/{}'
    fp = open('host.txt', 'a+', encoding='utf-8')
    for i in range(num + 1):
        respones = requests.get(url=api.format(i + 1), headers=headers)
        soup = BeautifulSoup(respones.text, 'lxml')
        container = soup.find_all(name='tr', attrs={'class': 'odd'})
        for tag in container:
            try:
                con_soup = BeautifulSoup(str(tag), 'lxml')
                td_list = con_soup.find_all('td')
                ip = str(td_list[1])[4:-5]      # strip the surrounding <td></td>
                port = str(td_list[2])[4:-5]
                IPport = ip + '\t' + port + '\n'
                fp.write(IPport)
            except Exception as e:
                print('No IP!')
        time.sleep(1)
    fp.close()
fetch_proxy(10)
'''
# Proxy pool: read host.txt and build a list of num proxy dicts for requests
def proxypool(num):
    n = 1
    os.chdir(r'G:/code_py/proxyIP')
    fp = open('host.txt', 'r')
    proxys = list()
    ips = fp.readlines()
    while n < num:
        for p in ips:
            ip = p.strip('\n').split('\t')
            proxy = 'http://' + ip[0] + ':' + ip[1]
            proxies = {'http': proxy}   # requests expects the dict to be keyed by scheme
            proxys.append(proxies)
            n += 1
    return proxys
# Collect the links to the pages that hold the data tables
def linkGet(startpage, endpage, proxys):
    global i
    c = []
    for j in range(startpage, endpage):
        url = 'http://www.sxmwr.gov.cn/List.action?classfyid=61&currentPage=' + str(j) + '&turnHref=a0909d3e2591f4bad185dcc317ed1305ecd1ce6e561ebba08b16ce55831d5ab8a287dfa4a19beda0c6bc579659449f995714cf0f1d6601099d111aa8b2a942c122565fccc10321a12fa3875b48a46949d5c36fb26f106d16e54a688e17199bd5c4e6b68a622d3b2792ba2c781a2d4e17fffe1f9e8c4d6cdf6348d9a80dbcf0bdaea67d6bcc745b348c230d59c63a6576131bcee30514c0527ad244d7662c1922'
        res = requests.get(url, headers=headers, proxies=random.choice(proxys))
        res.encoding = 'utf-8'
        html_sample = res.text
        soup = BeautifulSoup(html_sample, 'html.parser')
        for link in soup.select('.articlelist'):
            soup2 = BeautifulSoup(str(link), 'html.parser')
            alinks = soup2.select('a')
            for links in alinks:
                b = [links['href']]   # keep each href wrapped in a list; csvWrite reads c[k][0]
                i = i + 1             # global article counter
                c.append(b)
    return c
# Fetch a single article page and return its data table
def tbGet(url, proxys):
    res = requests.get(url, headers=headers, proxies=random.choice(proxys))
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    table = soup.select('#mainNewsContent')[0]
    return table
# Write each page's table into a local csv file
def csvWrite(c, proxys):
    for k in range(0, len(c)):
        url = 'http://www.sxmwr.gov.cn/' + str(c[k][0])
        try:
            res = requests.get(url, headers=headers, proxies=random.choice(proxys))
            tb = tbGet(url, proxys)
            while res.status_code != 200:   # retry until the page is served
                res = requests.get(url, headers=headers, proxies=random.choice(proxys))
                tb = tbGet(url, proxys)
        except:
            continue
        rows = tb.findAll("tr")
        os.chdir(r'G:/code_py/csv')   # output folder
        csvFile = open(str(k) + '.csv', 'wt', encoding='utf-8')
        writer = csv.writer(csvFile)
        try:
            for row in rows:
                csvRow = []
                for cell in row.findAll('td'):
                    csvRow.append(cell.get_text(strip=True))
                writer.writerow(csvRow)
        finally:
            csvFile.close()
        print(k)   # progress counter
proxyPool = proxypool(50)
c = linkGet(0, 62, proxyPool)
csvWrite(c, proxyPool)
time_end = time.time()
print('Done, total time: ' + str(time_end - time_start) + ' seconds')
Run results:
PS: I only later noticed that Excel shows garbled characters when opening a UTF-8 CSV.
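One workaround (not applied in the script above) is to write the files with the utf-8-sig codec; the BOM it adds lets Excel detect UTF-8. A minimal sketch:

# Sketch: write the CSV with a UTF-8 BOM so Excel opens it without mojibake.
import csv

with open('0.csv', 'w', newline='', encoding='utf-8-sig') as f:
    csv.writer(f).writerow(['时间', '流量'])   # example header row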
Classifying the downloaded CSV files by size
The data on the site is a bit messy; from inspection, the files fall into roughly 4 categories.
(figure: the four categories)
The second category contains the most files.
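The split into folders was done by eyeballing file sizes; roughly, it could also be scripted like this (the size thresholds here are made up for illustration):

# Sketch: bucket the downloaded CSV files by file size (thresholds are illustrative only).
import glob
import os

os.chdir(r'G:/code_py/csv')
groups = {1: [], 2: [], 3: [], 4: []}
for path in glob.glob('*.csv'):
    size = os.path.getsize(path)   # size in bytes
    if size < 1000:
        groups[1].append(path)
    elif size < 3000:
        groups[2].append(path)
    elif size < 6000:
        groups[3].append(path)
    else:
        groups[4].append(path)
for g, files in groups.items():
    print(g, len(files))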
#p2.py
#-*- coding=utf-8 -*-
import pandas as pd
import numpy as np
import glob
import csv
import os

# keep 3 decimal places for the flow values
def num2String(stdf, name):
    myformat = lambda x: '%.3f' % x
    Tpart1 = stdf.iloc[:, 0]
    Tpart2 = stdf.iloc[:, 1]
    str1 = pd.DataFrame(Tpart2, columns=['流量'])
    str2 = str1.to_string(float_format=myformat, index=False, header=False)   # drop column name and index name
    str3 = str2.split('\n')
    Tpart2 = pd.DataFrame(str3, columns=['流量'], index=[name] * len(str3))   # back to a DataFrame
    newstdf = pd.concat([Tpart1, Tpart2], axis=1)   # merge the two columns
    return newstdf
def fomatCsv(csvdata, idname):
    datavalue = csvdata.values
    data = pd.DataFrame(datavalue[1:], columns=datavalue[0])
    data.set_index(idname, inplace=True)
    return data
def csvContent(name, lstin, skiprow, idname):
    list1 = []      # files where the station was not found
    list2 = []      # files that failed to parse
    csvRow = []
    stdf = pd.DataFrame()
    for i in range(len(lstin)):
        try:
            csvdata = pd.read_csv(lstin[i], header=None, sep=',', skiprows=skiprow,
                                  usecols=[0, 1, 2, 3, 4, 5, 6, 7], encoding='utf-8')
            data = fomatCsv(csvdata, idname)
            select_index = data[data.index == name]
            if select_index.empty:
                list1.append(lstin[i])
            else:
                stdf = data.loc[[name], ['时间', '流量']]
                stdf = num2String(stdf, name)
                print(lstin[i])
                csvRow.append(stdf)
        except:
            list2.append(lstin[i])
            continue
    list2 = list1 + list2
    if len(csvRow) == 0:
        return list2
    else:
        newstdf = pd.concat(csvRow, axis=0)
        os.chdir(r'G:/code_py/csv/result')   # output folder
        newstdf.to_csv(stat + '.csv', mode='a', sep='\t', index=False, encoding='utf-8')   # stat comes from the loop below
        return list2
stats = ['枣园', '安塞', '杏河', '延安', '甘谷驿']
for idname in ['测站名称', '站名']:
    for skiprow in range(5):
        for stat in stats:
            os.chdir(r'G:/code_py/csv/2')
            lst = glob.glob("*.csv")
            list2 = csvContent(stat, lst, skiprow, idname)
print('end')
The trick is to vary the skiprows parameter until the row containing idname lines up as the header.
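In isolation, the idea looks like this: probe skiprows values until the row holding idname shows up as the first remaining row (a standalone sketch; the file name in the commented call is hypothetical):

# Sketch: probe skiprows values until the expected header column appears.
import pandas as pd

def find_skiprows(path, idname, max_skip=5):
    for skip in range(max_skip):
        try:
            df = pd.read_csv(path, header=None, sep=',', skiprows=skip, encoding='utf-8')
        except Exception:
            continue
        if idname in df.iloc[0].values:   # first remaining row is the header row
            return skip
    return None

# print(find_skiprows('0.csv', '测站名称'))   # hypothetical file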
The other two categories of files:
#p3.py
#-*- coding=utf-8 -*-
import pandas as pd
import numpy as np
import glob
import csv
import os

# keep 3 decimal places for the flow values
def num2String(stdf, name):
    myformat = lambda x: '%.3f' % x
    Tpart1 = stdf.iloc[:, 0]
    Tpart2 = stdf.iloc[:, 1]
    str1 = pd.DataFrame(Tpart2, columns=['流量'])
    str2 = str1.to_string(float_format=myformat, index=False, header=False)   # drop column name and index name
    str3 = str2.split('\n')
    Tpart2 = pd.DataFrame(str3, columns=['流量'], index=[name] * len(str3))   # back to a DataFrame
    newstdf = pd.concat([Tpart1, Tpart2], axis=1)   # merge the two columns
    return newstdf
def fomatCsv(csvdata):
    str1 = '河流名称 测站名称 时间 水位 流量 水势 警戒流量 保证流量'
    str2 = str1.split(' ')
    colname = list(str2)
    csvdata.columns = colname
    data = csvdata
    data.set_index('测站名称', inplace=True)
    return data
def csvContent(name):
    os.chdir(r'G:/code_py/csv/3')
    lst = glob.glob("*.csv")
    stdf = pd.DataFrame()
    for i in range(len(lst)):
        try:
            csvdata = pd.read_csv(lst[i], header=None, sep=',', skiprows=3,
                                  usecols=[0, 1, 2, 3, 4, 5, 6, 7], encoding='utf-8')
            data = fomatCsv(csvdata)
            select_index = data[data.index == name]
            if select_index.empty:
                print('None')
            else:
                stdf = data.loc[[name], ['时间', '流量']]
                stdf = num2String(stdf, name)
                print(lst[i])
                csvRow.append(stdf)   # append only when the station was found in this file
        except:
            csvdata = pd.read_csv(lst[i], header=None, sep=',', skiprows=2,
                                  usecols=[0, 1, 2, 3, 4, 5, 6, 7], encoding='utf-8')
            data = fomatCsv(csvdata)
            select_index = data[data.index == name]
            if select_index.empty:
                print('None')
            else:
                stdf = data.loc[[name], ['时间', '流量']]
                stdf = num2String(stdf, name)
                print('except', lst[i])
                csvRow.append(stdf)
            continue
    return csvRow
stats = ['枣园', '安塞', '杏河', '延安', '甘谷驿']
for stat in stats:
    csvRow = []   # clear the data from the previous iteration
    csvRow = csvContent(stat)
    newstdf = pd.concat(csvRow, axis=0)
    os.chdir(r'G:/code_py/csv/result')   # output folder
    newstdf.to_csv(stat + '.csv', mode='a', sep='\t', index=False, encoding='utf-8')
The last category follows largely the same approach.
#p4.py
#-*- coding=utf-8 -*-
import pandas as pd
import numpy as np
import glob
import csv
import os

# keep 3 decimal places for the flow values
def num2String(stdf, name):
    myformat = lambda x: '%.3f' % x
    Tpart1 = stdf.iloc[:, 0]
    Tpart2 = stdf.iloc[:, 1]
    str1 = pd.DataFrame(Tpart2, columns=['流量'])
    str2 = str1.to_string(float_format=myformat, index=False, header=False)   # drop column name and index name
    str3 = str2.split('\n')
    Tpart2 = pd.DataFrame(str3, columns=['流量'], index=[name] * len(str3))   # back to a DataFrame
    newstdf = pd.concat([Tpart1, Tpart2], axis=1)   # merge the two columns
    return newstdf
def fomatCsv(csvdata):
    str1 = '流域,河流,站名,时间,水位,流量,水势,警戒水位,警戒流量,站码'
    str2 = str1.split(',')
    colname = list(str2)
    csvdata.columns = colname
    data = csvdata
    data.set_index('站名', inplace=True)
    return data
def csvContent(name):
    os.chdir(r'G:/code_py/csv/4')
    lst = glob.glob("*.csv")
    stdf = pd.DataFrame()
    for i in range(len(lst)):
        try:
            csvdata = pd.read_csv(lst[i], header=None, sep=',', skiprows=3,
                                  usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], encoding='utf-8')
            data = fomatCsv(csvdata)
            select_index = data[data.index == name]
            if select_index.empty:
                print('None')
            else:
                stdf = data.loc[[name], ['时间', '流量']]
                stdf = num2String(stdf, name)
                print(lst[i])
                csvRow.append(stdf)   # append only when the station was found in this file
        except:
            csvdata = pd.read_csv(lst[i], header=None, sep=',', skiprows=6,
                                  usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], encoding='utf-8')
            data = fomatCsv(csvdata)
            select_index = data[data.index == name]
            if select_index.empty:
                print('None')
            else:
                stdf = data.loc[[name], ['时间', '流量']]
                stdf = num2String(stdf, name)
                print('except', lst[i])
                csvRow.append(stdf)
            continue
    return csvRow
stats = ['枣园', '安塞', '杏河', '延安', '甘谷驿']
for stat in stats:
    csvRow = []   # clear the data from the previous iteration
    csvRow = csvContent(stat)
    newstdf = pd.concat(csvRow, axis=0)
    os.chdir(r'G:/code_py/csv/result')   # output folder
    newstdf.to_csv(stat + '.csv', mode='a', sep='\t', index=False, encoding='utf-8')
Cleaning up the generated data
#resultcsv.py
#-*- coding=utf-8 -*-
import pandas as pd
import numpy as np
import glob
import csv
import os
def csvData(id):
    os.chdir(r'G:/code_py/csv/result')
    f = open(lst[id], encoding='utf-8')
    csvdata = pd.read_csv(f, header=None, sep='\t', skiprows=0, encoding='utf-8')
    str1 = '时间 流量'
    str2 = str1.split(' ')
    colname = list(str2)
    csvdata.columns = colname
    csvdata.set_index('时间', inplace=True)
    csvdata.drop('时间', axis=0, inplace=True)   # drop the stray header rows that were appended as data
    for i in range(len(csvdata.index)):
        dates = csvdata.index[i]
        try:
            datetime = pd.to_datetime(dates, format='%Y-%m-%d %H:%M')
            csvdata.index.values[i] = datetime
        except ValueError:
            continue
    os.chdir(r'G:/code_py/csv/resultcsv')   # output folder
    csvdata = csvdata.drop_duplicates()     # drop duplicate rows
    csvdata.to_csv(lst[id], mode='a', sep='\t', index=True, encoding='utf-8')
os.chdir(r'G:/code_py/csv/result')
lst = glob.glob("*.csv")
for idname in range(len(lst)):
    csvData(idname)
print('end')
PS:
1. In the end it looks like some data was lost at one of the steps.
2. From a simple crawler, through xlrd, xlwt and csv, to proxy-IP access and pandas: it feels like this one site could keep me busy for a year.