使用urllib3做网络部分
beautifulsoup4来解析网页内容
安装几个python包:
# bs4的包,用来解析网页内容
pip3 install beautifulsoup4
# 支持https的包,不安装会报warning
pip3 install certifi
# 安装urllib3
pip3 install urllib3
urllib3的使用
>>> import urllib3
>>> http = urllib3.PoolManager()
>>> r = http.request('GET', 'http://httpbin.org/robots.txt')
>>> r.status
200
>>> r.data
b'User-agent: *\nDisallow: /deny\n'
beautifulSoup4的使用
我认为几个比较有用的方法:
# 创建
from bs4 import BeautifulSoup
soup = BeautifulSoup(r.data, 'html.parser')
# 将节点内容格式化输出
soup.prettify()
# 通过点或者中括号的方式只能访问到一个节点,如果想获取table里所有的<tr>可以使用find_all()
tables = soup.find_all('table')  # 获取到所有的table标签
table.find_all('tr')  # 获取到table底下所有的tr标签
# 获取节点的string
title_tag.string
# u'The Dormouse's story'
# 获取一个节点底下所有的text
th_all_str = table.tr.get_text()
抓取一个网页的完整例子
import urllib3
import certifi
from bs4 import BeautifulSoup
def parse_table(table):
    """Convert an HTML table into a list of dicts keyed by header text.

    The first ``<tr>`` of *table* is treated as the header row. Only ``<th>``
    cells whose ``.string`` is non-blank become columns; their position in
    the header row is used to pick the matching ``<td>`` in each data row.

    Args:
        table: A BeautifulSoup ``<table>`` tag (anything exposing ``.tr`` and
            ``.find_all(name)`` whose cells expose ``.string``).

    Returns:
        One dict per row after the header, mapping the stripped header text
        to the stripped cell ``.string``. A value is ``None`` when the cell's
        ``.string`` is ``None`` or when the row has no cell at that column
        position. An empty list is returned for a table with no rows.
    """
    header_row = table.tr
    if header_row is None:
        # A table without any <tr> has nothing to parse.
        return []

    # Keep the original column position so blank/decorative header cells
    # do not shift the lookup into the data rows.
    headers = []
    for index, th in enumerate(header_row.find_all('th')):
        th_str = th.string
        if th_str and th_str.strip():
            headers.append((index, th_str.strip()))

    result = []
    # Skip the first row: it is the header row handled above.
    for tr in table.find_all('tr')[1:]:
        # Look the cells up once per row rather than once per column.
        cells = tr.find_all('td')
        ele = {}
        for idx, name in headers:
            # Rows may be shorter than the header row (e.g. a single
            # colspan cell); treat the missing cells as None.
            text = cells[idx].string if idx < len(cells) else None
            ele[name] = text.strip() if text else text
        result.append(ele)
    return result
# Connection pool that requires certificate verification, using the CA
# bundle path reported by certifi.
http = urllib3.PoolManager(
    cert_reqs='CERT_REQUIRED',
    ca_certs=certifi.where()
)

# Alternative page that can be fetched instead:
# url = 'https://etherscan.io/address/0x379516f90c4ff1cb2bcffa1f24d366855e67f40c'
url = 'https://etherscan.io/token/generic-tokentxns2?contractAddress=0xb5a5f22694352c15b00323844ad545abb2b11028&a=0xd551234ae421e3bcba99a0da6d736074f22192ff'
r = http.request('GET', url)
soup = BeautifulSoup(r.data, 'html.parser')
tables = soup.find_all('table')
for table in tables:
    if table.tr is None:
        # A table without rows has no header text to inspect.
        continue
    th_all_str = table.tr.get_text()
    print(th_all_str)
    print('----------')
    # 'TxHash' is also a substring of 'ParentTxHash', so the second check is
    # needed to exclude tables whose first row mentions ParentTxHash.
    if 'TxHash' in th_all_str and 'ParentTxHash' not in th_all_str:
        result = parse_table(table)
        print(len(result))
        print(result)
网友评论