1. Parsing XML
import xml.etree.ElementTree as ET

tree = ET.parse('exampleResearchArticle.xml')
root = tree.getroot()

print "Children of root:"
for child in root:
    print child.tag  # the tag attribute holds each child element's tag name

title = root.find('./fm/bibl/title')  # XPath expression
title_text = ''
for p in title:
    title_text += p.text
print "\nTitle:\n", title_text

print "\nAuthor email addresses:"
for a in root.findall('./fm/bibl/aug/au'):
    email = a.find('email')
    if email is not None:
        print email.text
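As a quick sanity check, ElementTree can also walk the entire tree rather than only the direct children of the root; a minimal sketch against the same file:

# Element.iter() yields every element in document order,
# including nested descendants
for elem in root.iter():
    print elem.tag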
2. Extracting Data (XML)
Extract data about the article's authors from the XML and add each author as one entry in a list.
The expected format is shown below; the first-name, last-name, and email tags should map directly to the dictionary keys.

solution = [{'fnm': 'Omer',    'snm': 'Mei-Dan',  'email': 'omer@extremegate.com'},
            {'fnm': 'Mike',    'snm': 'Carmont',  'email': 'mcarmont@hotmail.com'},
            {'fnm': 'Lior',    'snm': 'Laver',    'email': 'laver17@gmail.com'},
            {'fnm': 'Meir',    'snm': 'Nyska',    'email': 'nyska@internet-zahav.net'},
            {'fnm': 'Hagay',   'snm': 'Kammar',   'email': 'kammarh@gmail.com'},
            {'fnm': 'Gideon',  'snm': 'Mann',     'email': 'gideon.mann.md@gmail.com'},
            {'fnm': 'Barnaby', 'snm': 'Clarck',   'email': 'barns.nz@gmail.com'},
            {'fnm': 'Eugene',  'snm': 'Kots',     'email': 'eukots@gmail.com'}]
import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()

root = get_root(article_file)

def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
            "fnm": author.find('./fnm').text,
            "snm": author.find('./snm').text,
            "email": author.find('./email').text
        }
        authors.append(data)
    return authors

get_authors(root)
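To eyeball the result against the expected output, pprint gives a readable dump; a small usage sketch, assuming the solution list above is defined in scope:

import pprint

authors = get_authors(root)
pprint.pprint(authors)       # readable dump of the extracted entries
print authors == solution    # True if the extraction matches exactly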
3. Handling Attributes (XML)
Extract the value of the iid attribute from each "insr" tag and append it to a list stored under the "insr" dictionary key:
<insr iid="I2"/>
import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()

root = get_root(article_file)

def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
            "fnm": author.find('./fnm').text,
            "snm": author.find('./snm').text,
            "email": author.find('./email').text,
            "insr": []
        }
        insr = author.findall('./insr')
        for i in insr:
            data['insr'].append(i.attrib['iid'])
        authors.append(data)
    return authors
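Attributes can also be read with Element.get(), which returns None when the attribute is missing, where attrib['iid'] would raise a KeyError; a minimal standalone demo:

import xml.etree.ElementTree as ET

elem = ET.fromstring('<insr iid="I2"/>')
print elem.attrib['iid']    # 'I2'
print elem.get('iid')       # 'I2'
print elem.get('missing')   # None, instead of a KeyError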
4. Extracting Entities (HTML)
from bs4 import BeautifulSoup

def options(soup, id):
    option_values = []
    carrier_list = soup.find(id=id)
    for option in carrier_list.find_all('option'):
        option_values.append(option['value'])
    return option_values

def print_list(label, codes):
    print "\n%s:" % label
    for c in codes:
        print c

def main():
    soup = BeautifulSoup(open('virgin_and_logan_airport.html'), 'lxml')
    codes = options(soup, 'CarrierList')
    print_list('Carriers', codes)
    codes = options(soup, 'AirportList')
    print_list('Airports', codes)
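The listing defines main() but never calls it; the standard entry-point guard (an addition, not in the original) makes the script runnable directly:

if __name__ == "__main__":
    main()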
5. Using BeautifulSoup (HTML)
Use BeautifulSoup to process the HTML: extract the values of the hidden form fields "__EVENTVALIDATION" and "__VIEWSTATE", and set the corresponding values in the data dictionary.
from bs4 import BeautifulSoup
import requests

html_page = "page_source.html"

def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    with open(page, "r") as html:
        # find the hidden form fields and copy their values into data
        soup = BeautifulSoup(html, 'lxml')
        ev = soup.find(id='__EVENTVALIDATION')
        data['eventvalidation'] = ev['value']
        vs = soup.find(id='__VIEWSTATE')
        data['viewstate'] = vs['value']
    return data

def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    r = requests.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                      data={'AirportList': "BOS",
                            'CarrierList': "VX",
                            'Submit': 'Submit',
                            "__EVENTTARGET": "",
                            "__EVENTARGUMENT": "",
                            "__EVENTVALIDATION": eventvalidation,
                            "__VIEWSTATE": viewstate
                            })
    return r.text
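Chaining the two functions reproduces the form submission; a minimal usage sketch, assuming page_source.html was saved from the target form page:

data = extract_data(html_page)
response = make_request(data)
print response[:200]   # peek at the start of the returned HTML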
6. The Scraping Solution
from bs4 import BeautifulSoup
import requests

# a Session keeps cookies between the GET and the POST,
# which this ASP.NET form requires
s = requests.Session()

r = s.get('https://www.transtats.bts.gov/Data_Elements.aspx?Data=2')
soup = BeautifulSoup(r.text, 'lxml')
viewstate_element = soup.find(id='__VIEWSTATE')
viewstate = viewstate_element['value']
eventvalidation_element = soup.find(id='__EVENTVALIDATION')
eventvalidation = eventvalidation_element['value']

r = s.post('https://www.transtats.bts.gov/Data_Elements.aspx?Data=2',
           data={'AirportList': "BOS",
                 'CarrierList': "VX",
                 'Submit': 'Submit',
                 "__EVENTTARGET": "",
                 "__EVENTARGUMENT": "",
                 "__EVENTVALIDATION": eventvalidation,
                 "__VIEWSTATE": viewstate
                 })

with open('virgin_and_logan_airport.html', 'w') as f:
    f.write(r.text)
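A quick way to confirm the download worked is to re-open the saved file and look for the flights table (the class name is the one used in exercise 7.3 below); a small hedged check:

# sanity check: the flights data table should be present in the saved page
soup = BeautifulSoup(open('virgin_and_logan_airport.html'), 'lxml')
print soup.find('table', {'class': 'dataTDRight'}) is not None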
7. Problem Set
7.1 Carrier List (HTML)
Build a list of all the carriers. Exclude combination entries such as "All U.S. Carriers" from the data you return; the final result should be a list of carrier codes.
from bs4 import BeautifulSoup
import requests

html_page = "options.html"
s = requests.Session()

def extract_carriers(page):
    data = []
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        carrier_list = soup.find(id="CarrierList")
        carriers = carrier_list.find_all('option')
        for carrier in carriers:
            # real carrier codes are two characters; combination
            # entries such as "All U.S. Carriers" are longer
            if len(carrier['value']) == 2:
                data.append(carrier['value'])
    return data

def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    viewstategenerator = data["viewstategenerator"]
    airport = data["airport"]
    carrier = data["carrier"]
    r = s.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
               data=(("__EVENTTARGET", ""),
                     ("__EVENTARGUMENT", ""),
                     ("__VIEWSTATE", viewstate),
                     ("__VIEWSTATEGENERATOR", viewstategenerator),
                     ("__EVENTVALIDATION", eventvalidation),
                     ("CarrierList", carrier),
                     ("AirportList", airport),
                     ("Submit", "Submit")))
    return r.text
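A quick call shows the shape of the result; the exact codes depend on the saved options.html:

carriers = extract_carriers(html_page)
print carriers   # a list of two-letter carrier codes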
7.2 Airport List (HTML)
Return a list of airport codes, excluding any combination entries such as "All".
from bs4 import BeautifulSoup

html_page = "options.html"

def extract_airports(page):
    data = []
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        airport_list = soup.find(id='AirportList')
        airports = airport_list.find_all('option')
        for airport in airports:
            # skip combination entries such as "All" and "AllMajors"
            if 'All' not in airport['value']:
                data.append(airport['value'])
    return data
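As with the carriers, a one-line call verifies the extraction against the saved page:

airports = extract_airports(html_page)
print "%d airport codes, e.g. %s" % (len(airports), airports[:5])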
7.3 Processing All the Data (HTML)
Let's assume that you combined the code from the previous two exercises with code from the lesson on how to build requests, and downloaded all the data locally. The files are in a directory "data", named after the carrier and airport: "{}-{}.html".format(carrier, airport), for example "FL-ATL.html".
The table with the flight info has class="dataTDRight". Your task is to use process_file() to extract the flight data from that table as a list of dictionaries, each dictionary containing the relevant data from the file and table row. This is an example of the data structure you should return:
data = [{"courier": "FL",
         "airport": "ATL",
         "year": 2012,
         "month": 12,
         "flights": {"domestic": 100,
                     "international": 100}
        },
        {"courier": "..."}
       ]
Note: year, month, and the flight data should be integers, and you should skip the rows that contain the TOTAL data for a year.
from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

datadir = "data"

def open_zip(datadir):
    with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
        myzip.extractall()

def process_all(datadir):
    files = os.listdir(datadir)
    return files

def process_file(f):
    """
    Extract data from the given file into a list of dictionaries, one per
    table row, in the format shown above.
    Note: create a new dictionary for each entry in the output list. If you
    reuse a single info dictionary, every element in the list ends up being
    a reference to the same object.
    """
    data = []
    # filename format is "FL-ATL.html", so the first six characters
    # hold the courier and airport codes
    courier, airport = f[:6].split("-")
    with open("{}/{}".format(datadir, f), "r") as html:
        soup = BeautifulSoup(html, 'lxml')
        table_data = soup.find('table', {'class': 'dataTDRight'})
        for tr in table_data.find_all('tr'):
            td = tr.find_all('td')
            # skip the header row and the yearly TOTAL rows
            if td[1].string == 'Month' or td[1].string == 'TOTAL':
                continue
            # build a fresh dictionary for each row (see docstring note)
            info = {"courier": courier,
                    "airport": airport,
                    "year": int(td[0].string),
                    "month": int(td[1].string),
                    "flights": {"domestic": int(td[2].string.replace(',', '')),
                                "international": int(td[3].string.replace(',', ''))}
                    }
            data.append(info)
    return data
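Putting the pieces together, a small driver (hypothetical, not part of the original exercise) processes every downloaded file:

def run_all():
    # assumes data.zip has already been downloaded next to this script
    open_zip(datadir)
    data = []
    for f in process_all(datadir):
        data += process_file(f)
    print "%d rows extracted" % len(data)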
7.4 Splitting Concatenated XML
The patent data file is not valid XML, because it has several root elements and XML declarations; it is in fact a collection of concatenated XML documents. The task is to split the file into separate documents, each of which can then be processed as valid XML.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# So, the problem is that the gigantic file is actually not valid XML,
# because it has several root elements and XML declarations. It is, as a
# matter of fact, a collection of a lot of concatenated XML documents.
# So, one solution would be to split the file into separate documents,
# so that you can process the resulting files as valid XML documents.
import xml.etree.ElementTree as ET

PATENTS = 'patent.data'

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()

def split_file(filename):
    """
    Split the input file into separate files, each containing a single patent.
    As a hint - each patent declaration starts with the same '<?xml' line that
    was causing the error found in the previous exercises.
    The new files are saved with filenames in the format
    "{}-{}".format(filename, n), where n is a counter starting from 0.
    """
    with open(filename) as infile:
        n = -1
        outfile = None
        for line in infile:
            # every '<?xml' declaration marks the start of a new document
            if line.startswith('<?xml'):
                if outfile is not None:
                    outfile.close()
                n += 1
                outfile = open('{}-{}'.format(filename, n), 'w')
            outfile.write(line)
        if outfile is not None:
            outfile.close()

def test():
    split_file(PATENTS)
    for n in range(4):
        try:
            fname = "{}-{}".format(PATENTS, n)
            f = open(fname, "r")
            if not f.readline().startswith("<?xml"):
                print "You have not split the file {} in the correct boundary!".format(fname)
            f.close()
        except IOError:
            print "Could not find file {}. Check if the filename is correct!".format(fname)

test()
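After test() has run, each split file should parse cleanly on its own; a minimal check reusing get_root from above:

root = get_root('{}-0'.format(PATENTS))
print root.tag   # root element of the first extracted document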