More complex data formats (XML & HTML)

Author: esskeetit | Published 2018-01-23 23:00

    1. Parsing XML

    import xml.etree.ElementTree as ET
    import pprint
    
    tree = ET.parse('exampleResearchArticle.xml') 
    root = tree.getroot()
    
    print "Children of root:"
    for child in root:
        print child.tag                #使用tag属性打印出每个子元素的标签名
        
    title = root.find('./fm/bibl/title')                    #xpath表达式
    title_text=''
    for p in title:   
        title_text += p.text
        print "\nTitle:\n",title_text
        
    print "\nAuthor email addresses:"
    for a in root.findall('./fm/bibl/aug/au'):
        email = a.find('email')
        if email is not None:
            print email.text
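
    Element attributes become relevant in section 3 below; a small added sketch (assuming the same exampleResearchArticle.xml) of how they can be inspected via the .attrib dictionary:

    import xml.etree.ElementTree as ET

    tree = ET.parse('exampleResearchArticle.xml')
    root = tree.getroot()

    print root.tag, root.attrib            # .attrib exposes an element's XML attributes as a plain dict
    for child in root:
        print child.tag, child.attrib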
    

    2. Extracting data (XML)

    Extract the data about the article's authors from the XML and add it to a list, one entry per author.
    The expected format is shown below; the first-name, surname and e-mail tags map directly to dictionary keys.

    solution = [{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},
                {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},
                {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},
                {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},
                {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},
                {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},
                {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},
                {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]
    
    import xml.etree.ElementTree as ET

    article_file = "exampleResearchArticle.xml"

    def get_root(fname):
        tree = ET.parse(fname)
        return tree.getroot()

    root = get_root(article_file)

    def get_authors(root):
        authors = []
        for author in root.findall('./fm/bibl/aug/au'):
            data = {
                    "fnm": author.find('./fnm').text,
                    "snm": author.find('./snm').text,
                    "email": author.find('./email').text
            }
            authors.append(data)

        return authors

    get_authors(root)
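
    A quick way to sanity-check the result against the solution list above (a sketch; it assumes both are defined in the same script):

    import pprint

    authors = get_authors(root)
    pprint.pprint(authors)
    assert authors == solution             # should hold if the file contains the expected data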
    

    3. Handling attributes (XML)

    从"insr"标签中提取属性iid的值,并将其添加到字典关键字“insr”列表中
    <insr iid="I2"/>

    import xml.etree.ElementTree as ET
    
    article_file = "exampleResearchArticle.xml"
    
    
    def get_root(fname):
        tree = ET.parse(fname)
        return tree.getroot()
    
    root = get_root(article_file)
    
    def get_authors(root):
        authors = []
        for author in root.findall('./fm/bibl/aug/au'):
            data = {
                    "fnm": author.find('./fnm').text,
                    "snm": author.find('./snm').text,
                    "email": author.find('./email').text,
                    "insr": []
            }
            
            insr = author.findall('./insr')
            for i in insr:
                data['insr'].append(i.attrib['iid'])

            authors.append(data)
    
        return authors
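
    A small usage sketch; it prints each author's surname together with the collected institution ids (the exact iid values depend on the XML file):

    root = get_root(article_file)
    for author in get_authors(root):
        print author['snm'], author['insr']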
    

    4. Extracting entities (HTML)

    from bs4 import BeautifulSoup
    
    def options(soup, id):
        # collect the value attribute of every <option> below the element with the given id
        option_values = []
        carrier_list = soup.find(id=id)
        for option in carrier_list.find_all('option'):
            option_values.append(option['value'])
        return option_values
    
    def print_list(label,codes):
        print "\n%s:" % label
        for c in codes:
            print c
            
    def main():
        soup = BeautifulSoup(open('virgin_and_logan_airport.html'), 'lxml')
        codes = options(soup,'CarrierList')
        print_list('Carriers',codes)
        
        codes = options(soup,'AirportList')
        print_list('Airports',codes)
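
    The script above never calls main(); a minimal entry point (assuming virgin_and_logan_airport.html has been saved locally, e.g. as in section 6) would be:

    if __name__ == '__main__':
        main()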
    

    5. Using BeautifulSoup (HTML)

    Use BeautifulSoup to process the HTML, extract the values of the hidden form fields "__EVENTVALIDATION" and "__VIEWSTATE", and set the corresponding values in the data dictionary.

    from bs4 import BeautifulSoup
    import requests
    import json
    
    html_page = "page_source.html"
    
    def extract_data(page):
        data = {"eventvalidation": "",
                "viewstate": ""}
        with open(page, "r") as html:
            soup = BeautifulSoup(html,'lxml')
            ev = soup.find(id = '__EVENTVALIDATION')
            data['eventvalidation'] = ev['value']
            
            vs = soup.find(id="__VIEWSTATE")
            data['viewstate'] = vs['value']
            # do something here to find the necessary values
    
    
        return data
    
    def make_request(data):
        eventvalidation = data["eventvalidation"]
        viewstate = data["viewstate"]
    
        r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                        data={'AirportList': "BOS",
                              'CarrierList': "VX",
                              'Submit': 'Submit',
                              "__EVENTTARGET": "",
                              "__EVENTARGUMENT": "",
                              "__EVENTVALIDATION": eventvalidation,
                              "__VIEWSTATE": viewstate
                        })
    
        return r.text
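
    A usage sketch tying the two functions together. Note that __EVENTVALIDATION and __VIEWSTATE are session-bound, so posting values taken from an old saved page may be rejected by the server:

    data = extract_data(html_page)
    text = make_request(data)
    print text[:200]                       # beginning of the server's response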
    

    6. Scraping solution

    from bs4 import BeautifulSoup
    import requests
    import json
    
    s = requests.Session()
    
    r=s.get('https://www.transtats.bts.gov/Data_Elements.aspx?Data=2')
    soup = BeautifulSoup(r.text, 'lxml')
    
    viewstate_element = soup.find(id='__VIEWSTATE')
    viewstate = viewstate_element['value']
    
    eventvalidation_element = soup.find(id='__EVENTVALIDATION')
    eventvalidation= eventvalidation_element['value']
    
    r=s.post('https://www.transtats.bts.gov/Data_Elements.aspx?Data=2',
             data={'AirportList': "BOS",
                              'CarrierList': "VX",
                              'Submit': 'Submit',
                              "__EVENTTARGET": "",
                              "__EVENTARGUMENT": "",
                              "__EVENTVALIDATION": eventvalidation,
                              "__VIEWSTATE": viewstate
                        })
    
    with open('virgin_and_logan_airport.html', 'w') as f:
        f.write(r.text)
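
    To verify the download, the saved page can be fed back into the options()/print_list() helpers from section 4 (a sketch, assuming the page still contains the CarrierList element):

    soup = BeautifulSoup(open('virgin_and_logan_airport.html'), 'lxml')
    print len(soup.find(id='CarrierList').find_all('option')), "carrier options found"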
    

    7. Problem set

    7.1 Carrier list (HTML)

    Get a list containing all of the airlines. Exclude combination entries such as "All U.S. Carriers" from the data you return. You should end up with a list of carrier codes.

    from bs4 import BeautifulSoup
    html_page = "options.html"
    
    
    def extract_carriers(page):
        data = []
    
        with open(page, "r") as html:
            # do something here to find the necessary values
            soup = BeautifulSoup(html, "lxml")
            carrier_list = soup.find(id="CarrierList")
            carriers = carrier_list.find_all('option')
            
            for carrier in carriers:
                if len(carrier['value'])==2:
                    data.append(carrier['value'])
            
        return data
    
    
    def make_request(data):
        # note: `s` is a requests.Session and `viewstategenerator` is the value of the
        # __VIEWSTATEGENERATOR hidden field; both are obtained the same way as in section 6
        eventvalidation = data["eventvalidation"]
        viewstate = data["viewstate"]
        airport = data["airport"]
        carrier = data["carrier"]
    
        r = s.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                   data = (("__EVENTTARGET", ""),
                           ("__EVENTARGUMENT", ""),
                           ("__VIEWSTATE", viewstate),
                           ("__VIEWSTATEGENERATOR",viewstategenerator),
                           ("__EVENTVALIDATION", eventvalidation),
                           ("CarrierList", carrier),
                           ("AirportList", airport),
                           ("Submit", "Submit")))
    
        return r.text
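
    A quick check of the filter (the exact codes depend on the contents of options.html):

    carriers = extract_carriers(html_page)
    print len(carriers), "carriers, e.g.", carriers[:5]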
    

    7.2 Airport list (HTML)

    Return a list of airport codes, excluding any combination entries such as "All".

    from bs4 import BeautifulSoup

    html_page = "options.html"

    def extract_airports(page):
        data = []
        with open(page, "r") as html:
            # do something here to find the necessary values
            soup = BeautifulSoup(html, "lxml")
            airport_list = soup.find(id='AirportList')
            airports = airport_list.find_all('option')
            
            for airport in airports:
                if 'All' not in airport['value']:
                    data.append(airport['value'])
    
        return data
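
    And the analogous check for the airports (again, the exact codes depend on options.html):

    airports = extract_airports(html_page)
    print len(airports), "airports, e.g.", airports[:5]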
    

    7.3 Processing all data (HTML)

    Let's assume that you combined the code from the previous 2 exercises with code
    from the lesson on how to build requests, and downloaded all the data locally.
    The files are in a directory "data", named after the carrier and airport:
    "{}-{}.html".format(carrier, airport), for example "FL-ATL.html".

    The table with flight info has a table class="dataTDRight". Your task is to
    use 'process_file()' to extract the flight data from that table as a list of
    dictionaries, each dictionary containing relevant data from the file and table
    row. This is an example of the data structure you should return:

    data = [{"courier": "FL",
    "airport": "ATL",
    "year": 2012,
    "month": 12,
    "flights": {"domestic": 100,
    "international": 100}
    },
    {"courier": "..."}
    ]
    Note - year, month, and the flight data should be integers.
    You should skip the rows that contain the TOTAL data for a year.

    from bs4 import BeautifulSoup
    from zipfile import ZipFile
    import os
    
    datadir = "data"
    
    
    def open_zip(datadir):
        with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
            myzip.extractall()
    
    
    def process_all(datadir):
        files = os.listdir(datadir)
        return files
    
    
    def process_file(f):
        """
        This function extracts data from the file given as the function argument in
        a list of dictionaries. This is example of the data structure you should
        return:
    
        data = [{"courier": "FL",
                 "airport": "ATL",
                 "year": 2012,
                 "month": 12,
                 "flights": {"domestic": 100,
                             "international": 100}
                },
                {"courier": "..."}
        ]
    
        Note: create a new dictionary for each entry in the output data list.
        If you use the info dictionary defined here each element in the list 
        will be a reference to the same info dictionary.
        """
      
        with open("{}/{}".format(datadir, f), "r") as html:
            data = []
            info = {"courier": "",
                    "airport": "",
                    "year": "",
                    "month": "",
                    "flights": {"domestic": "",
                             "international": ""}
                }
            info["courier"], info["airport"] = f[:6].split("-")
    
            soup = BeautifulSoup(html,'lxml')
            table_data = soup.find('table',{'class':'dataTDRight'})
            for tr in table_data.find_all('tr'):
                td = tr.find_all('td')
                
                if td[1].string =='Month' or td[1].string =='TOTAL':
                    continue
                else:
                    info['year'] = int(td[0].string)
                    info['month'] = int(td[1].string)
                    info['flights']['domestic']= int(td[2].string.replace(',',''))
                    info['flights']['international']= int(td[3].string.replace(',',''))
                    
                data.append(info)
            
    
        return data
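
    Putting the helpers together: a sketch that processes every downloaded file in the data directory (it assumes the directory contains only the "{carrier}-{airport}.html" files):

    all_data = []
    for fname in process_all(datadir):
        all_data.extend(process_file(fname))
    print len(all_data), "rows in total"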
    

    7.4 Splitting concatenated XML documents

    The patent file is not valid XML because it contains several root elements and XML declarations; it is really a number of XML documents concatenated together. The task is to split the file into separate documents and process each one as a valid XML document.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # So, the problem is that the gigantic file is actually not a valid XML, because
    # it has several root elements, and XML declarations.
    # It is, a matter of fact, a collection of a lot of concatenated XML documents.
    # So, one solution would be to split the file into separate documents,
    # so that you can process the resulting files as valid XML documents.
    
    import xml.etree.ElementTree as ET
    PATENTS = 'patent.data'
    
    def get_root(fname):
        tree = ET.parse(fname)
        return tree.getroot()
    
    
    def split_file(filename):
        """
        Split the input file into separate files, each containing a single patent.
        As a hint - each patent declaration starts with the same line that was
        causing the error found in the previous exercises.

        The new files should be saved with filename in the following format:
        "{}-{}".format(filename, n) where n is a counter, starting from 0.
        """
        with open(filename) as infile:
            n = -1
            # placeholder handle; it is closed and replaced as soon as the first
            # '<?xml' declaration is seen
            outfile = open('{}-{}'.format(filename, n), 'w')
            for line in infile:
                if line.startswith('<?xml'):
                    # every XML declaration marks the start of a new document
                    outfile.close()
                    n += 1
                    outfile = open('{}-{}'.format(filename, n), 'w')

                outfile.write(line)

            outfile.close()
    
    def test():
        split_file(PATENTS)
        for n in range(4):
            try:
                fname = "{}-{}".format(PATENTS, n)
                f = open(fname, "r")
                if not f.readline().startswith("<?xml"):
                    print "You have not split the file {} in the correct boundary!".format(fname)
                f.close()
            except IOError:
                print "Could not find file {}. Check if the filename is correct!".format(fname)
    
    
    test()
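
    Once the file has been split, each piece is valid XML and can be parsed on its own; a brief sketch using the get_root() helper defined above:

    first = get_root("{}-{}".format(PATENTS, 0))    # the first split document
    print first.tag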
    
