使用 HTMLParser 解析网页,主要是通过继承 HTMLParser 类并重写其处理方法(如 handle_starttag、handle_endtag、handle_data)来实现自定义的解析逻辑
from html.parser import HTMLParser
from html.entities import name2codepoint
class MyHTMLParser(HTMLParser):
    """Print the text found inside ``<a>`` tags that look like title links.

    A start tag is treated as a title link when it is an ``<a>`` tag with
    exactly three attributes and the value of its first attribute is
    ``'title'``.  While such a tag is open, every text chunk passed to
    :meth:`handle_data` is printed with a ``文章标题:`` prefix.

    Attributes:
        flag: ``'Y'`` after a matching ``<a>`` start tag has been seen and
            until the next end tag of any kind; ``'N'`` otherwise.
    """

    def __init__(self):
        super().__init__()
        self.flag = 'N'

    def handle_starttag(self, tag, attrs):
        """Switch the flag on when the start tag matches the title-link shape.

        The wanted element is recognised by its tag name, its attribute
        count and the value of its first attribute.

        Args:
            tag: Tag name as reported by the parser.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        # The length check runs first, so attrs[0] is always present here.
        if tag == 'a' and len(attrs) == 3 and attrs[0][1] == 'title':
            self.flag = 'Y'

    def handle_endtag(self, tag):
        """Switch the flag off on every end tag, whatever its name."""
        self.flag = 'N'

    def handle_startendtag(self, tag, attrs):
        """Print self-closing tags (e.g. ``<br/>``) in ``<tag/>`` form."""
        print(f'<{tag}/>')

    def handle_data(self, data):
        """Print the text of the element while the flag is on.

        Args:
            data: Text chunk located between tags.
        """
        if self.flag == 'Y':
            print(f"文章标题:{data}")

    # Optional hooks that are intentionally left disabled:
    #
    # def handle_comment(self, data):
    #     pass  # would receive the text of <!-- ... --> comments
    #
    # def handle_entityref(self, name):
    #     pass  # would receive named references of the form &name;
    #
    # def handle_charref(self, name):
    #     pass  # would receive numeric references of the form &#NNN;
    #
    # NOTE: HTMLParser is created with its default convert_charrefs=True
    # here, and in that mode the two reference hooks are never called;
    # references are decoded and delivered through handle_data instead.
if __name__ == '__main__':
    # Script entry point: parse the saved page and print the matching titles.
    # The ``with`` block closes the file on exit, so no explicit close() call
    # is needed.
    with open('result.html', 'r', encoding='utf-8') as f:
        html_text = f.read()

    parser = MyHTMLParser()
    parser.feed(html_text)
    # Flush any text the parser is still buffering after the last tag.
    parser.close()
网友评论