美文网首页
HTMLParser爬虫新闻内容

HTMLParser爬虫新闻内容

作者: huashen_9126 | 来源:发表于2020-04-18 22:24 被阅读0次
from html.parser import HTMLParser
from html.entities import name2codepoint

class MyHTMLParser(HTMLParser):
    """HTML parser that prints the text content of the markup fed to it.

    Only ``handle_data`` is overridden, so tags, comments and other markup
    are consumed by the base class without producing any output.
    """

    def handle_data(self, data):
        """Print a text node on its own line, ignoring blank nodes.

        Args:
            data: Text found between tags, as passed in by ``HTMLParser``.
        """
        text = data.strip()
        # Whitespace-only nodes (the indentation and newlines between tags)
        # would otherwise show up as empty lines in the output.
        if text:
            print(text)

# Parser instance that prints the text nodes of whatever HTML is fed to it.
parser = MyHTMLParser()

import requests
import re

url = 'https://new.qq.com/omn/20200418/20200418A0QEEO00.html'
# A timeout keeps the script from hanging forever on an unresponsive server.
rep = requests.get(url, timeout=10)
# Stop on HTTP error statuses instead of parsing an error page.
rep.raise_for_status()
# When the response declares no charset, requests falls back to ISO-8859-1
# for text content, which garbles Chinese pages; use the encoding detected
# from the body in that case.
if 'charset' not in rep.headers.get('Content-Type', '').lower():
    rep.encoding = rep.apparent_encoding
data = rep.text
# Capture everything between the opening tag of the LEFT column and the
# opening tag of the RIGHT column.
x = re.search(r'<div class="LEFT">([\s\S]*)<div id="RIGHT" class="RIGHT">', data)
if x is None:
    # The page layout did not match; exit with a clear message rather than
    # an AttributeError on ``None.group``.
    raise SystemExit('article container not found in %s' % url)
parser.feed(x.group(1))
# Flush any text the parser is still buffering at the end of the input.
parser.close()

输出:新闻的文本内容

相关文章

网友评论

      本文标题:HTMLParser爬虫新闻内容

      本文链接:https://www.haomeiwen.com/subject/sluzvhtx.html