美文网首页
只用re正则解析html

只用re正则解析html

作者: 是东东 | 来源:发表于2021-10-08 03:46 被阅读0次

    目前的功能:解析 div 标签,以及 div 内的 a 标签(持续更新中……)

    # -*- coding:utf-8 -*-
    import re
    
    
    def get_div_ele(pattern: str, any_str: str) -> list:
        """Extract ``<div>`` or ``<a>`` elements from HTML using only ``re``.

        ``pattern`` is a minimal XPath-like selector of the form
        ``//tag[@attr="value"]``. Only the tags ``div`` and ``a`` are handled;
        any other tag yields an empty list.

        Args:
            pattern: Selector such as ``//div[@class="foo"]``. The value may be
                wrapped in single or double quotes.
            any_str: The HTML text to search.

        Returns:
            For ``a``: one dict per matching anchor, shaped
            ``{'href': [...], 'text': {'texts': [...], 'ele': [...]}}``.
            For ``div``: one string per matching element, running from its
            opening tag to its balancing ``</div>``, with line breaks, tabs and
            double spaces removed.
            An empty list when nothing matches (a message is printed as well).

        Raises:
            IndexError: If ``pattern`` has no ``//tag[``, ``@attr=`` or quoted
                value part.
        """
        tag = re.findall(r'//([a-z]+)\[', pattern)[0]
        ele = re.findall(r'@(\w+)?=', pattern)[0]
        va = re.findall(r'["\'](.*)["\']', pattern)[0]
        if tag == 'a':
            return _find_anchors(ele, va, any_str)
        if tag == 'div':
            return _find_divs(ele, va, any_str)
        return []


    def _find_anchors(ele: str, va: str, any_str: str) -> list:
        """Return href/text details for every ``<a>`` whose ``ele`` attribute equals ``va``."""
        # Match the whole anchor, so an href written before the selected
        # attribute is captured too. The value is escaped to keep it literal.
        anchor_re = re.compile(
            rf'<a\b[^>]*?\s{ele}="{re.escape(va)}"[^>]*>.*?</a>', re.S
        )
        anchors = [
            {
                'href': re.findall(r'href="(.*?)"', fragment),
                'text': {
                    # Every text run between a '>' and the next '<'.
                    'texts': re.findall(r'>(.*?)<', fragment),
                    # Everything between the opening tag and '</a>'.
                    'ele': re.findall(r'>(.*?)</a>', fragment),
                },
            }
            for fragment in anchor_re.findall(any_str)
        ]
        if not anchors:
            print('a pattern没有匹配到')
        return anchors


    def _find_divs(ele: str, va: str, any_str: str) -> list:
        """Return every ``<div>`` whose ``ele`` attribute equals ``va``, tag-balanced."""
        # Region from the first exactly-matching opening tag to the last </div>.
        region = re.search(
            rf'<div {ele}="{re.escape(va)}">.*</div>', any_str, re.S
        )
        if region is None:
            print('div pattern没有匹配到')
            return []

        div = region.group(0)
        # Backslash-prefixed forms must go first, otherwise the bare
        # replacements would leave a stray backslash behind.
        for junk in ('\\\n', '\\\t', '\\\r', '\n', '\t', '\r', '  '):
            div = div.replace(junk, '')

        head = f'<div {ele}="{va}"'
        tag_edge = re.compile(r'<div\b|</div>')
        blocks = []
        start = div.find(head)
        while start != -1:
            # Walk the div open/close tags from this start and stop where the
            # nesting depth returns to zero: that is the element's own end tag.
            depth = 0
            for edge in tag_edge.finditer(div, start):
                depth += -1 if edge.group(0) == '</div>' else 1
                if depth == 0:
                    blocks.append(div[start:edge.end()])
                    break
            # Unbalanced candidates (depth never reaches zero) are skipped.
            start = div.find(head, start + len(head))
        return blocks
    
    
    # Demo: pull every ad block out of a saved Baidu results page, then list
    # the display-URL anchors found inside each block.
    with open('baidu_ad.html', 'r', encoding='utf-8') as html_file:
        page_html = html_file.read()

    ad_blocks = get_div_ele('//div[@class="_3te7bpt f13 c-gap-top-xsmall"]', page_html)
    # One entry per ad block. Assigning a single dict to result['data'] inside
    # a loop would keep only the last block, so the entries go into a list.
    result = {
        'data': [
            {
                'result': block,
                'urls': get_div_ele('//a[@class="c-showurl c-color-gray"]', block),
            }
            for block in ad_blocks
        ]
    }
    print(result)
    

    相关文章

      网友评论

          本文标题:只用re正则解析html

          本文链接:https://www.haomeiwen.com/subject/qvmtoltx.html