目前的功能：解析 div 标签以及 div 内的 a 标签（更新中……）
# -*- coding:utf-8 -*-
import re
def get_div_ele(pattern, any_str):
    """Extract elements from HTML text using a tiny XPath-like selector.

    Only selectors shaped like ``//tag[@attr="value"]`` are understood, and
    only the ``a`` and ``div`` tags are handled; any other tag yields ``[]``.
    The attribute is matched in the HTML as ``attr="value"`` (double quotes).

    Args:
        pattern: Selector string, e.g. ``//div[@class="box"]``.
        any_str: HTML text to search.

    Returns:
        For ``a``: one dict per match, shaped
        ``{'href': [...], 'text': {'texts': [...], 'ele': [...]}}``.
        For ``div``: one string per matching element, from its opening tag
        through its balancing closing tag, with line breaks, tabs and runs
        of spaces removed.
        An empty list when nothing matches (a notice is printed) or when the
        tag is unsupported.

    Raises:
        IndexError: If ``pattern`` lacks the tag, the ``@attr=`` part or the
            quoted value.
    """
    tag = re.findall(r'//([a-z]+)\[', pattern)[0]
    ele = re.findall(r'@(\w+)?=', pattern)[0]
    va = re.findall(r'["\'](.*)["\']', pattern)[0]
    # Escape the user-supplied pieces so characters such as "+" or "." in an
    # attribute value are matched literally instead of as regex syntax.
    attr_re = f'{re.escape(ele)}="{re.escape(va)}"'

    if tag == 'a':
        found = _collect_links(any_str, tag, attr_re)
    elif tag == 'div':
        found = _collect_blocks(any_str, tag, attr_re)
    else:
        return []

    if not found:
        print(f'{tag} pattern没有匹配到')
    return found


def _collect_links(html, tag, attr_re):
    """Describe each span running from the matching attribute to ``</tag>``."""
    span_re = re.compile(f'{attr_re}.*?</{tag}>', re.S)
    inner_re = re.compile(f'>(.*?)</{tag}>')
    links = []
    for chunk in span_re.findall(html):
        # The chunk starts at the attribute itself, so only an href that
        # appears after that attribute is picked up.
        links.append({
            'href': re.findall('href="(.*?)"', chunk),
            'text': {
                'texts': re.findall('>(.*?)<', chunk),
                'ele': inner_re.findall(chunk),
            },
        })
    return links


def _collect_blocks(html, tag, attr_re):
    """Return every element whose opening tag starts with ``<tag attr="value"``."""
    opener = re.compile(rf'<{tag}\s+{attr_re}[^>]*>')
    blocks = []
    for hit in opener.finditer(html):
        block = _balanced_element(html, hit.start(), tag)
        # An opener without a balancing closing tag is skipped.
        if block is not None:
            blocks.append(_squeeze(block))
    return blocks


def _balanced_element(html, start, tag):
    """Return the text from ``start`` through the closing tag that balances it."""
    tag_token = re.compile(rf'<(/)?{tag}\b[^>]*>')
    depth = 0
    for token in tag_token.finditer(html, start):
        if token.group(1) is None:
            depth += 1
            continue
        depth -= 1
        if depth == 0:
            return html[start:token.end()]
    return None


def _squeeze(fragment):
    """Drop line breaks, tabs and indentation runs from an HTML fragment."""
    for junk in ('\\\n', '\\\t', '\\\r', '\n', '\t', '\r'):
        fragment = fragment.replace(junk, '')
    # NOTE(review): the published listing shows several identical
    # single-space replacements here, which would strip every space and break
    # attribute values such as class="a b". Presumably they were runs of
    # spaces collapsed by the page rendering; only runs of two or more spaces
    # are removed here - confirm against the original script.
    return re.sub(' {2,}', '', fragment)
# Driver: read the saved page baidu_ad.html, pull out every matching ad
# container, and pair each container with the links found inside it.
# Every container is kept; assigning result['data'] inside the loop would
# leave only the last one.
result = {'data': []}
with open('baidu_ad.html', 'r', encoding='utf-8') as rr:
    text = rr.read()

for block in get_div_ele('//div[@class="_3te7bpt f13 c-gap-top-xsmall"]', text):
    result['data'].append({
        'result': block,
        'urls': get_div_ele('//a[@class="c-showurl c-color-gray"]', block),
    })
print(result)
网友评论