# 神马 (Shenma) search — parser for a saved results page
from lxml import etree
# Characters removed from scraped text: newline, ASCII space, no-break space
# (U+00A0) and ideographic space (U+3000).  The original chained three
# identical-looking space replacements; the non-ASCII variants are spelled
# out as escapes so they cannot be flattened again by copy/paste.
_STRIPPED_WHITESPACE = str.maketrans('', '', '\n \xa0\u3000')


def replaces(x):
    """Join the string fragments in *x* and remove layout whitespace.

    Args:
        x: An iterable of strings (for example a list of XPath text or
            attribute results).  A single string is also accepted, since
            joining its characters reproduces it.

    Returns:
        One string with every newline, space, no-break space and
        ideographic space removed.  An empty iterable yields ``''``.
    """
    # A single translate pass replaces the chain of str.replace calls.
    return ''.join(x).translate(_STRIPPED_WHITESPACE)
# Read the saved results page once at import time; Parser.parse_all parses
# the module-level `text` produced here.
with open('sm_ad.html', encoding='utf-8') as page_file:
    text = page_file.read()
class Parser:
    """Turn a saved search-results page into a list of result records.

    Every direct child ``div`` of ``div#results`` (except the
    ``ad_commonjs`` / ``ad-alert-info`` blocks) is treated as one result and
    reduced to a dict with the keys ``ad``, ``rank``, ``title``, ``desc``
    and ``url``.
    """

    # Direct children of the results container, minus the two excluded blocks.
    _RESULT_XPATH = (
        '//div[@id="results"]'
        '/div[not(@name="ad_commonjs" or @class="ad-alert-info")]'
    )
    # A span whose text is "广告" (advertisement) flags the result as an ad.
    _AD_XPATH = './/span[@class="c-padding-left-s cpc-adtext" and text()="广告"]'
    # Candidate selectors are tried in order; the first one that matches wins.
    _URL_XPATHS = (
        './/a[contains(@class,"c-line-clamp")]/@href',
        './/div[contains(@class,"c-title-l")]/a[@class="c-header-inner c-flex-1"]/@href',
        './/a[contains(@class,"c-flex c-flex-between")]/@href',
        './/a[contains(@class,"c-title")]/@href',
        './/a[contains(@class,"c-e-button--v1_0_0 c-e-btn c-e-btn-l")]/@href',
        # NOTE(review): every anchor matched here is already matched by the
        # contains(@class,"c-title") selector above, so this entry can never
        # be the one that wins.  Kept for parity with the original list.
        './/a[@class="c-title cpc-title-sc"]/@href',
    )
    _TITLE_XPATHS = (
        './/span[@class="title-text"]//text()',
        './/span[@click_area="title"]//text()',
        './/i[@c-bind="data.text"]//text()',
    )
    _DESC_XPATHS = (
        './/*[contains(@class,"c-line-clamp-3")]/text()',
        './/span[@click_area="desc_text"]/text()',
        './/span[@class="js-c-paragraph-text"]//text()',
    )
    # Blocks with these titles ("others also searched", "related
    # recommendations") are skipped and do not consume a rank.
    _SKIPPED_TITLES = frozenset(('其他人还搜了', '相关推荐'))

    @staticmethod
    def _first_match(node, xpaths):
        """Return the result list of the first XPath in *xpaths* that matches *node*."""
        for expression in xpaths:
            found = node.xpath(expression)
            if found:
                return found
        return []

    def parse_all(self, html=None):
        """Parse the results page and return one record per result block.

        Args:
            html: HTML source to parse.  When omitted, the module-level
                ``text`` is used, which keeps existing ``parse_all()`` calls
                working unchanged.

        Returns:
            A list of dicts in page order.  Each dict holds ``ad`` (bool),
            ``rank`` (1-based position among the kept results), ``title``,
            ``desc`` and ``url`` (whitespace-stripped strings, ``''`` when
            nothing matched).  Each record is also printed as it is built.
        """
        if html is None:
            html = text
        tree = etree.HTML(html)
        if tree is None:
            # Guard in case the parser produced no root element.
            return []

        results = []
        for block in tree.xpath(self._RESULT_XPATH):
            title = replaces(self._first_match(block, self._TITLE_XPATHS))
            if title in self._SKIPPED_TITLES:
                continue
            hrefs = self._first_match(block, self._URL_XPATHS)
            item = {
                'ad': bool(block.xpath(self._AD_XPATH)),
                'rank': len(results) + 1,
                'title': title,
                'desc': replaces(self._first_match(block, self._DESC_XPATHS)),
                # Keep only the first href: joining several matched links
                # would glue them into a single invalid URL.
                'url': replaces(hrefs[:1]),
            }
            results.append(item)
            print(item)
        return results
if __name__ == '__main__':
    # Script entry point: parse the page loaded at import time and print
    # every extracted record.
    Parser().parse_all()
# 搜狗 (Sogou) search — parser for a saved results page
from lxml import etree
# Characters removed from scraped text: newline, ASCII space, no-break space
# (U+00A0) and ideographic space (U+3000).  The original chained three
# identical-looking space replacements; the non-ASCII variants are spelled
# out as escapes so they cannot be flattened again by copy/paste.
_STRIPPED_WHITESPACE = str.maketrans('', '', '\n \xa0\u3000')


def replaces(x):
    """Join the string fragments in *x* and remove layout whitespace.

    Args:
        x: An iterable of strings (for example a list of XPath text or
            attribute results).  A single string is also accepted, since
            joining its characters reproduces it.

    Returns:
        One string with every newline, space, no-break space and
        ideographic space removed.  An empty iterable yields ``''``.
    """
    # A single translate pass replaces the chain of str.replace calls.
    return ''.join(x).translate(_STRIPPED_WHITESPACE)
# Read the saved results page once at import time; Parser.parse_all parses
# the module-level `text` produced here.
with open('sogou_ad.html', encoding='utf-8') as page_file:
    text = page_file.read()
class Parser:
    """Turn a saved search-results page into a list of result records.

    Every direct child ``div`` of ``div#results`` (except the
    ``ad_commonjs`` / ``ad-alert-info`` blocks) is treated as one result and
    reduced to a dict with the keys ``ad``, ``rank``, ``title``, ``desc``
    and ``url``.

    NOTE(review): the class names in the selectors below look specific to
    one engine's markup; confirm they match the page saved in
    ``sogou_ad.html``.
    """

    # Direct children of the results container, minus the two excluded blocks.
    _RESULT_XPATH = (
        '//div[@id="results"]'
        '/div[not(@name="ad_commonjs" or @class="ad-alert-info")]'
    )
    # A span whose text is "广告" (advertisement) flags the result as an ad.
    _AD_XPATH = './/span[@class="c-padding-left-s cpc-adtext" and text()="广告"]'
    # Candidate selectors are tried in order; the first one that matches wins.
    _URL_XPATHS = (
        './/a[contains(@class,"c-line-clamp")]/@href',
        './/div[contains(@class,"c-title-l")]/a[@class="c-header-inner c-flex-1"]/@href',
        './/a[contains(@class,"c-flex c-flex-between")]/@href',
        './/a[contains(@class,"c-title")]/@href',
        './/a[contains(@class,"c-e-button--v1_0_0 c-e-btn c-e-btn-l")]/@href',
        # NOTE(review): every anchor matched here is already matched by the
        # contains(@class,"c-title") selector above, so this entry can never
        # be the one that wins.  Kept for parity with the original list.
        './/a[@class="c-title cpc-title-sc"]/@href',
    )
    _TITLE_XPATHS = (
        './/span[@class="title-text"]//text()',
        './/span[@click_area="title"]//text()',
        './/i[@c-bind="data.text"]//text()',
    )
    _DESC_XPATHS = (
        './/*[contains(@class,"c-line-clamp-3")]/text()',
        './/span[@click_area="desc_text"]/text()',
        './/span[@class="js-c-paragraph-text"]//text()',
    )
    # Blocks with these titles ("others also searched", "related
    # recommendations") are skipped and do not consume a rank.
    _SKIPPED_TITLES = frozenset(('其他人还搜了', '相关推荐'))

    @staticmethod
    def _first_match(node, xpaths):
        """Return the result list of the first XPath in *xpaths* that matches *node*."""
        for expression in xpaths:
            found = node.xpath(expression)
            if found:
                return found
        return []

    def parse_all(self, html=None):
        """Parse the results page and return one record per result block.

        Args:
            html: HTML source to parse.  When omitted, the module-level
                ``text`` is used, which keeps existing ``parse_all()`` calls
                working unchanged.

        Returns:
            A list of dicts in page order.  Each dict holds ``ad`` (bool),
            ``rank`` (1-based position among the kept results), ``title``,
            ``desc`` and ``url`` (whitespace-stripped strings, ``''`` when
            nothing matched).  Each record is also printed as it is built.
        """
        if html is None:
            html = text
        tree = etree.HTML(html)
        if tree is None:
            # Guard in case the parser produced no root element.
            return []

        results = []
        for block in tree.xpath(self._RESULT_XPATH):
            title = replaces(self._first_match(block, self._TITLE_XPATHS))
            if title in self._SKIPPED_TITLES:
                continue
            hrefs = self._first_match(block, self._URL_XPATHS)
            item = {
                'ad': bool(block.xpath(self._AD_XPATH)),
                'rank': len(results) + 1,
                'title': title,
                'desc': replaces(self._first_match(block, self._DESC_XPATHS)),
                # Keep only the first href: joining several matched links
                # would glue them into a single invalid URL.
                'url': replaces(hrefs[:1]),
            }
            results.append(item)
            print(item)
        return results
if __name__ == '__main__':
    # Script entry point: parse the page loaded at import time and print
    # every extracted record.
    Parser().parse_all()
# 网友评论 (reader comments) — heading carried over from the source web page