美文网首页
解析sm和sogou

解析sm和sogou

作者: 是东东 | 来源:发表于2022-06-27 03:10 被阅读0次

神马(sm)

from lxml import etree

# Characters removed by ``replaces``: newline, ASCII space, non-breaking
# space and ideographic (full-width) space.
# NOTE(review): the published snippet chained three identical
# ``replace(' ', '')`` calls; presumably the extra two targeted the
# non-breaking / full-width variants before the page normalised them -- confirm.
_STRIP_TABLE = str.maketrans('', '', '\n \xa0\u3000')


def replaces(x):
    """Join text fragments into one string and drop newlines and spaces.

    Args:
        x: An iterable of strings (such as an XPath result list) or a
            single string.

    Returns:
        The concatenated text with every character listed in
        ``_STRIP_TABLE`` removed. An empty iterable yields ``''``.
    """
    # One translate pass replaces the chain of ``str.replace`` calls.
    return ''.join(x).translate(_STRIP_TABLE)

# Read the saved page into the module-level ``text`` once, at import time.
with open('sm_ad.html', encoding='utf-8') as page_file:
    text = page_file.read()


class Parser():
    """Extract ranked result entries from a saved search-results HTML page."""

    # Direct children of the results container, excluding the ad script
    # block and the ad alert banner.
    _RESULT_XPATH = '//div[@id="results"]/div[not(@name="ad_commonjs" or @class="ad-alert-info")]'

    # A result is flagged as an ad when it carries this "广告" label.
    _AD_XPATH = './/span[@class="c-padding-left-s cpc-adtext" and text()="广告"]'

    # Candidate expressions, tried in order; the first non-empty match wins.
    # The last URL expression can never win: any anchor whose class is
    # exactly "c-title cpc-title-sc" is already matched by the
    # contains(@class,"c-title") expression before it. It is kept so the
    # list mirrors the original fallback chain.
    _URL_XPATHS = (
        './/a[contains(@class,"c-line-clamp")]/@href',
        './/div[contains(@class,"c-title-l")]/a[@class="c-header-inner c-flex-1"]/@href',
        './/a[contains(@class,"c-flex c-flex-between")]/@href',
        './/a[contains(@class,"c-title")]/@href',
        './/a[contains(@class,"c-e-button--v1_0_0 c-e-btn c-e-btn-l")]/@href',
        './/a[@class="c-title cpc-title-sc"]/@href',
    )
    _TITLE_XPATHS = (
        './/span[@class="title-text"]//text()',
        './/span[@click_area="title"]//text()',
        './/i[@c-bind="data.text"]//text()',
    )
    _DESC_XPATHS = (
        './/*[contains(@class,"c-line-clamp-3")]/text()',
        './/span[@click_area="desc_text"]/text()',
        './/span[@class="js-c-paragraph-text"]//text()',
    )

    # Blocks with these titles are skipped and do not consume a rank.
    _SKIPPED_TITLES = ('其他人还搜了', '相关推荐')

    def __init__(self):
        pass

    @staticmethod
    def _first_match(node, expressions):
        """Return the first non-empty XPath result for *node*, else ``[]``."""
        for expression in expressions:
            matched = node.xpath(expression)
            if matched:
                return matched
        return []

    def parse_all(self, html=None):
        """Parse every result block, print each entry and return them all.

        Args:
            html: HTML source to parse. When omitted, the module-level
                ``text`` is used, as before.

        Returns:
            A list of dicts with keys ``ad``, ``rank``, ``title``, ``desc``
            and ``url``, in page order. ``rank`` starts at 1 and counts
            only the entries that were kept.
        """
        if html is None:
            html = text
        urls = []
        tree = etree.HTML(html)
        if tree is None:
            # NOTE(review): etree.HTML may return None for an empty
            # document in some lxml versions -- confirm; nothing to parse.
            return urls
        rank = 0
        for detail in tree.xpath(self._RESULT_XPATH):
            title = replaces(self._first_match(detail, self._TITLE_XPATHS))
            if title in self._SKIPPED_TITLES:
                continue
            rank += 1
            # NOTE(review): if several anchors match the winning URL
            # expression, replaces() concatenates all of their hrefs into
            # one string -- confirm that is intended.
            item = {
                'ad': bool(detail.xpath(self._AD_XPATH)),
                'rank': rank,
                'title': title,
                'desc': replaces(self._first_match(detail, self._DESC_XPATHS)),
                'url': replaces(self._first_match(detail, self._URL_XPATHS)),
            }
            urls.append(item)
            print(item)
        # The original built this list and then discarded it.
        return urls


# Script entry point: parse the loaded page and print every entry.
if __name__ == '__main__':
    Parser().parse_all()

搜狗(sogou)

from lxml import etree

# Characters removed by ``replaces``: newline, ASCII space, non-breaking
# space and ideographic (full-width) space.
# NOTE(review): the published snippet chained three identical
# ``replace(' ', '')`` calls; presumably the extra two targeted the
# non-breaking / full-width variants before the page normalised them -- confirm.
_STRIP_TABLE = str.maketrans('', '', '\n \xa0\u3000')


def replaces(x):
    """Concatenate text fragments and remove newlines and spaces.

    Args:
        x: An iterable of strings (such as an XPath result list) or a
            single string.

    Returns:
        The joined text with every character listed in ``_STRIP_TABLE``
        removed. An empty iterable yields ``''``.
    """
    # A single translate pass stands in for the chained ``str.replace`` calls.
    return ''.join(x).translate(_STRIP_TABLE)

# Read the saved page into the module-level ``text`` once, at import time.
with open('sogou_ad.html', encoding='utf-8') as page_file:
    text = page_file.read()


class Parser():
    """Collect ranked result entries from a saved search-results HTML page."""

    # Direct children of the results container, leaving out the ad script
    # block and the ad alert banner.
    _RESULT_XPATH = '//div[@id="results"]/div[not(@name="ad_commonjs" or @class="ad-alert-info")]'

    # Presence of this "广告" label marks a result as an ad.
    _AD_XPATH = './/span[@class="c-padding-left-s cpc-adtext" and text()="广告"]'

    # Ordered fallbacks: the first expression yielding a non-empty result
    # is used. The final URL expression is shadowed by the earlier
    # contains(@class,"c-title") one, which matches a superset of its
    # anchors; it is retained to mirror the original chain.
    _URL_XPATHS = (
        './/a[contains(@class,"c-line-clamp")]/@href',
        './/div[contains(@class,"c-title-l")]/a[@class="c-header-inner c-flex-1"]/@href',
        './/a[contains(@class,"c-flex c-flex-between")]/@href',
        './/a[contains(@class,"c-title")]/@href',
        './/a[contains(@class,"c-e-button--v1_0_0 c-e-btn c-e-btn-l")]/@href',
        './/a[@class="c-title cpc-title-sc"]/@href',
    )
    _TITLE_XPATHS = (
        './/span[@class="title-text"]//text()',
        './/span[@click_area="title"]//text()',
        './/i[@c-bind="data.text"]//text()',
    )
    _DESC_XPATHS = (
        './/*[contains(@class,"c-line-clamp-3")]/text()',
        './/span[@click_area="desc_text"]/text()',
        './/span[@class="js-c-paragraph-text"]//text()',
    )

    # Blocks carrying one of these titles are dropped before ranking.
    _SKIPPED_TITLES = ('其他人还搜了', '相关推荐')

    def __init__(self):
        pass

    @staticmethod
    def _first_match(node, expressions):
        """Return the first non-empty XPath result for *node*, else ``[]``."""
        for expression in expressions:
            matched = node.xpath(expression)
            if matched:
                return matched
        return []

    def parse_all(self, html=None):
        """Parse every result block, print each entry and return the list.

        Args:
            html: HTML source to parse. Defaults to the module-level
                ``text`` when omitted, matching the previous behaviour.

        Returns:
            A list of dicts with keys ``ad``, ``rank``, ``title``, ``desc``
            and ``url``, in page order. ``rank`` is 1-based and only
            advances for entries that are kept.
        """
        if html is None:
            html = text
        urls = []
        tree = etree.HTML(html)
        if tree is None:
            # NOTE(review): etree.HTML may return None for an empty
            # document in some lxml versions -- confirm; nothing to parse.
            return urls
        rank = 0
        for detail in tree.xpath(self._RESULT_XPATH):
            title = replaces(self._first_match(detail, self._TITLE_XPATHS))
            if title in self._SKIPPED_TITLES:
                continue
            rank += 1
            # NOTE(review): multiple matching anchors have their hrefs
            # concatenated into one string by replaces() -- confirm that
            # is intended.
            item = {
                'ad': bool(detail.xpath(self._AD_XPATH)),
                'rank': rank,
                'title': title,
                'desc': replaces(self._first_match(detail, self._DESC_XPATHS)),
                'url': replaces(self._first_match(detail, self._URL_XPATHS)),
            }
            urls.append(item)
            print(item)
        # Previously the accumulated list was never handed back to the caller.
        return urls


# Script entry point: parse the loaded page and print every entry.
if __name__ == '__main__':
    Parser().parse_all()

相关文章

网友评论

      本文标题:解析sm和sogou

      本文链接:https://www.haomeiwen.com/subject/dmlnvrtx.html