美文网首页
基于传送门的微信公众号爬虫

基于传送门的微信公众号爬虫

作者: sixkery | 来源:发表于2018-10-25 14:21 被阅读48次

最后输出PDF格式,暂不稳定,有待改进。


import requests
from lxml import etree
from bs4 import BeautifulSoup
import pdfkit



headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
KEYWORD = 'duhaoshu'

# 判断该公众号是否被传送门网站收录。
def judge(nameid):
    url = 'http://chuansong.me/account/' + nameid + '?start=' + str(0)
    response =requests.get(url,headers=headers)
    html = etree.HTML(response.text)
    url = html.xpath('//div[contains(@class,"topic_page")]/h1/text()')
    if '404' in url:
        print('暂未收录该公众号的文章。')
    else:
        pass



def parse_one_page(url):
    r = requests.get(url,headers=headers)
    soup = BeautifulSoup(r.text,'lxml')
    data = soup.find_all('div',{'tabindex':'-1'})
    for html in data:
        detail_url = html.find_all('a')[0].get('href')
        full_url = 'http://chuansong.me' + detail_url
        parse_detail(full_url)
        break


def parse_detail(full_url):
    r = requests.get(full_url, headers=headers)
    data = etree.HTML(r.text)

    title = data.xpath('//h2[@class="rich_media_title"]/text()')[0]
    publish_time = data.xpath('//em[@id="publish_time"]/text()')[0]

    # url 装换成 pdf
    path_wk = r'F:\Downloads\html-pdf\wkhtmltopdf\bin\wkhtmltopdf.exe'  # 安装wkhtmltopdf的位置
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    print(title, publish_time)
    pdfkit.from_url(full_url,title + '.pdf', configuration=config)
    print('转换成功!')


def main():
    for page in range(1):
        url = 'http://chuansong.me/account/' + KEYWORD + '?start={}'.format(page)
        parse_one_page(url)
if __name__ == '__main__':
    main()



相关文章

网友评论

      本文标题:基于传送门的微信公众号爬虫

      本文链接:https://www.haomeiwen.com/subject/maaqtqtx.html