简书文章爬虫解析

作者: ZemelZhu | 来源:发表于2018-09-01 13:36 被阅读0次

简书文章爬虫解析
Swift爬虫通过作者ID无接口获取简书文章列表，正则匹配HTML
js2x：简书 to Hexo 格式转换器
无标题文章
爬取简书数据生成api
新手向爬虫（三）别人的爬虫在干啥
我有新专题，而你有声音吗？
Python爬虫从0开始学（1）
爬虫技术(二)－新的思路
爬虫入门到放弃系列02：html网页如何解析

核心解析

# Fetch one article page and extract its title and body.
def AnalysisUrl(url):
    """Download the article at ``url``, clean up its body and save it.

    The body is the first ``div.show-content-free`` element.  Each
    ``div.image-package`` wrapper inside it is replaced by a plain
    ``<img>`` tag, ``<b>`` is rewritten to ``<strong>``, and the opening
    ``show-content-free`` tag plus every ``</div>`` are stripped.  The
    result is passed to ``writeArticle`` together with the ``og:title``
    meta value.

    Any ordinary failure (network error, missing element, ...) is reported
    on stdout and swallowed so a batch run can move on to the next URL.
    ``KeyboardInterrupt`` / ``SystemExit`` are deliberately NOT swallowed.
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            page_source = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()

        # Parse the whole page.
        # NOTE(review): no parser is named here, so bs4 uses whichever one
        # is installed; consider pinning one for reproducible output.
        html = BeautifulSoup(page_source)

        # Article body as an HTML string.
        content = str(html.select('div[class="show-content-free"]')[0])

        # Query-string suffix appended to every image URL; the width that
        # follows it is taken from the image container's max-width.
        imgpatt = '?imageMogr2/auto-orient/strip%7CimageView2/2/w/'
        patternImgSize = re.compile(r'<div class="image-container" '
                                    r'style="max-width: (.*?)px;', re.S)

        for package in html.select('div[class="image-package"]'):
            package_html = str(package)
            # Look the <img> up inside its own wrapper instead of pairing
            # two independent selections by index, which mis-pairs as soon
            # as the two lists differ in length.
            picture = package.find('img')
            src = picture.attrs['data-original-src'] + imgpatt \
                + patternImgSize.findall(package_html)[0]
            # Collapse the whole wrapper into a single <img> tag.
            content = content.replace(package_html,
                                      "<img src=\"" + src + "\">")

        # Normalisation required by the target rich-text format.
        content = content.replace("<b>", "<strong>")
        content = content.replace("</b>", "</strong>")
        content = content.replace('<div class="show-content-free">', "")
        content = content.replace("</div>", "")

        # Article title.
        title = html.select('meta[property="og:title"]')[0].attrs['content']

        # Persist the cleaned body under the article title.
        writeArticle(content, title)
    except Exception as e:
        # Best effort: report and continue.  ``Exception`` (rather than a
        # bare ``except``) lets Ctrl-C still abort the run.
        print("该文章解析失败 url:" + url)
        print("  reason: " + repr(e))

解析时过滤掉简书的自定义标签，得到的文章内容再用其他富文本编辑器逆向解析即可

用vue quill editor解析

vue quill editor解析显示.PNG

完整代码

# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import re
import time

# HTTP request headers sent with every article request.
# The User-Agent is spelled out in full: the previous value contained a
# literal ellipsis placeholder (a non-ASCII character), which is not a valid
# header value.
# NOTE(review): the Cookie is a hard-coded session id and has presumably
# expired - confirm whether it is still needed.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) "
                  "Gecko/20100101 Firefox/56.0",
    "Content-Type": "application/json;charset=UTF-8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Cookie": "JSESSIONID=2D1E55287F8B056E83FD29B114FBA389"
}

# Fetch one article page and extract its title and body.
def AnalysisUrl(url):
    """Download the article at ``url``, clean up its body and save it.

    The body is the first ``div.show-content-free`` element.  Each
    ``div.image-package`` wrapper inside it is replaced by a plain
    ``<img>`` tag, ``<b>`` is rewritten to ``<strong>``, and the opening
    ``show-content-free`` tag plus every ``</div>`` are stripped.  The
    result is passed to ``writeArticle`` together with the ``og:title``
    meta value.

    Any ordinary failure (network error, missing element, ...) is reported
    on stdout and swallowed so a batch run can move on to the next URL.
    ``KeyboardInterrupt`` / ``SystemExit`` are deliberately NOT swallowed.
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            page_source = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()

        # Parse the whole page.
        # NOTE(review): no parser is named here, so bs4 uses whichever one
        # is installed; consider pinning one for reproducible output.
        html = BeautifulSoup(page_source)

        # Article body as an HTML string.
        content = str(html.select('div[class="show-content-free"]')[0])

        # Query-string suffix appended to every image URL; the width that
        # follows it is taken from the image container's max-width.
        imgpatt = '?imageMogr2/auto-orient/strip%7CimageView2/2/w/'
        patternImgSize = re.compile(r'<div class="image-container" '
                                    r'style="max-width: (.*?)px;', re.S)

        for package in html.select('div[class="image-package"]'):
            package_html = str(package)
            # Look the <img> up inside its own wrapper instead of pairing
            # two independent selections by index, which mis-pairs as soon
            # as the two lists differ in length.
            picture = package.find('img')
            src = picture.attrs['data-original-src'] + imgpatt \
                + patternImgSize.findall(package_html)[0]
            # Collapse the whole wrapper into a single <img> tag.
            content = content.replace(package_html,
                                      "<img src=\"" + src + "\">")

        # Normalisation required by the target rich-text format.
        content = content.replace("<b>", "<strong>")
        content = content.replace("</b>", "</strong>")
        content = content.replace('<div class="show-content-free">', "")
        content = content.replace("</div>", "")

        # Article title.
        title = html.select('meta[property="og:title"]')[0].attrs['content']

        # Persist the cleaned body under the article title.
        writeArticle(content, title)
    except Exception as e:
        # Best effort: report and continue.  ``Exception`` (rather than a
        # bare ``except``) lets Ctrl-C still abort the run.
        print("该文章解析失败 url:" + url)
        print("  reason: " + repr(e))


# Write the article to disk so another rich-text editor can parse it.
def writeArticle(content, title):
    """Write ``content`` to ``<title>.txt`` in the current directory.

    Characters that are path separators or are not allowed in file names
    on common platforms (``\\ / : * ? " < > |``) are replaced with ``_``
    so that a title such as ``a/b`` cannot make ``open`` fail or write
    outside the current directory.  Titles without such characters map to
    the same file name as before.
    """
    file_name = title
    for forbidden in '\\/:*?"<>|':
        file_name = file_name.replace(forbidden, "_")
    with open(file_name + ".txt", "w") as f:
        f.write(content)

if __name__ == "__main__":
    # Read the article URLs (one per line, produced by auto.py) and parse
    # each article in turn.
    fetched = 0
    # ``with`` closes the file on exit; iterating it stops cleanly at EOF,
    # so AnalysisUrl is never called with an empty string.
    with open("articleUrl.txt") as url_file:
        for line in url_file:
            # Drop surrounding whitespace, including "\r\n" line endings.
            url = line.strip()
            if not url:
                # Skip blank lines instead of issuing a doomed request.
                continue
            AnalysisUrl(url)
            fetched = fetched + 1
            if fetched >= 10:
                # Throttle: pause 3 seconds after every 10 articles.
                time.sleep(3)
                fetched = 0