美文网首页
urllib、pyquery下载笔趣阁小说

urllib、pyquery下载笔趣阁小说

作者: sunny2786 | 来源:发表于2018-09-25 15:03 被阅读0次

    author: sunny
    title: urllib、pyquery下载笔趣阁小说
    date: 2018-09-25 14:28:04
    categories: 编程
    tags: python


    一、小说章节路径获取

    1、爬取的小说名为摸金天师,小说首页为http://www.biquge.com.tw/18_18128/,通过urllib.request.urlopen获取页面HTTPResponse类型的对象,再通过read()方法获取页面内容

    # Excerpt from the body of a fetch helper: `url` and `headers` are
    # supplied by the surrounding code, which is not shown in this snippet.
    request = urllib.request.Request(url, headers=headers)
    try:
        content = urllib.request.urlopen(request)
        # The response bytes are decoded as GBK before being returned.
        text = str(content.read(), encoding = 'gbk')
        content.close()
        return text
    except urllib.error.URLError as e:
        # On a URL error, print the reason and return an empty string
        # instead of propagating the exception.
        print(e.reason)
        return ''
    

    2、审查章节元素,获取章节路径

    chapters
    def get_all_chapter(self):
        """Collect the title and URL of every chapter link on the index page.

        Fetches ``self.url`` through ``self.request``, selects every anchor
        matched by ``#list a`` and appends, per anchor, its text to
        ``self.chapter_titles`` and ``self.domain`` + its ``href`` to
        ``self.chapter_urls``.
        """
        index_page = pq(self.request(self.url))
        for link in index_page('#list a').items():
            title = link.text()
            # The URL is built by prefixing the href with the site domain.
            chapter_url = self.domain + link.attr('href')
            self.chapter_titles.append(title)
            self.chapter_urls.append(chapter_url)
    

    3、审查页面元素,获取每个章节内容

    content
    def get_content(self, url):
        """Return the text of the chapter page at *url*.

        The page is fetched through ``self.request``; the text of the
        ``#content`` element is extracted and every run of four non-breaking
        spaces (``\\xa0``) is replaced by a blank line.
        """
        chapter_page = pq(self.request(url))
        raw_text = chapter_page('#content').text()
        return raw_text.replace('\xa0' * 4, '\n\n')
    

    4、将文章输出txt

    def write(self, name, path, txt):
        """Append one chapter to the text file at *path*.

        The chapter is written as the title line *name*, then the body
        *txt*, then a blank line separating it from the next chapter.

        Args:
            name: Chapter title, written on its own line.
            path: Path of the output file; it is opened in append mode and
                encoded as UTF-8.
            txt: Chapter body, either a single string or an iterable of
                strings.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            if isinstance(txt, str):
                # writelines() on a str would emit it one character at a time.
                f.write(txt)
            else:
                f.writelines(txt)
            f.write('\n\n')
    

    二、完整代码

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    # author: sunny
    
    import urllib.request
    from pyquery import PyQuery as pq
    import sys, random, time
    
    class DownloadNovel:
        """Download the chapters of a novel hosted on biquge.

        Typical use: create an instance with the novel's index-page URL, call
        ``get_all_chapter()`` to fill ``chapter_titles`` / ``chapter_urls``,
        then call ``get_content()`` and ``write()`` for each chapter.
        """

        # Pool of browser User-Agent strings; request() picks one at random
        # for every call. Every entry must be followed by a comma: adjacent
        # string literals are silently concatenated into one value.
        USER_AGENTS = (
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        )

        def __init__(self, url, sleep_download_time=5,
                     domain='http://www.biquge.com.tw'):
            """Set up a downloader for one novel.

            Args:
                url: URL of the novel's index (chapter list) page.
                sleep_download_time: Seconds to sleep before every HTTP
                    request.
                domain: Prefix prepended to each chapter link's ``href`` to
                    build the chapter URL.
            """
            self.url = url
            self.chapter_urls = []
            self.chapter_titles = []
            self.domain = domain
            self.sleep_download_time = sleep_download_time

        def get_all_chapter(self):
            """Fill ``chapter_titles`` and ``chapter_urls`` from the index page.

            Every anchor matched by ``#list a`` contributes its text as the
            title and ``self.domain`` + its ``href`` as the URL. Anchors
            without an ``href`` are skipped so both lists stay aligned.
            """
            html = self.request(self.url)
            if not html:
                # The download failed or was empty (request() already printed
                # the reason for a URL error), so there is nothing to parse.
                return
            doc = pq(html)
            for a in doc('#list a').items():
                href = a.attr('href')
                if not href:
                    continue
                self.chapter_titles.append(a.text())
                self.chapter_urls.append(self.domain + href)

        def get_content(self, url):
            """Return the text of the chapter page at *url*.

            The text of the ``#content`` element is extracted and every run
            of four non-breaking spaces (``\\xa0``) is turned into a blank
            line. Returns ``''`` when the page could not be downloaded.
            """
            html = self.request(url)
            if not html:
                return ''
            content = pq(html)('#content').text()
            return content.replace('\xa0' * 4, '\n\n')

        def write(self, name, path, txt):
            """Append one chapter to the text file at *path*.

            Args:
                name: Chapter title, written on its own line.
                path: Output file path; opened in append mode, UTF-8 encoded.
                txt: Chapter body, a string or an iterable of strings.
            """
            with open(path, 'a', encoding='utf-8') as f:
                f.write(name + '\n')
                if isinstance(txt, str):
                    # writelines() on a str would emit it character by
                    # character.
                    f.write(txt)
                else:
                    f.writelines(txt)
                f.write('\n\n')

        def request(self, url):
            """Fetch *url* and return its body decoded as GBK.

            Sleeps ``self.sleep_download_time`` seconds first, then sends the
            request with a randomly chosen User-Agent header.

            Returns:
                The decoded page text, or ``''`` if ``urllib`` raised a
                ``URLError`` (its reason is printed).
            """
            time.sleep(self.sleep_download_time)
            # Rotate the User-Agent on every request.
            headers = {
                'User-Agent': random.choice(self.USER_AGENTS)
            }
            request = urllib.request.Request(url, headers=headers)
            try:
                # The with-block closes the response even if read() fails.
                with urllib.request.urlopen(request) as response:
                    raw = response.read()
            except urllib.error.URLError as e:
                print(e.reason)
                return ''
            # errors='replace' keeps a single undecodable byte from aborting
            # the whole download with UnicodeDecodeError.
            return raw.decode('gbk', errors='replace')
    
    if __name__ == '__main__':
        # Download every chapter of the novel and append it to one text file.
        dl = DownloadNovel('http://www.biquge.com.tw/18_18128/')
        dl.get_all_chapter()
        total = len(dl.chapter_titles)
        chapters = zip(dl.chapter_urls, dl.chapter_titles)
        for done, (chapter_url, title) in enumerate(chapters, start=1):
            print('url=%s, title=%s' % (chapter_url, title))
            txt = dl.get_content(chapter_url)
            dl.write(title, '摸金天师.txt', txt)
            # Progress is shown as a real percentage of chapters finished
            # (scaled by 100 and counted after the chapter is written), so
            # the last chapter reports 100%. '\r' rewrites the same line.
            sys.stdout.write('  已下载:%.3f%%' % (100.0 * done / total) + '\r')
            sys.stdout.flush()
        print('下载完成')
    

    三、效果

    效果

    四、源码

    源码链接

    相关文章

      网友评论

          本文标题:urllib、pyquery下载笔趣阁小说

          本文链接:https://www.haomeiwen.com/subject/tgkloftx.html