Downloading History Books and Selected Poetry Collections from 古诗文网 with a Python Class

Author: 鱿鱼炸酱面 | Published 2019-05-24 21:50

In my spare time I used the requests HTTP library together with BeautifulSoup to scrape the history books and some of the poetry collections on 古诗文网 (gushiwen.org), wrapped up as a single class.

Features:

1. Search for and download a history book by name (e.g. '史记', '汉书', '明史').
2. Search for and download a poetry collection by name (e.g. '唐诗三百首', '爱情', '小学古诗'); the URL trick behind this lookup is sketched right after this list.
3. The download folder is named after the title actually found on the site (for a collection this may be longer than the name entered); by default it is created next to the script.
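Here is the lookup trick behind the collection search: the class converts the Chinese name into a pinyin slug with xpinyin and substitutes it into a fixed URL pattern. A minimal sketch (the name is one of the examples above; the URL pattern is taken straight from get_gushi_url in the listing below):

from xpinyin import Pinyin

name = '唐诗三百首'
slug = Pinyin().get_pinyin(name).replace('-', '')  # 'tangshisanbaishou'
url = 'https://so.gushiwen.org/gushi/%s.aspx' % slug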

The full code is as follows:

import os
import random
import requests
from bs4 import BeautifulSoup
import re
import time
from xpinyin import Pinyin


class GuShiWen(object):
    """
      古诗文网操作类
    """
    def __init__(self):
        """
        初始化
        """
        # 浏览器标识集
        user_agent = [
            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) "
            "Version/5.1 Safari/534.50",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) "
            "Version/5.1 Safari/534.50",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; ."
            "NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        ]
        # Pick a User-Agent at random
        rand_agent = random.choice(user_agent)
        # Assemble the request headers
        self.header = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "User-Agent": rand_agent,
        }

    def get_proxy_content(self):
        """
        获取代理ip
        :return:代理ip
        """
        with open('hosts.txt','r') as fp:
            content = fp.read().split('\n')
        return content

    def get_cate_div_list(self, url, cate_label):
        """
        Fetch the chapter blocks from a table-of-contents page
        :param url: table-of-contents page url
        :param cate_label: CSS class of the chapter blocks ("typecont" or "bookcont")
        :return: list of chapter block divs / None
        """
        try:
            response = requests.get(url, headers=self.header)
        except requests.RequestException:
            return None
        response.encoding = 'utf8'
        result = response.content
        su = BeautifulSoup(result, 'lxml')
        sonsdiv = su.find('div', class_="sons")
        catedivs = sonsdiv.find_all('div', class_=cate_label)
        return catedivs

    def get_gushi(self, url):
        """
        获得古诗或者文章的详情页
        :param url: url
        :return: 古诗url返回的页面的BeautifulSoup对象格式/None
        """
        try:
            response = requests.get(url, headers=self.header)
        except:
            return None
        response.encoding = 'utf8'
        result = response.content
        su = BeautifulSoup(result, 'lxml')
        return su

    def get_gushi_url(self, name):
        """
        判断专题的页面是否存在
        :param url: url
        :return: 古诗专题url/False,名称/None
        """
        # print(name)
        name_len = len(name)

        # print(pinyin)
        if len(name) == 2:
            pinyin = Pinyin().get_pinyin(name).replace('-', '')
            url = 'https://so.gushiwen.org/gushi/%s.aspx' % pinyin
            response = requests.get(url, headers=self.header)
            # print(response.status_code)
            content = response.content.decode('utf8')
            # print(content)
            if '该网页不存在或存在错误' in content:
                print('没有找到‘%s’相关古诗文专题!' % name)
                return False
            else:
                # print(url)
                return url
        else:
            rand_name = []
            for i in range(name_len - 1):
                for y in range(i + 1, name_len):
                    rand_name.append(name[i] + name[y])
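            # e.g. for '小学古诗' rand_name becomes
            # ['小学', '小古', '小诗', '学古', '学诗', '古诗']; the slug may
            # cover only two characters of a longer name, so every ordered
            # pair of characters is tried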
            for r_name in rand_name:
                pinyin = Pinyin().get_pinyin(r_name).replace('-', '')
                url = 'https://so.gushiwen.org/gushi/%s.aspx' % pinyin
                response = requests.get(url, headers=self.header)
                content = response.content.decode('utf8')
                if '该网页不存在或存在错误' in content:
                    if r_name == rand_name[-1]:
                        print("No poetry-collection topic found for '%s'!" % name)
                        return False
                    continue
                else:
                    su = BeautifulSoup(content, 'lxml')
                    title = su.find('div', class_='title').get_text().strip()
                    if name in title:
                        return url
                    else:
                        if r_name == rand_name[-1]:
                            print("No poetry-collection topic found for '%s'!" % name)
                            return False
                        continue

    def get_guji_url(self, dirname):
        """
        判断史籍存在与否并返回目录页url
        :param dirname: 史籍名
        :return: 古籍目录页面url/False
        """
        url = 'https://so.gushiwen.org/search.aspx?value=' + dirname
        try:
            response = requests.get(url, headers=self.header)
        except:
            return None
        response.encoding = 'utf8'
        result = response.content
        # print(result)
        su = BeautifulSoup(result, 'lxml')
        div = su.find('div', class_="sonspic")
        # print(div)
        if div is None:
            print('没有找到"%s"相关史籍!' % dirname)
            return False
        find_p = div.find('div',class_='cont').find('p')
        find_name = find_p.find('b').get_text()
        if find_name != dirname:
            print('没有找到"%s"相关史籍!' % dirname)
            return False
        url = find_p.find('a')['href']
        # print(url)
        # print(find_name)
        url = 'https://so.gushiwen.org/' + url
        # print(url)
        return url
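
    # Note: get_guji_url('史记') queries
    # https://so.gushiwen.org/search.aspx?value=史记 and returns the book's
    # table-of-contents url only when the first result's bolded title is
    # exactly '史记'.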

    def download_poem(self):
        """
        下载专题诗集
        :return:无
        """
        name = input('请输入要下载的古诗集专题名:')
        # 查询并获取输入诗集名的目录页面url
        url= self.get_gushi_url(name)
        # print(url)
        # 如果找不到诗集,下载失败
        if not url:
            print('下载诗集《%s》失败!' % name)
            return False
        # 创建路径准备下载
        if not os.path.exists(name):
            os.mkdir(name)
        # 将目录页面分解为大章节块
        cate_div_list = self.get_cate_div_list(url, "typecont")
        # 按章节进行下载
        self.download_cate(cate_div_list, name)

    def download_guji(self):
        """
        下载史籍
        :return: 无
        """
        name = input('请输入要下载史籍名:')
        # 查询并获取输入史籍名的目录页面url
        url = self.get_guji_url(name)
        # print(url)
        # 如果找不到史籍,下载失败
        if not url:
            print('下载史籍《%s》失败!' % name)
            return False
        # 创建路径准备下载
        if not os.path.exists(name):
            os.mkdir(name)
        # 将目录页面分解为大章节块
        cate_div_list = self.get_cate_div_list(url, "bookcont")
        # 按大章节进行下载
        self.download_cate(cate_div_list,name)

    def download_detail(self, gushilist, dir_name, catename=None):
        """
        Download each individual poem or chapter in the list
        :return: None
        """
        s = 1
        for x in gushilist:
            title = x.get_text()
            try:
                # Strip a trailing parenthesised note from the title, if any
                title = re.findall(r'(.*?)\(.*?\)', title)[0]
            except IndexError:
                pass
            detailurl = 'https://so.gushiwen.org' + x.find('a')['href']
            gushi = self.get_gushi(detailurl)
            if gushi is None:
                # Skip entries whose detail page could not be fetched
                continue
            age_author = gushi.find('p', class_='source').get_text()
            content = gushi.find('div', class_='contson').get_text()
            allcontent = title + '\n--' + age_author + '\n' + content
            if not catename:
                path = os.path.join('./' + dir_name, title + '.txt')
            else:
                path = os.path.join('./' + dir_name, catename, title + '.txt')
            with open(path, 'wb') as f:
                f.write(allcontent.encode('utf8'))
            if catename:
                print("Downloaded '%s' / '%s', item %d: '%s'!" % (dir_name, catename, s, title))
            else:
                print("Downloaded '%s', item %d: '%s'!" % (dir_name, s, title))
            s += 1
        if catename:
            print("Finished downloading chapter '%s' of '%s'!" % (catename, dir_name))
        time.sleep(2)

    def download_cate(self, cate_div_list, dir_name):
        """
        下载详情章节下的内容
        :return: 无
        """
        # print(cate_div_list)
        if len(cate_div_list) == 1:
            gushilist = cate_div_list[0].find_all('span')
            self.download_detail(gushilist, dir_name)
        else:
            for i in range(len(cate_div_list)):
                catediv = cate_div_list[i]
                catename = catediv.find('div',class_="bookMl").get_text()
                if not os.path.exists(os.path.join(os.path.dirname(__file__),dir_name,catename)):
                    os.mkdir(os.path.join(os.path.dirname(__file__),dir_name,catename))
                gushilist = catediv.find_all('span')
                self.download_detail(gushilist, dir_name, catename)
        print('下载《%s》成功!' % dir_name)
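
    # Note: a table of contents with a single chapter block is downloaded
    # flat into dir_name; with several blocks, each chapter gets its own
    # sub-folder named after its "bookMl" heading.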

    def download(self):
        """
        选择要下载的类型进行下载
        :return: 无
        """
        choice = input('请选择你要下载的类型(1.史书 2.诗集),输入对应数字:')
        if choice == '1':
            self.download_guji()
        if choice == '2':
            self.download_poem()


if __name__ == '__main__':
    # Instantiate the scraper
    gushiwen = GuShiWen()

    # Start an interactive download
    gushiwen.download()

    # ***************Tests*******************************
    # gushiwen.get_gushi_url('小学古诗')
    # gushiwen.get_guji_url('元史')
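
One loose end worth noting: get_proxy_content reads proxies from hosts.txt but is never wired into any request. A minimal sketch of how it could be used, assuming hosts.txt holds one proxy per line in the form http://host:port (that file format, and this usage, are assumptions rather than part of the original script):

# Hypothetical proxy usage (not in the original class):
gushiwen = GuShiWen()
proxy = random.choice(gushiwen.get_proxy_content())  # e.g. 'http://1.2.3.4:8080'
response = requests.get('https://so.gushiwen.org/gushi/tangshisanbaishou.aspx',
                        headers=gushiwen.header,
                        proxies={'http': proxy, 'https': proxy})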

The output of a run looks like this:

[Screenshot: 运行效果.png]

The downloaded files look like this:

[Screenshot: 下载文件夹.png]


Source: https://www.haomeiwen.com/subject/lcgnzqtx.html