美文网首页
python爬虫-笔趣阁

python爬虫-笔趣阁

作者: 小明童鞋的大哥的弟弟 | 来源:发表于2018-07-02 17:37 被阅读0次
    from urllib.request import quote, unquote
    import requests
    from bs4 import BeautifulSoup
    import sys
    import time
    
    # --- Novel downloader for biquge (www.biquge.com.tw) ---
    # Searches the site for a novel by name, then appends the author info and
    # every chapter from the table of contents into one local .txt file.
    content = input('请输入你想要查找的小说名:')
    initial_content = content
    # The site expects the search keyword percent-encoded as gb2312.
    keyword = quote(initial_content, encoding='gb2312')
    url = 'http://www.biquge.com.tw/modules/article/soshu.php?searchkey=' + keyword
    print(url)
    # NOTE: renamed from `re` — that name shadows the stdlib regex module.
    response = requests.get(url)  # search request against biquge
    # Let requests guess the real charset, then decode with it.
    response.encoding = response.apparent_encoding
    print(response.status_code)
    soup = BeautifulSoup(response.text, 'html.parser')
    file_name = '/Users/john/Desktop/小说/' + initial_content + '.txt'
    print(file_name)

    chapters = soup.find_all(id='list')   # table-of-contents container
    info = soup.find_all(id='info')       # book title / author block
    download_soup = BeautifulSoup(str(chapters), 'html.parser')
    # Collect chapter nodes once instead of walking dl.children twice
    # (the original did one pass just to count and a second to download).
    chapter_nodes = [child for child in download_soup.dl.children
                     if hasattr(child, 'href') and child.a is not None]
    num_chapters = len(chapter_nodes)
    print(num_chapters)

    time1 = time.time()  # start timestamp (seconds)
    # `with` guarantees the file is closed even if a request/parse raises.
    with open(file_name, 'a', encoding='utf-8') as file:
        for link in info:
            file.write(link.get_text())  # book/author metadata header
        for index, child in enumerate(chapter_nodes, start=1):
            file.write(child.get_text() + '\n' + '-----------------------------------------------' + '\n')
            chapter_url = 'http://www.biquge.com.tw/' + child.a['href']
            response_dl = requests.get(chapter_url)
            response_dl.encoding = response_dl.apparent_encoding
            soup_dl = BeautifulSoup(response_dl.text, 'html.parser')
            # Chapter body lives in <div id="content">.
            contents = soup_dl.find_all(id='content')
            for link in contents:
                file.write(link.get_text() + '\n\n')
            print("已下载:%.3f%%" % float(index / num_chapters * 100))  # progress
    time2 = time.time()
    tt = (time2 - time1)
    print('花费时间:' + str(tt) + '秒')
    
    主要是根据笔趣阁的搜索url,加上书名,拿到目录,然后一章章下载到本地。我用的是mac,Windows系统需要改一下保存地址
    
    

    相关文章

      网友评论

          本文标题:python爬虫-笔趣阁

          本文链接:https://www.haomeiwen.com/subject/criluftx.html