美文网首页
python爬虫(4)-多线程

python爬虫(4)-多线程

作者: 初灬终 | 来源:发表于2017-11-27 21:22 被阅读3次

环境:python3.6
目标:使用多线程下载整本小说。

import requests
import re
import threading

# Download one chapter of the novel.
class MyThread(threading.Thread):
    """Thread that fetches a single chapter page and saves it to disk.

    Args:
        threadID: Numeric identifier stored on the instance (informational).
        threadName: Label stored on the instance (informational; it is not
            passed to ``threading.Thread`` as the thread's ``name``).
        url: Chapter link WITHOUT the ``.html`` suffix, as captured by
            ``findLinks``.
    """

    def __init__(self, threadID, threadName, url):
        super().__init__()
        self.threadID = threadID
        self.threadName = threadName
        self.url = url

    def run(self):
        """Download ``baselink + url + '.html'`` and write it to a local file.

        ``baselink`` is a module-level global looked up when the thread runs.
        The same ``url + '.html'`` string is used as the output path.
        NOTE(review): if ``url`` begins with ``/`` the file is written at the
        filesystem root rather than the working directory; confirm intent.
        """
        itemurl = self.url + '.html'
        itemreq = requests.get(baselink + itemurl)
        # Force UTF-8 decoding of the response body.
        itemreq.encoding = 'utf-8'
        # Explicit encoding: the platform default (e.g. GBK on Windows) may be
        # unable to encode the chapter text. ``with`` guarantees the handle is
        # closed even if the write raises.
        with open(itemurl, 'w', encoding='utf-8') as itemf:
            itemf.write(itemreq.text)

# Find every link in the page that points at an .html file.
def findLinks(htmlstring):
    """Return the targets of all ``href="….html"`` attributes, suffix removed.

    Args:
        htmlstring: HTML text to scan.

    Returns:
        A list of strings, one per match in document order, each being the
        href value with the trailing ``.html`` stripped. Empty if none match.
    """
    # ``[^"]+?`` keeps the capture inside a single quoted attribute value, so
    # an earlier non-.html href (e.g. a stylesheet) cannot be glued onto a
    # later one; ``\.`` matches a literal dot rather than any character.
    linkPattern = re.compile(r'href="([^"]+?)\.html"')
    return linkPattern.findall(htmlstring)

# Fetch the index page.
baselink = 'http://chuanyuezhe.yetianzi.com'
req = requests.get(baselink)
req.encoding = 'utf-8'

# Extract the chapter links from the index page.
alllink = findLinks(req.text)

# Download every chapter, one thread per distinct link.
# A link that appears more than once is fetched only once, so two threads
# never write the same output file concurrently. ``index`` is the position of
# the link's first occurrence, which is what ``alllink.index(item)`` returned.
threads = []
seen = set()
for index, item in enumerate(alllink):
    if item in seen:
        continue
    seen.add(item)
    threads.append(MyThread(index, item + "%s" % index, item))

# Start all downloads before waiting on any of them; joining right after each
# start() would run the downloads one at a time.
# NOTE: this starts as many threads as there are distinct links.
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

相关文章

网友评论

      本文标题:python爬虫(4)-多线程

      本文链接:https://www.haomeiwen.com/subject/etkjbxtx.html