美文网首页
循环获取链接

循环获取链接

作者: IthinkIcanfly | 来源:发表于2018-10-09 15:40 被阅读0次
    from urllib.request import urlopen
    from urllib.error import HTTPError
    from urllib.error import URLError
    from bs4 import BeautifulSoup
    import random
    import datetime
    import re
    
    # Module-level set of pages; nothing in the visible code reads or writes it.
    pages = set()
    # Seed from the interpreter's default entropy source. Passing a datetime
    # object to random.seed() was deprecated in Python 3.9 and raises TypeError
    # on 3.11+, which would abort the script before any crawling starts.
    random.seed()
    
    def getInternalLinks(bsObject, includeUrl):
        """Collect absolute URLs for the same-site links found on a page.

        An anchor is considered internal when its ``href`` starts with ``/`` or
        with ``includeUrl`` preceded by at most two characters (for example
        ``//www.example.com/x`` or ``www.example.com/x``).

        Args:
            bsObject: Parsed page exposing ``findAll`` (a BeautifulSoup object).
            includeUrl: Host part of the current site, e.g. ``www.example.com``.

        Returns:
            A list of unique ``http://`` URLs, in document order.
        """
        internalLinks = []
        # Escape the host so its dots match literally instead of acting as
        # regex wildcards.
        pattern = re.compile('^(/|.{0,2}' + re.escape(includeUrl) + ')')
        for link in bsObject.findAll('a', href=pattern):
            rawHref = link.attrs['href']
            if rawHref is None:
                continue
            if rawHref.startswith('//'):
                # Protocol-relative link: only the scheme is missing.
                href = 'http:' + rawHref
            elif rawHref.startswith('/'):
                # Site-relative path: prefix scheme and host.
                href = 'http://' + includeUrl + rawHref
            else:
                # Host-first link: drop whatever precedes the host name. The
                # previous re.sub('^(.|/)+', ...) removed the entire href and
                # left just 'http://'.
                href = 'http://' + rawHref[rawHref.find(includeUrl):]
            if href not in internalLinks:
                internalLinks.append(href)
        return internalLinks
    
    def splitAddress(address):
        """Split a URL into its slash-separated parts, without the scheme.

        Every occurrence of ``http://`` and ``https://`` is removed before
        splitting, so element 0 of the result is the host for both schemes.
        Previously only ``http://`` was stripped and an ``https://`` URL
        yielded ``'https:'`` as its first part.

        Args:
            address: URL string, with or without a scheme prefix.

        Returns:
            A list of the parts obtained by splitting on ``/``.
        """
        withoutScheme = address.replace('https://', '').replace('http://', '')
        return withoutScheme.split('/')
    
    def getExternalLinks(bsObject, excludeUrl):
        """Collect the links on a page that point away from the current site.

        An anchor qualifies when its ``href`` starts with ``http`` and does not
        contain ``excludeUrl`` anywhere after that prefix.

        Args:
            bsObject: Parsed page exposing ``findAll`` (a BeautifulSoup object).
            excludeUrl: Host part of the current site; links containing it are
                skipped.

        Returns:
            A list of unique href strings, in document order.
        """
        externalLinks = []
        # Escape the host so it is matched literally: unescaped, each dot would
        # match any character and a regex metacharacter could raise re.error.
        pattern = re.compile('^(http)((?!' + re.escape(excludeUrl) + ').)*$')
        for link in bsObject.findAll('a', {'href': pattern}):
            href = link.attrs['href']
            if href is not None and href not in externalLinks:
                externalLinks.append(href)
        return externalLinks
    
    def getRandomExternalLink(startingPage):
        """Return one randomly chosen external link reachable from a page.

        The page is downloaded and parsed. If it has external links, one is
        picked at random. Otherwise a random internal link is followed and the
        search repeats there. The search is a loop rather than recursion, so a
        long chain of pages without external links cannot exhaust the call
        stack.

        Args:
            startingPage: URL of the page to start from.

        Returns:
            The href of a randomly selected external link.

        Raises:
            ValueError: If a visited page has neither external nor internal
                links. The original code raised ValueError in this case too,
                from ``random.randint(0, -1)``.
            Any exception raised by ``urlopen`` for the visited pages.
        """
        currentPage = startingPage
        while True:
            # The context manager closes the HTTP response once it is parsed.
            with urlopen(currentPage) as html:
                bsObject = BeautifulSoup(html, 'html.parser')
            domain = splitAddress(currentPage)[0]
            externalLinks = getExternalLinks(bsObject, domain)
            if externalLinks:
                return random.choice(externalLinks)
            internalLinks = getInternalLinks(bsObject, domain)
            if not internalLinks:
                raise ValueError('no links found on page: ' + currentPage)
            # No way out from this page: move to another page of the same site.
            currentPage = random.choice(internalLinks)
    
    def followExternalOnly(startingSite):
        """Hop from site to site forever, following random external links.

        Each hop prints the chosen link and continues from it. When fetching a
        site fails with HTTPError, ValueError or URLError, the walk falls back
        to the site it came from and picks again. If the starting site itself
        fails, the exception propagates to the caller.

        The walk is a loop with an explicit trail of visited sites. The
        original recursed once per hop and hit RecursionError after roughly a
        thousand links.

        Args:
            startingSite: URL of the site where the walk begins.

        Raises:
            HTTPError, ValueError, URLError: If the walk backs out all the way
                to ``startingSite`` and that site fails as well.
        """
        # Trail of sites; the last element is the one currently being explored.
        trail = [startingSite]
        while True:
            try:
                externalLink = getRandomExternalLink(trail[-1])
            except (HTTPError, ValueError, URLError):
                # Dead end: drop this site and retry from the previous one.
                trail.pop()
                if not trail:
                    raise
                continue
            print('随机外链是: ' + externalLink)
            trail.append(externalLink)
    
    # Script entry point: start the random external-link walk at Baidu's home
    # page. followExternalOnly has no stop condition, so this call is not
    # expected to return normally.
    followExternalOnly('http://www.baidu.com/')
    
    

    相关文章

      网友评论

          本文标题:循环获取链接

          本文链接:https://www.haomeiwen.com/subject/clxyaftx.html