A simple Python crawler implementation

Author: 大道至简_Andy | Published 2017-11-15 00:02

    Features

    1. Maintain the list of URLs waiting to be crawled
    2. Deduplicate URLs
    3. Extract the URLs of interest
    #!/usr/bin/env python
    # coding=utf-8
    
    import urllib2
    import re
    import urlparse
    
    
    def download(url, num_retries=2):
        print 'downloading:', url

        # Pretend to be a desktop browser so the server serves the page normally
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
        try:
            request = urllib2.Request(url, headers=headers)
            html = urllib2.urlopen(request).read()
        except urllib2.URLError as e:
            print 'download error:', e.reason
            html = None
            if num_retries > 0:
                # Retry only on 5xx server errors; 4xx client errors will not recover
                if hasattr(e, 'code') and 500 <= e.code < 600:
                    return download(url, num_retries - 1)
        return html
    
    
    def link_crawler(seed_url, link_regex):
        # List of URLs waiting to be crawled
        crawl_queue = [seed_url]
        # Set of URLs already queued, for deduplication
        seen_url = set(crawl_queue)

        while crawl_queue:
            url = crawl_queue.pop()
            html = download(url)
            if html is None:
                continue
            for link in get_links(html):
                if re.match(link_regex, link):
                    # Resolve relative links against the page they were found on
                    link = urlparse.urljoin(url, link)
                    if link not in seen_url:
                        print 'link=', link
                        seen_url.add(link)
                        crawl_queue.append(link)
    
    
    def get_links(html):
        # Return every href value found in <a> tags on the page
        webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
        return webpage_regex.findall(html)
    
    
    # Matches forum thread links like 'thread-6339929-1-1.html'
    if __name__ == '__main__':
        link_crawler("http://bbs.mumayi.com/forum-8-1.html", r'thread-\d+-1-1\.html')
    
