Features
- Maintain a queue of URLs waiting to be crawled
- Deduplicate URLs
- Extract the URLs we want
#!/usr/bin/env python
# coding=utf-8
import urllib2
import re
import urlparse


def download(url, num_retries=2):
    print 'downloading:', url
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
    try:
        request = urllib2.Request(url, headers=headers)
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'download error:', e.reason
        html = None
        if num_retries > 0:
            # retry only on 5xx server errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    # queue of URLs waiting to be crawled
    crawl_queue = [seed_url]
    # set of URLs already seen, for deduplication
    seen_url = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            continue
        for link in get_links(html):
            if re.match(link_regex, link):
                base_url = 'http://bbs.mumayi.com'
                # resolve the relative link against the site root
                link = urlparse.urljoin(base_url, link)
                if link not in seen_url:
                    print 'link=', link
                    # mark as seen so the same link is not queued twice
                    seen_url.add(link)
                    crawl_queue.append(link)


def get_links(html):
    # extract the href value of every <a> tag
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)


# matches thread links such as 'thread-6339929-1-1.html'
if __name__ == '__main__':
    link_crawler("http://bbs.mumayi.com/forum-8-1.html", 'thread-[0-9]{1,}-1-1.html')
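
The script above is Python 2 only (urllib2 and print statements). As a point of reference, here is a minimal sketch of the same downloader ported to Python 3, where urllib2 is split into urllib.request and urllib.error; the retry logic and the retry-on-5xx behavior are assumed to stay the same as in the original:

#!/usr/bin/env python3
# A minimal Python 3 sketch of the downloader above (an assumption, not
# part of the original post): urllib2 was split into urllib.request and
# urllib.error in Python 3.
import urllib.request
import urllib.error


def download(url, num_retries=2):
    print('downloading:', url)
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        request = urllib.request.Request(url, headers=headers)
        # read() returns bytes in Python 3, so decode to text
        html = urllib.request.urlopen(request).read().decode('utf-8', errors='ignore')
    except urllib.error.URLError as e:
        print('download error:', e.reason)
        html = None
        # retry only on 5xx server errors, as in the Python 2 version
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, num_retries - 1)
    return html

The rest of the crawler carries over with urlparse.urljoin replaced by urllib.parse.urljoin.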