Simple Crawler Project (Part 1)
The basic modules of a simple crawler
The URL downloader
from urllib import request

class HtmlDownLoader(object):
    """Downloads the raw HTML of a page."""
    def __init__(self):
        super(HtmlDownLoader, self).__init__()

    def download(self, url):
        if url is None:
            return None
        # note: urlopen raises an HTTPError for 4xx/5xx responses,
        # so the status check below only catches the remaining cases
        response = request.urlopen(url)
        if response.getcode() != 200:
            return None
        print('download success')
        return response.read()
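A quick smoke test of the downloader might look like this (the URL is just a placeholder; substitute any page you are allowed to fetch):

downloader = HtmlDownLoader()
html = downloader.download('http://example.com')  # placeholder URL
if html is not None:
    print(len(html), 'bytes downloaded')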
The URL manager
class UrlManager(object):
    """Tracks URLs waiting to be crawled and URLs already crawled."""
    def __init__(self):
        super(UrlManager, self).__init__()
        self.new_urls = set()   # URLs not yet crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            # go through add_new_url so already-crawled URLs are not re-queued
            self.add_new_url(url)

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def has_new_url(self):
        return len(self.new_urls) != 0
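A small check of the manager's de-duplication behavior (the URLs are made up for illustration):

manager = UrlManager()
manager.add_new_url('http://example.com/a')
manager.add_new_url('http://example.com/a')   # duplicate, ignored
url = manager.get_new_url()                   # returns the URL and moves it to old_urls
manager.add_new_url(url)                      # already crawled, ignored
print(manager.has_new_url())                  # False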
The HTML parser
from bs4 import BeautifulSoup

class HtmlParser(object):
    """Parses downloaded pages; left as a stub in this project."""
    def __init__(self):
        super(HtmlParser, self).__init__()

    def parse(self, page_url, html_cont):
        if html_cont is None:
            return
        soup = BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
        print(soup)

    def get_new_data(self, page_url):
        data = {}   # stub: no extraction logic yet
        return data

    def get_new_urls(self, page_url, soup):
        new_urls = set()   # stub: no link extraction yet
        return new_urls
There are many ways to parse a web page, which you can look up on your own. This parsing module is not actually used in this project, so I did not write any processing logic into it; a sketch of what the two methods could look like is below.
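If you did want to fill the parser in, a minimal sketch with BeautifulSoup might look like the following. The tag names and the use of the page title are assumptions for illustration, not part of the original project:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def get_new_urls(page_url, soup):
    # collect absolute links from every <a href=...> tag on the page
    new_urls = set()
    for link in soup.find_all('a', href=True):
        new_urls.add(urljoin(page_url, link['href']))
    return new_urls

def get_new_data(page_url, soup):
    # grab the page title as a stand-in for the real fields you care about
    data = {'url': page_url}
    title = soup.find('title')
    data['title'] = title.get_text() if title is not None else ''
    return data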
Storing the crawled data
class HtmlOutputer(object):
    """Collects crawled records and writes them out as an HTML table."""
    def __init__(self):
        super(HtmlOutputer, self).__init__()
        self.datas = set()

    def collect_data(self, data):
        if data is None:
            print('collect fail')
            return
        self.datas.add(data)

    def output_html(self):
        print(len(self.datas))
        fout = open("output.html", "w", encoding='utf-8')
        fout.write("<html>")
        fout.write("<meta charset='utf-8'>")
        fout.write("<body>")
        fout.write("<table>")
        # each data object is expected to expose jobName, salary and company
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>Job: %s</td>" % data.jobName)
            fout.write("<td>Salary: %s</td>" % data.salary)
            fout.write("<td>Company: %s</td>" % data.company)
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()

    def output_txt(self):
        pass   # plain-text output not implemented; see the CSV sketch below
Once the parser has extracted the data, the next step is to store it. There are many possible formats: the simplest is to save it directly to a text file such as TXT, JSON, or CSV. The data can also be stored in a database, such as the relational database MySQL or the non-relational stores MongoDB and Redis.
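As one example, the same job records could be written to CSV with the standard library. This is a minimal sketch, assuming each data object exposes the jobName, salary, and company attributes used above:

import csv

def output_csv(datas, path='output.csv'):
    # write one row per crawled job record
    with open(path, 'w', newline='', encoding='utf-8') as fout:
        writer = csv.writer(fout)
        writer.writerow(['jobName', 'salary', 'company'])
        for data in datas:
            writer.writerow([data.jobName, data.salary, data.company])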
If you are really interested in crawlers, have a look at the 静觅 site: it covers a very wide range of crawler-related topics and is good for learning and reference.
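To see how the four modules fit together, here is a minimal scheduler loop. It is only a sketch: the entry URL and the crawl limit are made up, and the parser methods are still the stubs shown above.

class SpiderMain(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url, max_count=10):
        # seed the queue, then loop: fetch a URL, download it, parse it
        self.manager.add_new_url(root_url)
        count = 0
        while self.manager.has_new_url() and count < max_count:
            new_url = self.manager.get_new_url()
            html_cont = self.downloader.download(new_url)
            self.parser.parse(new_url, html_cont)   # stub: just prints the soup
            count += 1
        self.outputer.output_html()

if __name__ == '__main__':
    SpiderMain().craw('http://example.com')  # placeholder entry URL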