URL (Uniform Resource Locator)
The general format of a URL is as follows (parts in square brackets [] are optional):
protocol://hostname[:port]/path[;parameters][?query][#fragment]
A URL is made up of three main parts:
(1) protocol: the first part is the scheme; Baidu, for example, uses the https protocol.
(2) hostname[:port]: the second part is the host name, with an optional port number; the default port is 80 for http (443 for https). Baidu's host name is www.baidu.com, which identifies the server.
(3) path: the third part is the location of the resource on the host, such as a directory and file name.
A web crawler fetches page content based on this URL.
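As a quick check of these parts, urllib.parse.urlparse (from the library introduced below) splits a URL into exactly these components; the example URL here is made up for illustration:
from urllib.parse import urlparse
# split an example URL into its components
result = urlparse("https://www.baidu.com:443/path/index.html;type=a?wd=python#top")
print(result.scheme)    # https
print(result.netloc)    # www.baidu.com:443
print(result.path)      # /path/index.html
print(result.params)    # type=a
print(result.query)     # wd=python
print(result.fragment)  # top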
The urllib library
urllib is a high-level library for working with HTTP. It consists of four main modules:
- request handles client-side requests
- response handles the server's responses
- parse parses URLs
- robotparser parses a site's robots.txt file; it is rarely used
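The robotparser module is not demonstrated anywhere else in these notes; a minimal sketch of what it does (checking Baidu's robots.txt) might look like this:
import urllib.robotparser
rp = urllib.robotparser.RobotFileParser()
rp.set_url("http://www.baidu.com/robots.txt")
rp.read()   # fetch and parse robots.txt
# check whether a generic crawler may fetch the search page
print(rp.can_fetch("*", "http://www.baidu.com/s?wd=python"))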
Getting response information
# Fetch the page content
import urllib.request
response = urllib.request.urlopen('http://www.baidu.com/')
html = response.read().decode("utf-8")
print(html)
# Get the response status code and headers
print(response.status)
print(response.getheaders())
print(response.getheader("Server"))
Setting a timeout
import urllib.request
response = urllib.request.urlopen("http://2018.sina.com.cn/", timeout=1)
html = response.read().decode("utf-8")
print(html)
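If the server does not answer within the timeout, urlopen raises an exception; a short sketch of catching it (the deliberately tiny timeout below just forces the error):
import socket
import urllib.error
import urllib.request
try:
    response = urllib.request.urlopen("http://2018.sina.com.cn/", timeout=0.01)
except urllib.error.URLError as e:
    # a timeout surfaces as URLError wrapping socket.timeout
    if isinstance(e.reason, socket.timeout):
        print("request timed out")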
Setting request headers and parameters
from urllib import request, parse
url = "http://2018.sina.com.cn/"
headers = {
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Host": "2018.sina.com.cn",
}
params = {  # renamed from "dict", which shadows the built-in type
    "name": "Question"
}
# urlencode turns the dict into "name=Question"; data is sent in the request body
data = bytes(parse.urlencode(params), encoding="utf8")
req = request.Request(url=url, data=data, headers=headers, method="GET")
response = request.urlopen(req)
print(response.read().decode("utf-8"))
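When data really belongs in the request body, the method is normally POST rather than GET. A sketch against the public echo service httpbin.org (not part of the original example, used here only because it accepts and echoes form posts):
from urllib import request, parse
data = bytes(parse.urlencode({"name": "Question"}), encoding="utf8")
req = request.Request(url="http://httpbin.org/post", data=data, method="POST")
response = request.urlopen(req)
print(response.read().decode("utf-8"))  # httpbin echoes the form data back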
Exception handling
from urllib import request, error
try:
response = request.urlopen("https://cuiqingcai.com/index.htm")
except error.URLError as e:
print(e.reason)
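Catching error.HTTPError (a subclass of URLError) first distinguishes an HTTP error response from a network-level failure; a sketch:
from urllib import request, error
try:
    response = request.urlopen("https://cuiqingcai.com/index.htm")
except error.HTTPError as e:
    # the server answered, but with an error status code
    print(e.code, e.reason)
except error.URLError as e:
    # the request never got a proper response (DNS failure, refused connection, ...)
    print(e.reason)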
Downloading images with a crawler
The examples that follow use the requests library instead of urllib:
pip install requests
import requests
r = requests.get("http://www.baidu.com")
print(r.status_code)  # response status code
print(r.text)         # response body as text
print(r.cookies)      # cookies set by the server
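requests can also encode query parameters and send custom headers for you; a small sketch (the wd parameter for Baidu's /s search endpoint is an assumption, used only for illustration):
import requests
payload = {"wd": "python"}  # query parameters
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"}
r = requests.get("http://www.baidu.com/s", params=payload, headers=headers)
print(r.url)  # final URL with the encoded query string appended
print(r.status_code)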
Example: scraping Maoyan movies
import requests
import re
import os

def get_one_page(offset):
    url = "http://maoyan.com/board/4?offset=%d" % offset
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # print(response.text)
        return response.text
    return None

def parse_one_page(html):
    # Ranking only:
    # pattern = re.compile("<dd>.*?board-index.*?>(.*?)</i>", re.S)
    # Ranking plus poster link:
    # pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?<img.*?<img.*?src="(.*?)"', re.S)
    # Plus cast and release date:
    # pattern = re.compile('<dd>.*?board-index.*?>(.*?)</i>.*?<img.*?<img.*?src="(.*?)".*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>', re.S)
    # pattern = re.compile("movieId.*?>(.*?)</a>", re.S)
    # pattern = re.compile('<p class="star">(.*?)</p>', re.S)
    # Extract only the poster image URLs:
    pattern = re.compile('movieId.*?>.*?<img.*?<img.*?src="(.*?)"', re.S)
    items = re.findall(pattern, html)
    return items

def get_all_page():
    for i in range(10):  # the board has 10 pages of 10 movies each
        offset = i * 10
        html = get_one_page(offset)
        items = parse_one_page(html)
        # print(items)
        for item in items:
            write_img(item)

# Image URLs look like:
# http://p0.meituan.net/movie/283292171619cdfd5b240c8fd093f1eb255670.jpg@160w_220h_1e_1c
def write_img(url):
    url_result = url.split("@")[0]  # drop the resize suffix after "@"
    filename = "./images/%s" % url_result.split("/")[-1]
    print(filename)
    os.makedirs("./images", exist_ok=True)  # create the target directory if missing
    r = requests.get(url)
    with open(filename, "wb") as f:
        f.write(r.content)

get_all_page()
'''
Alternative entry point:
def main():
    get_all_page()
if __name__ == "__main__":
    main()
'''
The Beautiful Soup parsing library
A powerful web page parsing tool: instead of relying on complex regular expressions, it navigates the structure and attributes of the page.
Installation
pip install beautifulsoup4
pip install lxml  # the lxml parser is used by several examples below
Importing BeautifulSoup
from bs4 import BeautifulSoup
Basic usage
soup = BeautifulSoup(html, "lxml")  # build a BeautifulSoup object using the lxml parser
print(soup.prettify())              # pretty-print the page with indentation
print(soup.title.string)            # text of the page's <title>
print(soup.head)
print(soup.p)
# Get a node's name
print(soup.title.name)
# Get a node's attributes
soup.img.attrs["src"]
print(soup.p.attrs)
print(soup.p.attrs["name"])
print(soup.p["class"])
# Get the text contained in a node
print(soup.p.string)
# Note: for mixed content such as
# <p class="c1"><span>asdf</span>asdfasdfasdfasdfadsfad</p>
# .string returns None; use get_text() to gather all nested text
Nested selection
<head>
<title>this is title</title>
</head>
# soup nodes are all of type bs4.element.Tag, so selection can be chained
print(soup.head.title.string)
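A self-contained sketch of the snippet above:
from bs4 import BeautifulSoup
html = "<head><title>this is title</title></head>"
soup = BeautifulSoup(html, "lxml")
print(type(soup.head.title))   # <class 'bs4.element.Tag'>, so it can be chained further
print(soup.head.title.string)  # this is title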
Relational selection
Some elements have no distinguishing features to locate directly. In that case, first select a node that can be located, then move from it to its children, parent, siblings, and so on.
<p class="p1"></p>
<p></p>
<p></p>
print(soup.p.contents)            # list of the p node's direct children
print(soup.p.descendants)         # generator over all of p's descendants
print(soup.a.parent)              # the parent node
print(soup.a.parents)             # generator over all ancestors
print(soup.a.next_sibling)        # next node at the same level
print(soup.a.previous_sibling)    # previous node at the same level
print(soup.a.next_siblings)       # all following siblings
print(soup.a.previous_siblings)   # all preceding siblings
print(list(soup.a.parents)[0].attrs['class'])  # class of the immediate parent
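To make these calls concrete, a small runnable sketch with made-up HTML:
from bs4 import BeautifulSoup
html = '<div><p class="p1"><a href="#">link</a> tail</p><p>second</p></div>'
soup = BeautifulSoup(html, "lxml")
print(soup.p.contents)                          # [<a href="#">link</a>, ' tail']
print(soup.a.parent)                            # <p class="p1">...</p>
print(soup.a.next_sibling)                      # ' tail'
print(list(soup.a.parents)[0].attrs["class"])   # ['p1']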
Method selectors
find_all() looks nodes up by name, attributes, or text.
<ul><li></li></ul>
<ul><li></li>jjj<li></li></ul>
print(soup.find_all(name="ul"))
for ul in soup.find_all(name="ul"):
    print(ul.find_all(name="li"))
    for li in ul.find_all(name="li"):
        print(li.string)
soup.find_all(attrs={"id": "list-1"})
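Searching by text works by passing a string or regular expression; a self-contained sketch (in current Beautiful Soup versions the argument is named string; older versions called it text):
import re
from bs4 import BeautifulSoup
soup = BeautifulSoup("<ul><li></li>jjj<li></li></ul>", "lxml")
# find all text nodes matching a regular expression
print(soup.find_all(string=re.compile("jjj")))  # ['jjj']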
CSS selectors
Calling select() with a CSS selector performs the selection directly.
Anyone familiar with front-end work already knows CSS, and the usage here is exactly the same:
- .name selects by class, #name selects by id
- tag1, tag2 selects every tag1 and every tag2
- tag1 tag2 selects every tag2 inside a tag1
- [attr] selects all tags that have the given attribute
- [attr=value] for example, [target=_blank] selects all tags with target=_blank
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.select('.panel .panel-heading'))
print(soup.select('ul li'))
print(soup.select('#list-2 .element'))
print(type(soup.select('ul')[0]))
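The nodes returned by select() are ordinary Tag objects, so, continuing the example above, their text can be pulled out with get_text():
for li in soup.select("#list-1 .element"):
    print(li.get_text())  # Foo, Bar, Jay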
Example: scraping Douban
from bs4 import BeautifulSoup
import requests

def get_page():
    url = "http://www.douban.com/group/explore"
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # print(response.text)
        return response.text
    return None

def parse_page(html):
    result_list = []
    # soup = BeautifulSoup(html, 'lxml')
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.select('.channel-item')
    for item in items:
        result_dict = {}
        title = item.select('h3 a')[0].string
        result_dict["title"] = title
        like = item.select('.likes')[0].contents[0]
        result_dict['like'] = int(like)
        result_list.append(result_dict)
    print(result_list)
    return result_list

def main():
    html = get_page()
    parse_page(html)

main()