Installing BeautifulSoup
- Linux
#debian
$sudo apt-get install python-pip
#redhat
$sudo yum install pip
$pip install beautifulsoup4
- Windows
Install the Windows version of pip
>pip install beautifulsoup4
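On either platform, a quick import check (a minimal sanity test, not part of the original install steps) confirms the package is visible to your interpreter:
$python -c "from bs4 import BeautifulSoup; print(BeautifulSoup('<h1>ok</h1>', 'html.parser').h1)"
This should print <h1>ok</h1>; an ImportError means pip installed the package for a different Python interpreter.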
Running BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page1.html")
bsObj = BeautifulSoup(html.read(), "html.parser")
print(bsObj.h1)
Reliable network connections
from urllib.request import urlopen
from urllib.error import HTTPError

try:
    html = urlopen("http://www.pythonscraping.com/pages/page1.html")
except HTTPError as e:
    print(e)
    # return None, break out, or fall back to an alternative plan
else:
    # The program continues here. Note: if you already return or break
    # inside the except block above, this else clause is unnecessary and
    # this code would never run.
    pass
Example code
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read(), "html.parser")
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    return title

title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:
    print("Title could not be found")
else:
    print(title)
BeautifulSoup's find() and findAll()
find() and findAll() are probably the two BeautifulSoup functions you will use most often. With them you can filter an HTML page on a tag's various attributes to locate the group of tags, or the single tag, you need.
The BeautifulSoup documentation defines the two as follows:
findAll(tag, attributes, recursive, text, limit, keywords)
find(tag, attributes, recursive, text, keywords)
# 95% of the time you will only need the first two arguments: tag and attributes
# findAll searches recursively by default (recursive defaults to True)
# The text argument is a bit different: it matches against the text content of tags rather than their attributes.
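A few usage sketches against the page3.html demo page used later in this section (the tag names and attribute values here are assumptions based on that page):
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html.read(), "html.parser")

# tag + attributes: every <tr> whose class is "gift"
giftRows = bsObj.findAll("tr", {"class": "gift"})
# limit: stop after the first two matches
firstTwo = bsObj.findAll("tr", {"class": "gift"}, limit=2)
# text: match on a tag's text content instead of its attributes
labels = bsObj.findAll(text="Vegetable Basket")
# find() returns the first matching tag itself, not a list
firstRow = bsObj.find("tr", {"class": "gift"})
print(len(giftRows), len(firstTwo))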
Handling child tags and other descendants
In the BeautifulSoup library there is an important difference between children and descendants: just as in a human family tree, a child tag is exactly one level below its parent, whereas descendant tags sit at any level below that parent.
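A minimal sketch of the difference, assuming the giftList table from the page3.html demo page used below:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html.read(), "html.parser")

# .children yields only the table's direct children (its <tr> rows)
for child in bsObj.find("table", {"id": "giftList"}).children:
    print(child)

# .descendants would also yield everything nested inside those rows:
# the <td> cells, the <img> tags, the text nodes, and so on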
Handling sibling tags
BeautifulSoup's sibling-handling attributes make collecting table data simple, especially from tables with header rows.
next_siblings attribute
next_sibling attribute
previous_sibling attribute
previous_siblings attribute
The only difference among them is whether they return a single tag or a collection of sibling tags.
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html.read(), "html.parser")
for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
    print(sibling)
print(bsObj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
Parsing JSON data
import json
from urllib.request import urlopen
def getCountry(ipAddress):
    response = urlopen("http://freegeoip.net/json/" + ipAddress).read().decode('utf-8')
    responseJson = json.loads(response)
    return responseJson.get('country_code')

print(getCountry("50.58.253.58"))
import json

jsonString = '{"arrayOfNums":[{"number":0},{"number":1},{"number":2}],' \
             '"arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}'
jsonObj = json.loads(jsonString)
print(jsonObj.get("arrayOfNums"))
print(jsonObj.get("arrayOfNums")[1])
print(jsonObj.get("arrayOfNums")[1].get("number") + jsonObj.get("arrayOfNums")[2].get("number"))
print(jsonObj.get("arrayOfFruits")[2].get("fruit"))
Storing data
Downloading small files
# Method 1: the urllib library (Python 2's urllib.urlretrieve is urllib.request.urlretrieve in Python 3)
# -*- coding:utf-8 -*-
from urllib.request import urlretrieve
import time

url = 'http://mvideo.spriteapp.cn/video/2017/0414' \
      '/697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'
print("downloading with urllib")
start = time.time()
urlretrieve(url, "video.mp4")
end = time.time()
print('Finish in :', end - start)
# Method 2: the urllib2 library (merged into urllib.request in Python 3)
# -*- coding:utf-8 -*-
from urllib.request import urlopen
import time

url = 'http://mvideo.spriteapp.cn/video/2017/0414/' \
      '697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'
print("downloading with urllib2")
start = time.time()
data = urlopen(url).read()
with open('video.mp4', 'wb') as video:
    video.write(data)
end = time.time()
print('Finish in :', end - start)
# 方法三:使用requests库
# -*- coding:utf-8 -*-
import requests
import time
url = 'http://mvideo.spriteapp.cn/video/2017/0414/' \
'697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'
print "downloading with requests"
start = time.time()
r = requests.get(url)
with open('video.mp4', 'wb') as video:
video.write(r.content)
end = time.time()
print('Finish in :', end - start)
Downloading large files
# Method 1: the urllib2 library (merged into urllib.request in Python 3)
# -*- coding:utf-8 -*-
from urllib.request import Request, urlopen
import time

url = 'http://mvideo.spriteapp.cn/video/2017/0414/' \
      '697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'
r = Request(url)
u = urlopen(r)
start = time.time()
with open('video.mp4', 'wb') as f:  # binary mode; 'w' would corrupt the video
    while True:
        tmp = u.read(1024)  # read 1 KB at a time
        if not tmp:
            break
        f.write(tmp)
end = time.time()
print('Finish in :', end - start)
# Method 2: the requests library
# -*- coding:utf-8 -*-
import requests
import time

url = 'http://mvideo.spriteapp.cn/video/2017/0414/' \
      '697de826-20b5-11e7-9c72-1866daeb0df1cut_wpcco.mp4'
# With stream=False, get() starts downloading immediately and holds the
# whole file in memory; a large enough file can exhaust memory.
# With stream=True, nothing is downloaded until you iterate with
# iter_content or iter_lines, or access the content attribute.
r = requests.get(url, stream=True)
start = time.time()
with open("file_path", "wb") as f:
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            f.write(chunk)
            f.flush()
# iter_content: iterate over the response one chunk at a time
# iter_lines: iterate over the response one line at a time
# Both avoid excessive memory use on large files, since only a small
# piece of data is downloaded at a time.
end = time.time()
print('Finish in :', end - start)
Storing data to CSV
import csv

csvFile = open("../files/test.csv", 'w+', newline='')
try:
    writer = csv.writer(csvFile)
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    for i in range(10):
        writer.writerow((i, i + 2, i * 2))
finally:
    csvFile.close()
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
bsObj = BeautifulSoup(html.read(), "html.parser")
# the first wikitable on the page is the main comparison table
table = bsObj.findAll("table", {"class": "wikitable"})[0]
rows = table.findAll("tr")
csvFile = open("../files/editors.csv", 'wt', newline='', encoding='utf-8')
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()
Reading CSV files
- Manually download the CSV file to disk, then give Python the file's local path
- Write a Python program that downloads the file, reads it, and deletes the source file afterwards (sketched below)
- Read the file from the web straight into a string, then wrap it in a StringIO object so that it behaves like a file
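A minimal sketch of the second approach, using a hypothetical temporary filename; the examples that follow use the third approach:
from urllib.request import urlretrieve
import csv
import os

# download to a hypothetical temporary file
urlretrieve("http://pythonscraping.com/files/MontyPythonAlbums.csv", "albums.csv")
with open("albums.csv", newline='') as f:
    for row in csv.reader(f):
        print(row)
os.remove("albums.csv")  # delete the source file once it has been read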
from urllib.request import urlopen
from io import StringIO
import csv

data = urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv").read().decode('ascii', 'ignore')
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)
for row in csvReader:
    print(row)
from urllib.request import urlopen
from io import StringIO
import csv

data = urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv").read().decode('ascii', 'ignore')
dataFile = StringIO(data)
dictReader = csv.DictReader(dataFile)
# print the header row (the field names)
print(dictReader.fieldnames)
for row in dictReader:
    print(row)