@[toc]
## Example 1: Scraping a JD product page
```python
import requests

url = "https://item.jd.com/2967929.html"
try:
    r = requests.get(url)
    print(r.status_code)                  # HTTP status code of the response
    r.encoding = r.apparent_encoding      # use the encoding guessed from the body
    print(r.text[:1000])                  # show only the first 1000 characters
except requests.RequestException:
    print("scraping failed")
```
```
200
<script>window.location.href='https://passport.jd.com/new/login.aspx?ReturnUrl=http%3A%2F%2Fitem.jd.com%2F2967929.html'</script>
```
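The status code is 200, but the body is only a JavaScript redirect to JD's login page, so the request never reached the actual product page. A common workaround, and an assumption on my part rather than something shown in the original example, is to send a browser-like User-Agent (the same trick used for Amazon below); a minimal sketch:

```python
import requests

# Minimal sketch, assuming the login redirect is triggered by the default
# python-requests User-Agent; JD may also require cookies, which this does not handle.
url = "https://item.jd.com/2967929.html"
headers = {'user-agent': 'Mozilla/5.0'}

try:
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
except requests.RequestException:
    print("scraping failed")
```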
## Example 2: Scraping an Amazon product page
Send a plain request first, then inspect the status code and the headers that `requests` attached by default:

```python
r.status_code
r.request.headers
```
```
200
{'User-Agent': 'python-requests/2.25.1', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
```
The default `python-requests` User-Agent identifies the client as a script, so the request headers need to be faked with a browser-like User-Agent:
```python
import requests

url = "https://www.amazon.cn/gp/product/B01M8L5Z3Y"
try:
    # pretend to be a browser by overriding the User-Agent header
    r = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
    print(r.status_code)
    print(r.request.headers)              # confirm the header that was actually sent
    r.encoding = r.apparent_encoding
    print(r.text[:200])
except requests.RequestException:
    print("scraping failed")
```
```
200
{'user-agent': 'Mozilla/5.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
<!doctype html><html class="a-no-js" data-19ax5a9jf="dingo
```
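The pattern in Examples 1 and 2 (browser-like User-Agent, status check, encoding fix, truncated print) recurs throughout this post. The helper below is my own consolidation of it, not part of the original examples; the header value and timeout are chosen as reasonable defaults:

```python
import requests

def fetch_html(url: str, max_chars: int = 200) -> str:
    """Fetch a page while pretending to be a browser and return its first max_chars characters."""
    r = requests.get(url, headers={'user-agent': 'Mozilla/5.0'}, timeout=10)
    r.raise_for_status()                  # raise on 4xx/5xx responses
    r.encoding = r.apparent_encoding      # decode using the encoding guessed from the body
    return r.text[:max_chars]

if __name__ == "__main__":
    try:
        print(fetch_html("https://www.amazon.cn/gp/product/B01M8L5Z3Y"))
    except requests.RequestException:
        print("scraping failed")
```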
## Example 3: Submitting a search keyword to Baidu/360
### Baidu
```python
import requests

keyword = "Python"
try:
    kv = {'wd': keyword}                  # Baidu takes the keyword in the "wd" parameter
    r = requests.get("http://www.baidu.com/s", params=kv)
    print(r.request.url)                  # the URL that was actually requested
    r.raise_for_status()                  # raise an exception for a non-200 status
    print(len(r.text))
except requests.RequestException:
    print("scraping failed")
```
```
https://wappass.baidu.com/static/captcha/tuxing.html?&logid=9744966515589624158&ak=c27bbc89afca0463650ac9bde68ebe06&backurl=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3DPython&signature=06031158f14027893f369c0fda89741f&timestamp=1630056169
1545
```
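The request was redirected to wappass.baidu.com's captcha page and the body is only 1545 characters long, so Baidu served an anti-crawler check instead of real search results. As with Amazon, sending a browser-like User-Agent may help; the sketch below rests on that assumption and is not guaranteed to get past the check:

```python
import requests

# Sketch: resubmit the keyword with a browser-like User-Agent
# (assumption: the captcha redirect is triggered by the default python-requests header).
kv = {'wd': 'Python'}
headers = {'user-agent': 'Mozilla/5.0'}

try:
    r = requests.get("http://www.baidu.com/s", params=kv, headers=headers, timeout=10)
    r.raise_for_status()
    print(r.request.url)                  # final URL with the encoded query parameter
    print(len(r.text))                    # a genuine results page is far longer than 1545 characters
except requests.RequestException:
    print("scraping failed")
```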
### 360 Search
```python
import requests

keyword = "Python"
try:
    kv = {'q': keyword}                   # 360 takes the keyword in the "q" parameter
    r = requests.get("http://www.so.com/s", params=kv)
    print(r.request.url)
    r.raise_for_status()                  # raise an exception for a non-200 status
    print(len(r.text))
except requests.RequestException:
    print("scraping failed")
```
```
https://www.so.com/s?q=Python
404954
```
## Example 4: Scraping and saving a web image
```python
import requests

path = "abc.jpg"
url = "http://pic40.nipic.com/20140416/17763166_232812451127_2.jpg"
try:
    r = requests.get(url)
    print(r.request.url)
    print(r.status_code)
    with open(path, 'wb') as f:           # open the file in binary mode
        f.write(r.content)                # r.content holds the raw bytes of the image
    print("file saved successfully")
except (requests.RequestException, OSError):
    print("scraping failed")
```
```
http://pic40.nipic.com/20140416/17763166_232812451127_2.jpg
200
file saved successfully
```
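For repeated downloads it is common to write into a dedicated directory and skip files that are already present. The sketch below is my own extension of the example; the `pics/` directory and the filename-from-URL rule are arbitrary choices:

```python
import os
import requests

url = "http://pic40.nipic.com/20140416/17763166_232812451127_2.jpg"
root = "pics"
path = os.path.join(root, url.split('/')[-1])   # name the file after the last URL segment

try:
    os.makedirs(root, exist_ok=True)            # create the target directory if missing
    if os.path.exists(path):
        print("file already exists")
    else:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        with open(path, 'wb') as f:
            f.write(r.content)
        print("file saved successfully")
except (requests.RequestException, OSError):
    print("scraping failed")
```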
## Example 5: Automatic lookup of an IP address's location
```python
import requests

url = "http://m.ip138.com/ip.asp?ip="
try:
    r = requests.get(url + '192.168.88.103')
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[-500:])                  # print only the last 500 characters of the page
except requests.RequestException:
    print("scraping failed")
```
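Note that 192.168.88.103 is a private LAN address, so no public location can be reported for it. A small wrapper for querying public addresses is sketched below; the m.ip138.com/ip.asp interface is carried over from the example above, and the site may have changed its query URL since this was written:

```python
import requests

def lookup_ip(ip: str) -> str:
    """Query the ip138 lookup page (interface as used above) and return the tail of the page."""
    url = "http://m.ip138.com/ip.asp?ip=" + ip
    r = requests.get(url, headers={'user-agent': 'Mozilla/5.0'}, timeout=10)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text[-500:]

if __name__ == "__main__":
    try:
        print(lookup_ip("8.8.8.8"))       # a well-known public address (Google DNS)
    except requests.RequestException:
        print("scraping failed")
```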