爬虫"模板"
推荐谷歌浏览器:F12 -> 网络 ->【XHR】重新加载查看 json文件信息等 -> 获取 URL + 响应头 + 参数
import requests
import json
from tqdm import trange

# Crawl template: grab a paginated JSON endpoint discovered via the browser's
# DevTools (F12 -> Network -> XHR -> reload), copying its URL, headers and params.
url = 'http://...'  # click the target JSON request in DevTools and copy its full URL
headers = {  # request headers, copied verbatim from DevTools (they differ per site)
    "Accept": "......",
    "Accept-Language": "......",
    "Connection": "keep-alive",
    "Content-Length": "......",
    "Content-Type": "application/json",
    "User-Agent": "Mozilla/5.0 ......",
}
for index in trange(1, 11):  # e.g. crawl the first 10 pages
    params = {"page": index, "rows": 100}  # fill in the real request parameters
    # timeout so a dead server can't hang the crawl forever
    response = requests.post(url, headers=headers, json=params, timeout=30)
    response.raise_for_status()  # fail fast on HTTP errors instead of saving an error page
    with open(r"dir_path\{}.json".format(index), "w", encoding="utf-8") as f:  # save one file per page
        json.dump(response.json(), f, ensure_ascii=False)
练习:获取网页信息
# Exercise: fetch a web page and print it, first as raw bytes, then decoded.
import urllib.request

resp = urllib.request.urlopen('https://www.***.com')
raw = resp.read()
print(raw)  # raw bytes straight off the wire
# (output omitted)
# decode the bytes into readable text
html = raw.decode('utf-8')
print(html)
# (output omitted)
# (显示忽略)
练习:爬取图片并保存
# Exercise: download an image, save it to disk, then inspect the response object.
import urllib.request

img_url = 'http://***./13928177_195158772185_2.jpg'
resp = urllib.request.urlopen(img_url)
with open('img.jpg', 'wb') as f:
    f.write(resp.read())
print(resp.geturl())  # the final URL actually fetched
# http://***./13928177_195158772185_2.jpg
print(resp.info())    # response headers
# (various header fields)
print(resp.code)      # HTTP status code
# 200
实战:有道翻译爬虫程序(仅供学习使用)
有道翻译有反爬虫机制,所以简单的爬肯定不行,还需要改你得到的POST请求的URL(否则会出现:{"errorCode":50})
我的URL:http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule
需要修改成:http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule
也就是把 URL 中的 _o 去掉即可。
import urllib.request
import urllib.error
import urllib.parse
import json
import random

# Youdao translator crawler (for learning purposes only).
# NOTE: use the 'translate' endpoint, NOT 'translate_o' — the latter enforces
# the anti-crawler signature check and answers {"errorCode":50}.
print('欢迎使用爬虫翻译程序!')
while True:
    text = input('请输入待翻译的内容(输入"q!"退出程序)')
    if text == 'q!':
        break
    # url
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    # Form fields captured from a real browser session; the non-_o endpoint
    # does not verify salt/sign/ts/bv, so these stale captured values still work.
    data = urllib.parse.urlencode({
        'i': text,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '15711301787743',
        'sign': 'a802ca9da2f6603860996f28af946897',
        'ts': '1571130178774',
        'bv': 'e2a78ed30c66e16a857c5b6486a1d326',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CLICKBUTTION',
    }).encode('UTF-8')
    # head
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
    }
    # pick a random HTTP proxy per request to spread the load
    proxies = ['122.116.232.79:33452', '116.62.221.139:3128']
    proxy = random.choice(proxies)
    proxy_support = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    # fetch — guarded so one dead proxy or an error payload doesn't kill the loop
    req = urllib.request.Request(url, data, head)
    try:
        response = urllib.request.urlopen(req, timeout=10)  # don't hang forever
        html = response.read().decode('UTF-8')
        target = json.loads(html)
        print(target['translateResult'][0][0]['tgt'])
    except (urllib.error.URLError, KeyError, IndexError, json.JSONDecodeError) as e:
        print('翻译失败:', e)  # bad proxy / network error / unexpected response shape
print('程序结束!')
运行展示
网友评论