原网页编码
原网页编码问题:
直接获取到的数据是乱码;用response.text.encode('SHIFT_JIS')进行转码时,会因为部分特殊字符无法处理而报错。
import requests
import chardet
# Demonstration of the problem: fetch the page and print it without
# overriding the encoding that requests picked.
url = "https://worldjpn.grips.ac.jp/documents/indices/pm/3.html"
payload = {}
headers = {}

# requests applies no timeout by default, so a stalled server would make the
# script hang forever; bound the wait explicitly.
response = requests.request(
    "GET", url, headers=headers, data=payload, timeout=10
)

# Guess the encoding from the raw body bytes and show the guess.
guess = chardet.detect(response.content)
print(guess)

# response.encoding has not been overridden here; per the surrounding
# article, the text printed for this page comes out garbled.
print(response.text)
直接获取到的乱码
requests解决方法:
response.encoding = response.apparent_encoding
import requests
import chardet
# Fix for requests: decode the body with the encoding guessed from its bytes.
url = "https://worldjpn.grips.ac.jp/documents/indices/pm/3.html"
payload = {}
headers = {}

# requests applies no timeout by default, so a stalled server would make the
# script hang forever; bound the wait explicitly.
response = requests.request(
    "GET", url, headers=headers, data=payload, timeout=10
)

# Replace the encoding used for response.text with the one requests detects
# from the body content. This must happen before response.text is read.
response.encoding = response.apparent_encoding

# Independent guess from the raw bytes, printed for comparison.
guess = chardet.detect(response.content)
print(guess)
print(response.text)
request
scrapy解决方法:
# Force the response to be treated as Shift_JIS by overwriting its private
# `_encoding` attribute.
# NOTE(review): `_encoding` and `_cached_ubody` are private Scrapy attributes;
# confirm they exist in the installed Scrapy version.
response._encoding = "SHIFT_JIS"
# Reset the cached value — presumably the already-decoded body, so that the
# next read of the text is decoded again with the new encoding (TODO confirm).
response._cached_ubody = None
# Same two statements again, as they appear in the article's code listing.
response._encoding = "SHIFT_JIS"
response._cached_ubody = None # clear the cache
或者
# Alternative: copy whatever `response.encoding` reports into the private
# `_encoding` attribute instead of hard-coding a charset.
# NOTE(review): `_encoding` and `_cached_ubody` are private Scrapy attributes;
# confirm they exist in the installed Scrapy version.
response._encoding = response.encoding
# Reset the cached value — presumably the already-decoded body, so that the
# next read of the text is decoded again (TODO confirm).
response._cached_ubody = None # clear the cache
# Same two statements again, as they appear in the article's code listing.
response._encoding = response.encoding
response._cached_ubody = None # clear the cache
scrapy
再或者在scrapy中添加编码补丁
参考:https://www.jianshu.com/p/bb268312839b
# encoding.py
from w3lib import encoding
import chardet
import chardet.charsetprober
# Keep a handle on the original w3lib implementation so the patched version
# can defer to it first.
_html_body_declared_encoding = encoding.html_body_declared_encoding


def html_body_declared_encoding(html_body_str):
    """Return the encoding declared in an HTML body, else a chardet guess.

    The original w3lib function is tried first. Only when it finds no
    declared encoding is the body handed to ``chardet.detect``, and the guess
    is used only if its confidence is above 0.2.

    Args:
        html_body_str: The HTML body, normally as bytes.

    Returns:
        The encoding name, or ``None`` when nothing is declared and no
        sufficiently confident guess is available.
    """
    declared = _html_body_declared_encoding(html_body_str)
    if declared:
        return declared

    # chardet.detect() rejects anything that is not bytes-like, so skip the
    # guess for text input instead of raising.
    if not isinstance(html_body_str, (bytes, bytearray)):
        return None

    guess = chardet.detect(html_body_str)
    # A failed detection reports no encoding; never return that as a result.
    if guess and guess.get("encoding") and guess["confidence"] > 0.2:
        return guess["encoding"]
    return None


# Install the patched function on the w3lib module. Code that imported the
# original function by name before this line runs keeps the original.
encoding.html_body_declared_encoding = html_body_declared_encoding
在spider同级目录的 __init__.py 中引入encoding(或者把上面的补丁直接放在 __init__.py 文件中)
import encoding as _
网友评论