List of User Agent Strings
这是一个英文网站，上面收录了大量浏览器的 user agent。
下面写了一个爬虫把这些 user agent 都抓取下来并缓存到本地，
再通过一个随机函数随机返回其中一条 user agent。
from bs4 import BeautifulSoup as bs
from urllib import request
import json
import random
class UserAgent:
    """Fetch, cache, and randomly pick browser User-Agent strings.

    The strings are scraped from useragentstring.com and cached locally
    in ``user_agent.json`` so repeated runs avoid hitting the network.
    """

    # Listing page containing every recorded user-agent string.
    _url = "http://useragentstring.com/pages/useragentstring.php?name=All"

    def __init__(self, cache=True, update=False):
        """Initialize the scraper.

        cache:  if True, scrape and write the JSON cache immediately
                (no-op when the cache file already exists and ``update``
                is False).
        update: if True, overwrite an existing cache file.
        """
        self._update = update
        if cache:
            self._save_as_json()

    def user_agent(self) -> list:
        """Return the list of user-agent strings, preferring the cache."""
        try:
            return self._read_json()
        except FileNotFoundError:
            # No cache file at all: fall back to scraping the site directly.
            return self._get_user_agent_from_html()

    def _get_user_agent_from_html(self) -> list:
        """Scrape all user-agent strings from the listing page."""
        req = request.Request(url=self._url)
        # The site serves Latin-1 encoded pages.
        html = request.urlopen(req).read().decode("iso-8859-1")
        soup = bs(html, "lxml")
        return [li.text for li in soup.select("#liste ul li a")]

    def _save_as_json(self):
        """Write the scraped strings to the ``user_agent.json`` cache.

        Mode "x" only creates the file when it is absent; an existing
        cache is rewritten only when ``self._update`` is set.
        """
        try:
            with open("user_agent.json", "x", encoding="utf-8") as fp:
                json.dump(self._get_user_agent_from_html(), fp)
        except FileExistsError:
            if self._update:
                with open("user_agent.json", "w", encoding="utf-8") as fp:
                    json.dump(self._get_user_agent_from_html(), fp)

    def _read_json(self) -> list:
        """Read cached strings; refresh the cache when it is empty.

        Raises:
            FileNotFoundError: when no cache file exists (propagated
            from ``open`` so callers can fall back to scraping).
        """
        with open("user_agent.json", "r", encoding="utf-8") as fp:
            user_agent = json.load(fp)
        if user_agent:
            return user_agent
        # Empty cache: force a rewrite, then read the fresh contents once.
        # (Previously this raised FileNotFoundError after refreshing,
        # which made user_agent() scrape the site a second time.)
        self._update = True
        self._save_as_json()
        with open("user_agent.json", "r", encoding="utf-8") as fp:
            return json.load(fp)

    def random(self) -> str:
        """Return one user-agent string chosen at random."""
        return random.choice(self.user_agent())
if __name__ == "__main__":
    # Quick demo: build/refresh the cache (network required on first run)
    # and print one randomly chosen user-agent string.
    agent_source = UserAgent()
    print(agent_source.random())
网友评论