1. Program structure
# Program skeleton
class xxxSpider(object):

    def __init__(self):
        # Define frequently used variables, e.g. the base URL or counters
        pass

    def get_html(self):
        # Fetch the response body, using a random User-Agent
        pass

    def parse_html(self):
        # Parse the page with regular expressions and extract the data
        pass

    def write_html(self):
        # Save the extracted data as required: CSV, a MySQL database, etc.
        pass

    def run(self):
        # Main method that controls the overall logic
        pass


if __name__ == '__main__':
    # Record the program start time here if you want to measure runtime
    spider = xxxSpider()
    spider.run()
2. Practice
# -*- coding:utf-8 -*-
"""
@author:百草Lily
@file:test_spider.py
@time:2022/7/25
"""
import os
import random
import time
from urllib.request import Request, urlopen
from urllib import parse

from faker import Faker


# The spider class
class TiebaSpider:

    def __init__(self):
        # Base URL; the query string is filled in later
        self.url = "https://tieba.baidu.com/f?{}"

    # 1. Request function: fetch the page in the traditional three steps
    def get_html(self, url):
        fake = Faker(locale="zh_CN")
        ua = fake.user_agent()  # random User-Agent
        req = Request(url, headers={"User-Agent": ua})
        res = urlopen(req)
        html = res.read().decode("utf-8")  # decode explicitly to avoid garbled Chinese
        return html

    # 2. Parse function (left empty here)
    def parse_html(self):
        pass

    # 3. Save function: write the page to a file
    def save_html(self, filename, html):
        # encoding="utf-8" is required; without it writing fails with
        # UnicodeEncodeError: 'gbk' codec can't encode character '\xa9' in position 5283: illegal multibyte sequence
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html)

    # 4. Entry function: control the overall flow
    def run(self):
        name = input("请输入贴吧名:")
        begin = int(input("请输入起始页:"))
        stop = int(input("请输入终止页:"))
        params = {
            "ie": "utf-8",
            "kw": name
        }
        # +1 so that the stop page itself is included
        for page in range(begin, stop + 1):
            # Build the query parameters: 50 posts per page
            pn = (page - 1) * 50
            if pn == 0:
                params["fr"] = "search"
            else:
                params["pn"] = pn
            # Assemble the URL
            url = self.url.format(parse.urlencode(params))
            # Send the request
            html = self.get_html(url)
            # Build the file path and make sure the html/ directory exists
            html_dir = os.path.join(os.path.dirname(__file__), "html")
            os.makedirs(html_dir, exist_ok=True)
            filename = os.path.join(html_dir, "{}_{}页.html".format(name, page))
            # Save the page
            self.save_html(filename, html)
            # Log progress
            print(f"第{page}页抓取成功!")
            # Sleep a random 1-2 seconds between pages (randint includes both endpoints)
            time.sleep(random.randint(1, 2))


if __name__ == "__main__":
    start = time.time()
    spider = TiebaSpider()  # create an instance
    spider.run()            # call the entry function
    end = time.time()
    # Report total runtime
    print(f"执行时间:{end-start}秒")
3. Random sleep
time.sleep(random.randint(1, 2))
A crawler hits a site far faster than a human clicking through pages, which does not match normal browsing behavior at all. Sleeping for a random interval between requests makes the crawler look more like a human visitor, so the site is less likely to notice it is being scraped; the cost is that the program runs more slowly.
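If a whole-second pause feels too coarse, a fractional delay works just as well; a minimal sketch (the 1-3 second range is only an illustrative choice):

import random
import time

# Sleep a random, non-integer number of seconds between requests
time.sleep(random.uniform(1, 3))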
Personal practice
# -*- coding:utf-8 -*-
"""
@author:百草Lily
@file:test_tieba.py
@time:2022/7/22
"""
import os
import time
from urllib import parse
from urllib.request import Request, urlopen

from faker import Faker
from pyquery import PyQuery

# Result-page URLs to reproduce:
# https://tieba.baidu.com/f?ie=utf-8&kw=ces&fr=search
# https://tieba.baidu.com/f?kw=ces&ie=utf-8&pn=50
# https://tieba.baidu.com/f?kw=ces&ie=utf-8&pn=100

word = input("请输入希望在贴吧搜索的内容:")
urls = []  # URLs to fetch
base_url = "https://tieba.baidu.com/f"
for i in range(0, 201, 50):  # first 5 pages: 0, 50, 100, 150, 200
    params = {
        "ie": "utf-8",
        "kw": word
    }
    if i == 0:
        params["fr"] = "search"
    else:
        params["pn"] = i
    urls.append("{}?{}".format(base_url, parse.urlencode(params)))
print(urls)

# Sample of the markup to be parsed later:
# <a rel="noopener" href="/p/6689385508" title="开学了" target="_blank" class="j_th_tit ">开学了</a>
# <a rel="noopener" href="/p/7849277087" title="勇士第四节后段换回主力柯尔" target="_blank" class="j_th_tit ">勇士第四节</a>


# Send the request
def get_req(url: str):
    """
    Send the request and return the response object.
    :param url:
    :return:
    """
    fake = Faker(locale="zh_CN")
    ua = fake.user_agent()  # random User-Agent
    headers = {
        "User-Agent": ua
    }
    req = Request(url, headers=headers)
    resp = urlopen(req)
    # text = resp.read().decode("utf-8")
    # resp_header = resp.info()
    return resp


# Determine the file extension
def get_extension(resp):
    """
    Work out the file type, i.e. the extension, from the response.
    :param resp:
    :return:
    """
    # e.g. "Content-Type: text/html"
    content_type = resp.info()["Content-Type"]  # response header
    if "text/html" in content_type:
        # HTML page
        return ".html"
    else:
        print("不支持的类型")


# Save the file
def save_file(resp, seq):
    """
    Save the response body (bytes) to a file.
    :param resp:
    :param seq: page number, added so same-second downloads do not overwrite each other
    :return:
    """
    ext = get_extension(resp)
    if not ext:
        return "不支持的文件类型"
    filename = os.path.join(os.path.dirname(__file__),
                            f"{time.strftime('%Y%m%d%H%M%S')}_{seq}{ext}")
    content = resp.read().decode("utf-8")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    return filename


filenames = []  # saved files
for seq, url in enumerate(urls, start=1):
    resp = get_req(url)
    filenames.append(save_file(resp, seq))

# Parse the saved pages and collect the post titles.
# Open res.txt once in write mode; reopening it with "w+" inside the loop would
# truncate it on every tag and keep only the last title.
with open("res.txt", "w", encoding="utf-8") as out:
    for file in filenames:
        with open(file, encoding="utf-8") as f:
            content = f.read()
        doc = PyQuery(content)
        tags = doc(".j_th_tit")  # select elements by class name
        # text = [tag.text() for tag in tags]
        for tag in tags:
            # Iterating a PyQuery result yields lxml elements, so .text is an
            # attribute; tag.text() raises TypeError: 'str' object is not callable
            out.write(f"{tag.text}\n")
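The skeleton in section 1 suggests parsing pages with regular expressions; as an alternative to the PyQuery parsing above, a minimal sketch that extracts the same titles (the pattern assumes the sample markup shown earlier, where the title text sits directly inside the a tag with class j_th_tit; real pages may differ):

import re

# Match the text inside <a ... class="j_th_tit ...">...</a> links
title_pattern = re.compile(r'class="j_th_tit[^"]*"[^>]*>(.*?)</a>', re.S)

with open(filenames[0], encoding="utf-8") as f:
    titles = title_pattern.findall(f.read())
print(titles)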