The full Python source for crawling Baidu Tieba pages is below (likes are welcome!):
import urllib.request
import urllib.parse
def load_page(url):
    '''
    Purpose: send a request to the given url and return the server's response body.
    url: the URL to crawl
    '''
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    request = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(request).read()
def write_page(html, filename):
    '''
    Purpose: write the html content to a local file.
    html: the response body returned by the server
    filename: name of the file to save
    '''
    print("Saving " + filename)
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(html.decode('utf-8'))
def tieba_spider(url, begin_page, end_page):
    '''
    Purpose: spider scheduler; builds and processes the URL of each page.
    url: the first half of the Tieba URL
    begin_page: first page number
    end_page: last page number
    '''
    for page in range(begin_page, end_page + 1):
        pn = (page - 1) * 50  # Tieba lists 50 posts per page, offset by pn
        file_name = "page_" + str(page) + ".html"
        full_url = url + "&pn=" + str(pn)
        html = load_page(full_url)
        write_page(html, file_name)
if __name__ == "__main__":
    kw = input("Enter the name of the Tieba forum to crawl: ")
    begin_page = int(input("Enter the first page: "))
    end_page = int(input("Enter the last page: "))
    url = 'http://tieba.baidu.com/f?'
    key = urllib.parse.urlencode({"kw": kw})
    # Example of the combined url: http://tieba.baidu.com/f?kw=lol
    url = url + key
    tieba_spider(url, begin_page, end_page)
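To see exactly which URLs the scheduler requests, here is a small standalone sketch (the keyword "lol" and the page range are just sample values; the 50-post offset matches the pn arithmetic above):

import urllib.parse

# urllib.parse.urlencode percent-encodes the keyword, so non-ASCII forum
# names work too, e.g. {"kw": "李毅"} becomes "kw=%E6%9D%8E%E6%AF%85".
base_url = 'http://tieba.baidu.com/f?' + urllib.parse.urlencode({"kw": "lol"})
for page in range(1, 4):
    print(base_url + "&pn=" + str((page - 1) * 50))
# Prints:
# http://tieba.baidu.com/f?kw=lol&pn=0
# http://tieba.baidu.com/f?kw=lol&pn=50
# http://tieba.baidu.com/f?kw=lol&pn=100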
[Screenshot: program run, showing the source code and the console prompts]
The successfully crawled HTML pages are saved in the same directory as the script, as shown below:
[Screenshots: crawled pages compared with the original pages, Tieba page 1 and page 2]
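Besides eyeballing the directory, a one-liner confirms the saved files from code (the glob pattern matches the page_N.html names used above):

import glob
print(sorted(glob.glob("page_*.html")))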
And with that, we can view the crawled pages successfully!
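One last practical note: urllib.request.urlopen raises urllib.error.URLError (or its subclass HTTPError) when the network or the server misbehaves, and the load_page above would then crash the whole run. Here is a minimal sketch of a more defensive variant (the retry count and timeout are arbitrary illustration values, not part of the original script):

import urllib.error
import urllib.request

def load_page_safe(url, retries=3):
    '''Fetch url, retrying a few times on transient errors; return None on failure.'''
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    request = urllib.request.Request(url, headers=headers)
    for attempt in range(1, retries + 1):
        try:
            return urllib.request.urlopen(request, timeout=10).read()
        except urllib.error.URLError as e:
            print("Request failed ({}), attempt {}/{}".format(e, attempt, retries))
    return None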
If you liked this, give it a thumbs up~