from bs4 import BeautifulSoup
from urllib import request
from http import cookiejar
import re
from openpyxl import workbook# 写入Excel表所用
from openpyxl import load_workbook# 读取Excel表所用
# Scrape the news list from the site's front page, dump it to a text file,
# then write the same rows to an Excel workbook and read them back.
url = "http://www.qianyan001.com/"

# Cookie handling: keep whatever cookies the server sets in an in-memory
# CookieJar and install an opener that routes urlopen() through it.
cookie_jar = cookiejar.CookieJar()
handler = request.HTTPCookieProcessor(cookie_jar)
opener = request.build_opener(handler)
request.install_opener(opener)

# The context manager closes the HTTP response once the body has been read.
with request.urlopen(url) as response:
    # Prefer the charset declared in the Content-Type header; fall back to
    # UTF-8 (the previously hard-coded choice) when the server declares none.
    charset = response.headers.get_content_charset() or "utf-8"
    html = response.read().decode(charset)
    print(response.getcode())  # HTTP status code
print(cookie_jar)

soup = BeautifulSoup(html, "html.parser")
find_html = soup.find(class_="r_new_list")
if find_html is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError('no element with class "r_new_list" found at ' + url)
find_li = find_html.find_all("li")

# Extract (rank, title, link) once so the text dump and the Excel sheet are
# built from exactly the same data.
news_rows = []
for li in find_li:
    rank_tag = li.find("em")
    link_tag = li.find("a")
    if rank_tag is None or link_tag is None:
        # Skip list items that do not have the expected <em>/<a> structure.
        continue
    # get_text() also works when the <a> contains nested tags, where
    # .string would be None and break the formatting below.
    news_rows.append(
        (rank_tag.get_text(), link_tag.get_text(), link_tag.get("href", ""))
    )

# Format-spec reminder: '^', '<' and '>' mean centre, left and right
# alignment and are followed by the field width. The character placed before
# the alignment flag is the fill character; it must be a single character and
# defaults to a space. chr(12288) is U+3000, the full-width space, used here
# as the fill character for the title column.
with open("study_1_bs4.txt", "w", encoding="utf-8") as f:
    for rank, title, href in news_rows:
        f.write(
            rank + " " + "{0:{1}<22}".format(title, chr(12288)) + "\t" + href + "\n"
        )

print("成功爬取数据:")
# Read back with the same explicit encoding that was used for writing.
with open("study_1_bs4.txt", "r", encoding="utf-8") as f:
    print(f.read())

# Create the Excel workbook and write the data.
wb = workbook.Workbook()
ws = wb.active  # a new workbook has a single sheet ("Sheet"), which is active
ws.title = "军事新闻"
# Header row first; every row is appended as one sequence of cell values.
ws.append(['序号', '标题', '链接'])
for news_row in news_rows:
    ws.append(list(news_row))
wb.save('军事新闻.xlsx')

# Read the Excel file back.
print("===========读取Excel===========")
wb_load = load_workbook('军事新闻.xlsx')
# Use the workbook that was just loaded from disk, not the in-memory `wb`.
sheet = wb_load['军事新闻']
for row in sheet.rows:
    # The column index restarts on every row, so the title column (column 2)
    # is centred on each line rather than only once for the whole sheet.
    for column, cell in enumerate(row, start=1):
        if column == 2:
            # Empty cells load as None, which does not accept this format spec.
            title = "" if cell.value is None else cell.value
            print("{0:{1}^30}".format(title, chr(12288)), "\t", end="")
        else:
            print(cell.value, "\t", end="")
    print("")
# Stray text from the source web page, not part of the script: "网友评论" ("Reader comments").