import requests
from bs4 import BeautifulSoup
import pandas as pd
# One-off fetch of the listing page. NOTE: `r` is only consumed by the
# commented-out exploration code below; get_data() re-fetches this URL later.
# A timeout is set so a stalled server cannot hang the script forever.
r = requests.get(url='https://book.douban.com/latest', timeout=10)
#查看状态
# print(r.encoding, r.status_code, r.encoding)
# print(r.url)
#查看页面内容
# print(r.text)
#解析网址
# soup = BeautifulSoup(r.text,'lxml')
#提取标签
# print(soup.head)
# print(soup.title)
# print(soup.a) # 提取的第一个a标签
# 标签的类型是什么?
# print(type(soup.title))
# # print(type(soup.a))
#标签 属性 元素
# print(soup.a.name,type(soup.a.name))
# print(soup.a.attrs,type(soup.a.attrs))
# print(soup.a.text,type(soup.a.text))
# 如何查找标签? → find() /find_all()
# find() → 查找单个标签
# 如果标签属性唯一,则确定位置;不唯一情况下选取第一个
# find_all() → 查找所有标签
# urls = soup.find('div',class_="grid-12-12 clearfix").find_all('a')
# url_lst = []
#
# for url in urls[::2]:
# url_lst.append(url['href'])
# # 保存所有url
#
# print(len(url_lst))
# print(url_lst[:5])
# 创建函数,采集页面信息
def get_data(ui):
    """Scrape book entries from a Douban page.

    Parameters
    ----------
    ui : str
        URL of the page to fetch (e.g. https://book.douban.com/latest).

    Returns
    -------
    list[dict]
        One dict per book with keys '书名' (title), '评分' (rating),
        '其他信息' (other info) and '简介' (summary). Entries whose
        markup does not match the expected layout are skipped.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status.
    """
    # Fetch + parse the page. timeout keeps a dead server from hanging us;
    # raise_for_status keeps us from silently scraping an error page.
    ri = requests.get(url=ui, timeout=10)
    ri.raise_for_status()
    soupi = BeautifulSoup(ri.text, 'lxml')

    def _squash(text):
        # Collapse the whitespace Douban sprinkles inside these tags.
        return text.replace('\n', '').replace(' ', '')

    lst = []
    for i in soupi.find_all('div', class_="detail-frame"):
        h2 = i.find('h2')
        ps = i.find_all('p')
        # Guard against layout drift: skip blocks missing the expected tags
        # instead of crashing with AttributeError / IndexError.
        if h2 is None or len(ps) < 3:
            continue
        dic = {}
        # Title: only newlines are stripped (inner spaces are meaningful).
        dic['书名'] = h2.text.replace('\n', '')
        dic['评分'] = _squash(ps[0].text)
        dic['其他信息'] = _squash(ps[1].text)
        dic['简介'] = _squash(ps[2].text)
        lst.append(dic)
    return lst
# 函数构建完成
# --- Script entry: scrape the listing page and show it as a table ---
url = 'https://book.douban.com/latest'
result = get_data(url)  # list of per-book dicts

# Convert the scraped records to a DataFrame for display.
df = pd.DataFrame(result)

# Show all columns
pd.set_option('display.max_columns', None)
# Show all rows
pd.set_option('display.max_rows', None)
# Cap cell text at 100 chars (default is 50). Use the full 'display.'-prefixed
# key for consistency with the calls above; bare 'max_colwidth' relies on
# pandas' fuzzy option matching and is rejected by some pandas versions.
pd.set_option('display.max_colwidth', 100)
print(df)
# 网友评论 (user comments) — stray pasted text; kept as a comment because as
# code it is an undefined name and raises NameError.