pip install bs4
pip install lxml(lxml 解析器底层基于 C 语言库实现,解析速度快)
find_all和find找






find_all找所有,find找第一个
获得标签属性
image.png
获得标签下的文字
css选择器


select找


.string 在标签下有多个子节点(例如文字分成多行)时获取不到内容(返回 None),这时要用 .contents(或 .strings / .stripped_strings)


爬取天气预报




pip install html5lib:这个解析器能自动补全不完整的 HTML 标签,但是解析速度没有 lxml 快


完整代码
import requests
from bs4 import BeautifulSoup
from pyecharts.charts import Bar
# Module-level accumulator: parse_page() appends one {'city': ..., 'temp': ...}
# dict per parsed table row and later sorts this list in place.
ALL_DATA = []
def parse_page(url):
    """Scrape one forecast page into ALL_DATA and redraw the top-10 chart.

    The page at ``url`` is downloaded and parsed with the ``html5lib``
    parser.  For every table inside the first ``div.conMidtab`` element, each
    row after the first two contributes one ``{'city': str, 'temp': int}``
    record to the module-level ``ALL_DATA`` list.  After the page has been
    processed, ``ALL_DATA`` is sorted in place by temperature (highest first)
    and its first ten entries are rendered as a bar chart into
    ``temperture.html``; the file is therefore rewritten on every call.

    Args:
        url: Address of the forecast page to download.

    Raises:
        ValueError: If a temperature cell does not contain an integer.
        IndexError: If a row has fewer cells than expected or a cell is empty.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    text = response.content.decode('utf-8')
    soup = BeautifulSoup(text, 'html5lib')

    container = soup.find('div', class_='conMidtab')
    # find() returns None when nothing matches; treat that as "no tables"
    # instead of failing with AttributeError on None.find_all.
    tables = container.find_all('table') if container is not None else []

    for table in tables:
        # The first two rows of each table are skipped (presumably header
        # rows -- verify against the page markup).
        rows = table.find_all('tr')[2:]
        for index, row in enumerate(rows):
            cells = row.find_all('td')
            # The first remaining row takes the city from the second cell
            # (presumably because a province cell comes first -- verify
            # against the page markup).  The original assigned this to a
            # misspelled name (``city_ed``), so the special case never applied.
            city_td = cells[1] if index == 0 else cells[0]
            high_temp_td = cells[-5]
            city = list(city_td.stripped_strings)[0]
            temp = list(high_temp_td.stripped_strings)[0]
            # Store the temperature as int so the sort below is numeric.
            ALL_DATA.append({'city': city, 'temp': int(temp)})

    ALL_DATA.sort(key=lambda record: record['temp'], reverse=True)
    top_ten = ALL_DATA[0:10]
    cities = [record['city'] for record in top_ten]
    temps = [record['temp'] for record in top_ten]

    bar = Bar()
    bar.add_xaxis(cities)
    bar.add_yaxis("高温城市TOP10", temps)
    bar.render('temperture.html')
def main():
    """Run parse_page() on each regional forecast page, in the listed order."""
    region_pages = (
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml',
    )
    for page_url in region_pages:
        parse_page(page_url)
# Start the scraper only when this file is executed as a script, so that
# importing it does not trigger any network requests.
if __name__ == "__main__":
    main()

放一个列表

排序

排序之前要先把 temp 从字符串转成 int,否则无法按数值大小排序


pyecharts文档
注意:新版 pyecharts 的写法和图中的不一样了,详见官方文档;要得到高温排行,排序时要倒序,即 sort 时传入 reverse=True

网友评论