-
一、bs4模块解析的用法复习
-
二、抓取逻辑的复习
-
三、http://www.51hao.cc/ 网站全站抓取,并将抓取结果存储为 csv 文件
from bs4 import BeautifulSoup
import requests
import csv
def get_city():
    """Scrape the 51hao.cc index page and list every city link by province.

    Returns:
        A list of dicts, one per city link, with the keys ``'province'``
        (text of the province link), ``'city'`` (text of the city link) and
        ``'city_url'`` (the link's ``href`` attribute exactly as it appears
        in the page).

    Raises:
        requests.exceptions.RequestException: if the request fails or
            does not complete within the timeout.
    """
    url = 'http://www.51hao.cc/'
    # Without a timeout requests waits indefinitely on an unresponsive
    # server, which would stall the whole crawl.
    req = requests.get(url, timeout=10)
    # Force gb2312 decoding rather than relying on requests' own guess.
    req.encoding = 'gb2312'
    soup = BeautifulSoup(req.text, 'lxml')

    city_list = []
    # Each div.fkbj carries the province link.
    for item in soup.find_all('div', class_='fkbj'):
        province_name = item.find('a').text
        # The city links are read from the second sibling after the province
        # div (presumably the first sibling is the whitespace text node
        # between the two tags -- confirm against the live markup).
        cities = item.next_sibling.next_sibling
        for city_info in cities.find_all('a'):
            city_list.append({
                'province': province_name,
                'city': city_info.text,
                'city_url': city_info['href'],
            })
    return city_list
def get_number(city):
    """Scrape one city's page and return a record per number-prefix entry.

    Args:
        city: dict with the keys ``'province'``, ``'city'`` and
            ``'city_url'`` (the shape produced by ``get_city``).

    Returns:
        A list of dicts with the keys ``'province'``, ``'city'``,
        ``'first3'`` (text of the span inside a ``div.num_bg``), ``'type'``
        (the text following that span, cut at the first ``'('`` and
        stripped) and ``'first5'`` (text of the link inside each ``<li>``).

    Raises:
        requests.exceptions.RequestException: if the request fails or
            does not complete within the timeout.
    """
    number_info = []
    # Without a timeout a single unresponsive city page would hang the crawl.
    req = requests.get(city['city_url'], timeout=10)
    # Force gb2312 decoding rather than relying on requests' own guess.
    req.encoding = 'gb2312'
    soup = BeautifulSoup(req.text, 'lxml')

    for item in soup.find_all('div', class_='num_bg'):
        # The <li> entries for this section are read from the second sibling
        # after the header div's parent.
        first5_info = item.parent.next_sibling.next_sibling
        entries = first5_info.find_all('li')
        if not entries:
            # Nothing to record; skip before touching the header fields so a
            # section without entries never evaluates them.
            continue

        # Header fields are the same for every <li> in the section, so they
        # are computed once here. Per the original author's note the header
        # looks like a 3-digit prefix ("131") followed by a carrier label
        # and a parenthesised count.
        first3 = item.span.text
        segment_type = item.span.next_sibling.string.split('(')[0].strip()

        for one_first5 in entries:
            number_dict = {
                'province': city['province'],
                'city': city['city'],
                'first3': first3,
                'type': segment_type,
                'first5': one_first5.a.text,
            }
            number_info.append(number_dict)
            # Progress output: echo every record as it is collected.
            print(number_dict)
    return number_info
if __name__ == '__main__':
    # Crawl the index page, then every city page, accumulating all records.
    city_info = get_city()
    all_city_number_info = []
    for city in city_info:
        all_city_number_info.extend(get_number(city))

    # Column order of the output file.
    fields = ('province', 'city', 'first3', 'type', 'first5')

    # Text mode with newline='' is what the csv module expects (binary 'wb'
    # does not work with csv.writer on Python 3). The context manager makes
    # sure the file is closed even if encoding or writing a row fails.
    with open('all_city_number.csv', 'w', newline='', encoding='gb2312') as ff:
        f_csv = csv.writer(ff)
        f_csv.writerows(
            [record[key] for key in fields]
            for record in all_city_number_info
        )
网友评论