处理详情页面的字符串费了些劲,其他还好。
我的成果
屏幕快照 2016-09-01 下午9.23.04.jpg

我的代码
from bs4 import BeautifulSoup
import requests
import time
import pymongo
# Module-level MongoDB setup (side effect: connects to a local mongod on import).
client=pymongo.MongoClient('localhost',27017)
# Database holding both scrape stages.
phone_num=client['phone_num']
# Stage 1: listing-page results — one doc per entry {title, phonenum_links}.
sheet_num=phone_num['sheet_num']
# Stage 2: detail-page results — one doc per entry {url, title, price}.
sheet_info=phone_num['sheet_info']
def phone_links(num):
    """Scrape one 58.com phone-number listing page and store every entry
    (title + canonical detail-page link) into the ``sheet_num`` collection.

    :param num: 1-based listing-page index substituted into the URL.
    :return: None. Returns early when the page reports zero results or
        the result counter is missing entirely.
    """
    url = 'http://bj.58.com/shoujihao/pn{}/?'.format(num)
    time.sleep(1)  # throttle so we don't hammer the site
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Guard: the original indexed [0] unconditionally, which raises
    # IndexError when the counter element is absent (layout change or an
    # anti-scrape page). Treat "missing" the same as "0 results".
    counter = soup.select('#infocont > span > b')
    if not counter or counter[0].text == '0':
        return
    titles = soup.select('a.t > strong')
    phonenum_links = soup.select('#infolist > div > ul > div > ul > li > a.t')
    for title, phonenum_link in zip(titles, phonenum_links):
        data = {
            'title': title.get_text(),
            # Drop the query string so the stored link is canonical.
            'phonenum_links': phonenum_link.get('href').split('?')[0]
        }
        sheet_num.insert_one(data)
        print(data)
def get_num_info(url):
    """Scrape one phone-number detail page and store {url, title, price}
    into the ``sheet_info`` collection.

    :param url: detail-page URL (as collected by ``phone_links``).
    :return: None. Skips the page when the expected elements are missing
        (e.g. the listing was removed) instead of raising IndexError.
    """
    time.sleep(1)  # throttle so we don't hammer the site
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('div.col_sub.mainTitle > h1')
    prices = soup.select('div.su_con > span')
    # Guard: the original indexed [0] unconditionally and crashed on any
    # page missing these elements.
    if not titles or not prices:
        return
    data = {
        'url': url,
        # The h1 holds the 11-digit phone number padded with whitespace;
        # strip it and keep the first 11 characters.
        'title': _strip_ws(titles[0].get_text())[:11],
        'price': _strip_ws(prices[0].get_text())
    }
    sheet_info.insert_one(data)
    print(data)


# Translation table removing newline, tab and space in one C-level pass
# (replaces the repeated .replace().replace().replace() chains).
_WS_TABLE = str.maketrans('', '', '\n\t ')


def _strip_ws(text):
    """Return *text* with all newlines, tabs and spaces removed."""
    return text.translate(_WS_TABLE)
# Stage 1: crawl listing pages 1..999, filling sheet_num with detail links.
for i in range(1, 1000):
    phone_links(i)
# Stage 2: visit every collected detail link.
# Bug fix: the original iterated sheet_info — the collection that
# get_num_info itself populates, so it is empty on a fresh run — and read
# a 'url' key that the stage-1 documents never contain. The collected
# links live in sheet_num under the 'phonenum_links' key.
for record in sheet_num.find():
    get_num_info(record['phonenum_links'])
总结
- 网页各有不同,需要根据具体的特点分析自己需要爬取的数据,应该采用什么方法才能有效地获取。
网友评论