Results
(Screenshot of the scraped output omitted: 屏幕快照 2017-03-25 下午7.19.13.png)

from bs4 import BeautifulSoup
import requests
import time
def get_links(who_sells):
    url = 'http://bj.58.com/pbdn/{}/pn2'.format(who_sells)
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    link_tags = soup.select('td.t a.t')
    urls_datas = []
    for link_tag in link_tags:
        if link_tag.get('href').find('jump') < 0:
            # Zhuanzhuan listing: keep the link, drop the query string
            urls_datas.append(link_tag.get('href').split('?')[0])
        else:
            # 58 "jump" redirect link: pull the listing id out of the entinfo
            # parameter and rebuild the canonical detail URL (worked example below)
            urls_datas.append('http://bj.58.com/pingbandiannao/'
                              + link_tag.get('href').split('entinfo')[1].split('&')[0][1:-2]
                              + 'x.shtml')
    # print(urls_datas)
    get_infos(urls_datas, who_sells)
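
# A worked example of the jump-link parsing above (the sample href is
# hypothetical, but shaped like 58's redirect links):
#   href = 'http://jump.zhuanzhuan.58.com/jump?entinfo=28354316527012_0&psid=123'
#   href.split('entinfo')[1].split('&')[0]  -> '=28354316527012_0'
#   ...[1:-2]                               -> '28354316527012'  (drops '=' and '_0')
#   rebuilt: 'http://bj.58.com/pingbandiannao/28354316527012x.shtml'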
def getViews(url):
    # Fetch the view count from 58's counter API.
    # Unsolved: the count it returns is always 0. ???
    # Note: str.strip('x.shtml') strips a *set of characters*, not a suffix,
    # so split on the suffix instead.
    info_id = url.split('/')[-1].split('x.shtml')[0]
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    headers = {
        'Cookie': r'bj58_id58s="eG44SE0raFpjSmpwMjI4NQ=="; id58=c5/ns1jBVI5v3RDiA5T7Ag==; als=0; myfeet_tooltip=end; bangbigtip2=1; city=bj; ipcity=gltokyo%7C%u4E1C%u4EAC; sessionid=e3b672f8-f7ca-4c60-8eb3-eba3cfb9a905; 58tj_uuid=a1cc2a2a-8536-417e-8fdf-b86563c43986; new_session=0; new_uv=9; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=10'.format(str(info_id)),
        'User-Agent': r'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'jst1.58.com',
        'Referer': r'http://bj.58.com/pingbandiannao/{}x.shtml'.format(info_id)
    }
    # headers must be passed as a keyword argument; requests.get(api, headers)
    # would send the dict as URL parameters instead of request headers
    r = requests.get(api, headers=headers)
    return r.text.split('total=')[1]
def get_infos(urls, who_sells=0):
    for url in urls:
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        time.sleep(1)
        if 'zhuanzhuan' not in url:
            # Regular 58 detail page
            data = {
                'title': soup.title.text.strip(),
                'price': soup.select('.price.c_f50')[0].text,
                'date': soup.select('li.time')[0].text,
                'area': list(soup.select('.c_25d')[0].stripped_strings) if soup.find_all('span', 'c_25d') else None,
                'cate': '个人' if who_sells == 0 else '商家',  # individual vs. merchant seller
                'views': getViews(url)
            }
            print(data)
        else:
            # Zhuanzhuan detail page uses different selectors
            data = {
                'title': soup.title.text.strip(),
                'price': soup.select('span.price_now i')[0].text,
                'date': None,
                # 'palce_li' is the class name exactly as it appears in the page markup
                'area': list(soup.select('div.palce_li i')[0].stripped_strings) if soup.find_all('div', 'palce_li') else None,
                'cate': '个人',
                'views': soup.select('span.look_time')[0].get_text().strip(u'次浏览')  # drop the "... views" suffix
            }
            print(data)
    # Save to a local file (unfinished; see the sketch after this listing):
    # with open('/Users/lht/Downloads/imgs/text', 'a') as fs:
    #     for data0 in datas:
    #         fs.write(str(data0))
get_links(0)
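
The commented-out save step above refers to a datas list that get_infos never builds. A minimal sketch of one way to finish it, assuming the records are collected into a list instead of printed; the save_records helper and the results.jsonl path are illustrative, not from the original code:

import json

def save_records(records, path='results.jsonl'):
    # hypothetical helper: append one JSON object per line,
    # so repeated runs keep extending the same file
    with open(path, 'a', encoding='utf-8') as fs:
        for record in records:
            fs.write(json.dumps(record, ensure_ascii=False) + '\n')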
Personal Summary

- The view count is loaded through a JS request to 58's counter API; still unsolved (it always comes back 0).
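
If the counter endpoint still behaves the way getViews assumes, it can be probed in isolation to see what the server actually returns. A minimal sketch; the info id is a made-up sample, and the 'total=' response format is taken from the code above:

import requests

def probe_counter(info_id):
    # hypothetical standalone probe of the counter API used by getViews;
    # the Referer header is the most likely thing the endpoint checks
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Referer': 'http://bj.58.com/pingbandiannao/{}x.shtml'.format(info_id),
    }
    r = requests.get(api, headers=headers)
    print(r.status_code, repr(r.text))  # inspect the raw body for 'total='

probe_counter('28354316527012')  # sample id, hypothetical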