Results:

(screenshot: Paste_Image.png)

Code:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36'}

# choose: 0 = individual sellers, 1 = merchants
def get_url(choose):
    each_phoneurl = []
    list_phoneurl = 'http://gz.58.com/shouji/{}/'.format(str(choose))
    phone = requests.get(list_phoneurl, headers=headers)
    soup = BeautifulSoup(phone.text, 'lxml')
    link_phone = soup.select('tr td.t a.t')
    for each_phone in link_phone:
        each_phones = each_phone.get('href')
        each_phoneurl.append(each_phones)
    return each_phoneurl

get_url(0)
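Note that the bare get_url(0) call above throws its return value away; to actually inspect the collected listing URLs, capture it (a quick sketch, requires network access to gz.58.com):

for url in get_url(0)[:5]:
    print(url)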
telephone_url = 'http://gz.58.com/shouji/25755367441717x.shtml?psid=173193533191818784206463731&entinfo=257t55367441717_0'

def get_pv(telephone_url):
    # split('/')[-1] takes the last path segment; split('x.')[0] then slices
    # off everything from 'x.shtml' onward, leaving just the numeric infoid
    number = telephone_url.split('/')[-1].split('x.')[0]
    # the Referer header is one of the site's anti-scraping checks, so the
    # counter request has to pretend it was sent from the listing page
    referer_headers = {'Referer': telephone_url}
    post_url = 'http://jst1.58.com/counter?infoid={}'.format(str(number))
    js = requests.get(post_url, headers=referer_headers)
    pv = js.text.split('=')[-1]
    return pv

get_pv(telephone_url)
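As a quick offline sanity check, the split chain really does pull the infoid out of the example URL, and that is all the counter endpoint needs:

url = 'http://gz.58.com/shouji/25755367441717x.shtml?psid=173193533191818784206463731&entinfo=257t55367441717_0'
infoid = url.split('/')[-1].split('x.')[0]
print(infoid)  # 25755367441717
print('http://jst1.58.com/counter?infoid={}'.format(infoid))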
def get_information(choose=0):
    each_phoneurls = get_url(choose)
    for phone_url in each_phoneurls:
        try:
            phone_each = requests.get(phone_url, headers=headers)
            toup = BeautifulSoup(phone_each.text, 'lxml')
            leimu = toup.select('#header > div.breadCrumb.f12 > span > a')[-1].get_text()
            biaoti = toup.select('div.col_sub.mainTitle > h1')[0].get_text()
            release_time = toup.select('#index_show > ul.mtit_con_left.fl > li.time')[0].get_text()
            price = toup.select('span.price.c_f50')[0].get_text()
            # .stripped_strings removes the extra whitespace and blank lines from the
            # tag's text; it yields an iterator, hence the list() around it
            # (see the short demo after this function)
            place = list(toup.select('div.su_con > span.c_25d')[-1].stripped_strings)
            data = {
                'category': leimu,
                'title': biaoti,
                'price': price,
                'address': ''.join(place),  # join the list's pieces into one string
                'posted': release_time,
                # the on-page view count starts at 0 and is filled in by JS,
                # so it is fetched from the counter endpoint instead
                'views': get_pv(phone_url),
            }
            print(data)
        except Exception:
            # skip listings whose page layout doesn't match the selectors
            continue

get_information(0)
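A minimal offline demo of what .stripped_strings does; the HTML fragment here is made up, not taken from 58.com:

from bs4 import BeautifulSoup

fragment = '<span class="c_25d">\n 广州 <a>天河</a>\n <a>体育中心</a> </span>'
span = BeautifulSoup(fragment, 'lxml').select('span.c_25d')[0]

print(span.get_text())                 # raw text, with stray newlines and spaces
print(list(span.stripped_strings))     # ['广州', '天河', '体育中心']
print(''.join(span.stripped_strings))  # '广州天河体育中心'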
'''
Alternative filtering approaches:

for link in soup.select('td.t > a[href^="http://bj.58.com/"]'):  # CSS attribute selector keeps only the listing URLs we need
    urls.append(link.get('href'))
return urls

for title in titles:
    if title.get('data-addtype') is None and title.get('onclick') is None:
        filters.append(title)
'''
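A self-contained sketch of that attribute-selector filter; the two anchors below are invented stand-ins for a real listing link and a promoted/ad link:

from bs4 import BeautifulSoup

html = '''
<table><tr>
<td class="t"><a href="http://bj.58.com/shouji/123x.shtml">listing</a></td>
<td class="t"><a href="http://jump.58.com/promo">ad</a></td>
</tr></table>
'''
soup = BeautifulSoup(html, 'lxml')

# a[href^="..."] matches only anchors whose href starts with the given prefix
urls = [a.get('href') for a in soup.select('td.t > a[href^="http://bj.58.com/"]')]
print(urls)  # ['http://bj.58.com/shouji/123x.shtml']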