1. 通过模拟浏览器请求,获取自如、爱上租的页面信息:
def get_html(url):
    """Fetch *url* with a browser-like User-Agent and return the HTML text.

    Returns None when the request fails (network error, timeout, or HTTP
    error status) so callers can test the result before parsing it.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0(Windows NT 6.1; WOW64)',
    }
    try:
        # timeout= keeps a dead server from hanging the script forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx as failures instead of parsing an error page.
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        # Narrow except: the original bare `except:` hid every bug
        # (including KeyboardInterrupt) behind 'request error'.
        print('request error:', e)
        return None
2. 取出房源信息:用 soup 匹配包含房源列表的关键节点——自如页面是 id 为 houseList 的 ul;爱上租页面没有对应的 ul 节点,改用 class 为 left 的 div:
- 自如房源获取:
def get_ziru_hourse(html, keyword='龙湖春江'):
    """Parse a Ziroom listing page and yield (name, url, price-info) tuples.

    html    -- page source from get_html(); may be None/empty on fetch failure.
    keyword -- substring a listing title must contain to be kept
               (default preserves the original hard-coded '龙湖春江').

    Returns a zip of matching house names, de-duplicated listing URLs, and
    the span text accumulated per listing (terminated by a '每月' price span).
    """
    if not html:
        # Fetch failed upstream; nothing to parse.
        return zip([], [], [])
    soup = BeautifulSoup(html, 'lxml')
    house_names = []
    house_urls = []
    # Hoist the single find(): the original searched for the ul twice,
    # and crashed with AttributeError when the page layout changed.
    house_list = soup.find('ul', id='houseList')
    if house_list is None:
        return zip([], [], [])
    links = house_list.find_all('a')
    spans = house_list.find_all('span')
    # Accumulate span texts until a '每月' (monthly price) span closes
    # one listing's description.
    special = ''
    spes = []
    for span in spans:
        special += span.get_text() + ' '
        if '每月' in span.get_text():
            spes.append(special)
            special = ''
    for link in links:
        house_url = link.get('href')
        if house_url is None:
            # Anchor without href — 'http' not in None would raise TypeError.
            continue
        if 'http' not in house_url:
            # Scheme-relative '//...' links need an explicit scheme.
            house_url = 'http:' + house_url
        if house_url not in house_urls and 'youjia_fbh' not in house_url:
            house_urls.append(house_url)
        house_name = link.get_text()
        if keyword in house_name:
            house_names.append(house_name)
    return zip(house_names, house_urls, spes)
- 爱上租房源获取:
def get_isz_hourse(html, keyword='龙湖春江'):
    """Parse an Ishangzu search page and yield (name, url) tuples.

    html    -- page source from get_html(); may be None/empty on fetch failure.
    keyword -- substring marking the start of a new listing
               (default preserves the original hard-coded '龙湖春江').

    Link texts are accumulated into one name per listing; a link whose text
    contains *keyword* starts the next listing, so the running name is
    flushed at that point and once more after the loop.
    """
    if not html:
        return zip([], [])
    soup = BeautifulSoup(html, 'lxml')
    house_names = []
    house_urls = []
    left = soup.find('div', class_='left')
    if left is None:
        # Layout changed or empty result page — avoid AttributeError.
        return zip([], [])
    links = left.find_all('a')
    # Skip URLs that are just the search query itself (the original
    # hard-coded the percent-encoded form of '龙湖春江').
    encoded_keyword = quote(keyword)
    house_name = ''
    for link in links:
        text = link.get_text()
        if keyword in text:
            # A new listing begins: flush the name accumulated so far.
            house_names.append(house_name)
            house_name = ''
        house_url = link.get('href')
        house_name += text + ' '
        if house_url is not None and house_url not in house_urls and encoded_keyword not in house_url:
            house_urls.append(house_url)
    # BUG FIX: the original never flushed the final listing's accumulated
    # name, so the last house was silently dropped by zip().
    if house_name.strip():
        house_names.append(house_name)
    # The first flush fires before any text has accumulated — drop it.
    # Guarding also avoids IndexError when nothing matched (original
    # `del house_names[0]` crashed on an empty list).
    if house_names:
        del house_names[0]
    return zip(house_names, house_urls)
最后,调用上述函数,以杭州「龙湖春江」小区为例:
if __name__ == '__main__':
    # Example run: search both sites for the 龙湖春江 estate in Hangzhou.
    url = 'http://hz.ziroom.com/z/nl/z3-d330108.html?qwd=%E9%BE%99%E6%B9%96%E6%98%A5%E6%B1%9F'
    html = get_html(url)
    if html:
        # BUG FIX: the original called undefined get_hourse(), which raised
        # NameError — the Ziroom parser is named get_ziru_hourse().
        houses = get_ziru_hourse(html)
        print('自如房子:')
        for house in houses:
            print(house[0], house[1], house[2])
    isz_url = 'http://www.ishangzu.com/zufang/?q=%E9%BE%99%E6%B9%96%E6%98%A5%E6%B1%9F'
    html = get_html(isz_url)
    if html:
        houses = get_isz_hourse(html)
        print('爱上租房子:')
        for house in houses:
            print(house[0], house[1])
结果:

网友评论