# Reference: http://blog.csdn.net/leeafay/article/details/76167189
# Uses the Python libraries BeautifulSoup and pandas.
# Development environment: PyCharm
# Python version: 3.6.3
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
# BeautifulSoup approach
# Load the HTML page
def get_content(url):
    """Download *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address to fetch; passed unchanged to
            ``urllib.request.urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urllib.request.urlopen`` is propagated.
    """
    # The context manager closes the response even when read() or decode()
    # raises; a trailing close() call would be skipped in that case.
    with urllib.request.urlopen(url) as html:
        return html.read().decode("utf-8")
def save_to_file(file_name, contents, encoding="utf-8"):
    """Write *contents* to *file_name*, replacing any existing file.

    Args:
        file_name: Path of the file to create or overwrite.
        contents: Text to write.
        encoding: Text encoding of the output file. Defaults to UTF-8 so
            that non-ASCII page text can be written regardless of the
            platform's default encoding.
    """
    # "with" guarantees the handle is closed even if write() raises.
    with open(file_name, "w", encoding=encoding) as fh:
        fh.write(contents)
def get_txt(info):
    """Extract the headline Lianjia housing figures from a page's HTML.

    The values are read from the text of the matched elements rather than
    from their ``str()`` markup: ``str.strip`` removes a *set of characters*
    from both ends, not a substring, so it cannot be used to peel tags or
    labels off reliably.

    NOTE(review): the selectors and element positions below are the ones the
    original code used; the actual page layout is not visible here, so
    confirm them against the live page.

    Args:
        info: HTML source of the price page, as a string.

    Returns:
        A 6-tuple of strings, in this order:
        listing average price, homes on sale, homes sold in the last 90
        days, yesterday's new listings, yesterday's new customers,
        yesterday's viewings.

    Raises:
        IndexError: If the page has fewer ``.txt`` or ``.num`` elements than
            the positions read below.
    """
    soup = BeautifulSoup(info, "lxml")  # parse with the "lxml" parser

    # Lianjia price data.
    # (The month label, selector '.qushi-1', was already disabled in the
    # original code and is not extracted.)

    # Listing average price: text of every match joined with ", "; an empty
    # string when nothing matches.
    price_nodes = soup.select('.qushi-2 > .num')
    saverage_price = ", ".join(
        node.get_text(strip=True) for node in price_nodes
    )

    # Lianjia listing counts.
    total = soup.select('.txt')
    # Homes currently on sale.
    stotal1 = _strip_label(total[1].get_text(strip=True), '在售房源', '套')
    # Homes sold within the last 90 days.
    stotal2 = _strip_label(
        total[2].get_text(strip=True), '最近90天内成交房源', '套'
    )

    # Yesterday's activity figures.
    add = soup.select('.num')
    sadd1 = add[1].get_text(strip=True)  # new listings yesterday
    sadd2 = add[2].get_text(strip=True)  # new customers yesterday
    sadd3 = add[3].get_text(strip=True)  # viewings yesterday

    return saverage_price, stotal1, stotal2, sadd1, sadd2, sadd3


def _strip_label(text, prefix, suffix):
    """Return *text* without an exact leading *prefix* and trailing *suffix*."""
    if prefix and text.startswith(prefix):
        text = text[len(prefix):]
    if suffix and text.endswith(suffix):
        text = text[:-len(suffix)]
    return text.strip()
# Fetch the Hangzhou price page and pull the six headline figures out of it.
url = "https://hz.lianjia.com/fangjia/"
content = get_content(url)
c = get_txt(content)

# Column labels, listed in the same order as the values returned by get_txt.
column_names = [
    "average_price",
    'house num on sale',
    'recent 90 days',  # homes sold in the last 90 days
    'new house num last day',
    'new guest last day',
    'new visit last day',
]

# Build the one-row table in a single step; "columns" pins the column order.
df = pd.DataFrame(
    {label: [c[position]] for position, label in enumerate(column_names)},
    columns=column_names,
)
print(df)
df.to_csv('/Users/wzzhou/Desktop/test.csv')
# 网友评论 (reader comments: a leftover heading from the source web page, not part of the script)