本文先对链家网最新的武汉二手房数据进行爬取,提取房屋面积和每平米单价,并以逗号分隔存储在txt文件中,最后对数据分别进行线性回归和多项式回归,并绘制相关图查看效果。
1. 数据爬取
import requests
from bs4 import BeautifulSoup
import bs4
import os
def getHTMLText(url, timeout=10):
    """Download a page and return its text, or "" if the request fails.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The decoded response body, or an empty string when the request
        raises a ``requests.RequestException`` (connection error, timeout,
        or a non-2xx status reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Decode with the charset detected from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort download: callers treat "" as "no data for this page".
        return ""
def getAreaPrice(url, path):
    """Append one "area,unit_price" line per listing on a page to a file.

    The page is fetched with ``getHTMLText``. Each tag child of the first
    ``<ul class="sellListContent">`` is treated as one listing: the area is
    taken from the third ``|``-separated field of its ``houseInfo`` div and
    the unit price from its ``unitPrice`` div. Listings whose fields are
    missing or not numeric are skipped.

    Args:
        url: Address of the listing page to scrape.
        path: Text file that result lines are appended to.

    Raises:
        OSError: If ``path`` cannot be opened for appending.
    """
    html = getHTMLText(url)
    soup = BeautifulSoup(html, 'html.parser')
    listings = soup.find('ul', class_="sellListContent")
    if listings is None:
        # Download failed or the page has no listing container.
        return
    with open(path, "a", encoding="utf-8") as f:
        for li in listings.children:
            # Children also include bare text nodes; only tags are listings.
            if not isinstance(li, bs4.element.Tag):
                continue
            try:
                # Slices drop the trailing unit text around the numbers.
                area = li('div', class_="houseInfo")[0].text.split('|')[2][:-3]
                price = li('div', class_="unitPrice")[0].string[2:-4]
                line = "{},{}\n".format(float(area), int(price))
            except (IndexError, TypeError, ValueError):
                # Missing div, empty .string, or non-numeric text: skip it.
                continue
            f.write(line)
if __name__ == '__main__':
    path = "prices.txt"
    # getAreaPrice appends, so remove any file left over from an earlier run.
    if os.path.exists(path):
        os.remove(path)
    pageNum = 97
    # Request pg1..pg97. Counting from 0 asked for "pg0" and never reached
    # the last page. NOTE(review): assumes the site numbers pages from 1.
    for i in range(1, pageNum + 1):
        url = "https://wh.lianjia.com/ershoufang/pg" + str(i)
        print("\r房价数据正在下载,当前进度: {:.2f}%".format(i * 100 / pageNum), end="")
        getAreaPrice(url, path)
爬取的原始数据,2908条
2. 回归分析
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
# Load the "area,unit_price" pairs from prices.txt.
datasets_X = []
datasets_Y = []
with open('prices.txt', 'r', encoding='utf-8') as fr:
    for line in fr:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        items = line.split(',')
        datasets_X.append(float(items[0]))
        datasets_Y.append(int(items[1]))

length = len(datasets_X)
# scikit-learn expects a 2-D feature matrix: one row per sample, one column.
datasets_X = np.array(datasets_X).reshape([length, 1])
datasets_Y = np.array(datasets_Y)

# Use the array reductions so the bounds are scalars; builtin min()/max() on
# an (n, 1) array return 1-element arrays, which np.arange should not be given.
minX = datasets_X.min()
maxX = datasets_X.max()
# Evenly spaced areas (step 1) over the observed range, used to draw the fits.
X = np.arange(minX, maxX).reshape([-1, 1])

# Linear regression
linear = linear_model.LinearRegression()
linear.fit(datasets_X, datasets_Y)

# Polynomial regression (degree 4): expand features, then fit a linear model.
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(datasets_X)
lin_reg_2 = linear_model.LinearRegression()
lin_reg_2.fit(X_poly, datasets_Y)

# Plot the samples and both fitted curves.
plt.figure(figsize=(12, 7))  # set the figure size
plt.scatter(datasets_X, datasets_Y, color='green', marker='.')
plt.plot(X, linear.predict(X), color='red')
# The transformer is already fitted on the training data; only transform here.
plt.plot(X, lin_reg_2.predict(poly_reg.transform(X)), color='blue')
plt.xlabel('Area')
plt.ylabel('Price')
plt.show()
回归图
结论:如上图所示,绿色点为样本散点,红色直线是线性回归结果,蓝色曲线是多项式回归结果,可以很直观地看出多项式曲线拟合得更好。从图中还可以看出,大多数二手房面积在80-110平米之间,同时该区间的每平米单价基本是最低的;小户型的单价略有提升,而随着户型增大,单价上升的趋势比较明显。
网友评论