美文网首页
【机器学习】:房价预测(实例)

【机器学习】:房价预测(实例)

作者: Alcazar | 来源:发表于2019-09-15 08:25 被阅读0次

    一、数据集处理

    1、数据集导入

    # 读取房价数据
    import pandas as pd
    
    def load_housing_data(csv_path='./housing.csv'):
        """Load the housing dataset from a CSV file into a DataFrame.

        Args:
            csv_path: Location of the CSV file. Defaults to ``./housing.csv``
                so existing zero-argument calls keep working.

        Returns:
            The ``pandas.DataFrame`` produced by ``pd.read_csv``.
        """
        return pd.read_csv(csv_path)
    

    2、查看前五行数据

    housing = load_housing_data()
    # 查看前五行数据
    housing.head()
    
    房价预测

    3、获取数据的简单描述

    # 获取数据的简单描述
    housing.info()
    

    输出数据描述

    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 20640 entries, 0 to 20639
    Data columns (total 10 columns):
    longitude             20640 non-null float64
    latitude              20640 non-null float64
    housing_median_age    20640 non-null float64
    total_rooms           20640 non-null float64
    total_bedrooms        20433 non-null float64
    population            20640 non-null float64
    households            20640 non-null float64
    median_income         20640 non-null float64
    median_house_value    20640 non-null float64
    ocean_proximity       20640 non-null object
    dtypes: float64(9), object(1)
    memory usage: 1.6+ MB
    

    3、查看ocean_proximity数据的取值范围及各个值的数量

    # 查看ocean_proximity数据的取值范围
    housing['ocean_proximity'].value_counts()
    

    输出数据:

    <1H OCEAN     9136
    INLAND        6551
    NEAR OCEAN    2658
    NEAR BAY      2290
    ISLAND           5
    Name: ocean_proximity, dtype: int64
    

    4、了解数据的基本情况

    # 了解数据的基本情况
    housing.describe()
    

    输出图例:


    5、绘制每个属性的直方图,来快速了解数据。

    # 绘制每个属性的直方图,来快速了解数据。
    %matplotlib inline
    import matplotlib.pyplot as plt
    
    housing.hist(bins=50,figsize=(20,15))
    plt.show()
    
    Image.png

    二、 测试集处理

    import numpy as np
    
    def split_train_test(data, test_radio, seed=42):
        """Randomly split ``data`` into a training set and a test set.

        Args:
            data: DataFrame to split; rows are selected by position.
            test_radio: Fraction of the rows placed in the test set (the
                parameter spelling is kept for backward compatibility).
            seed: Seed for the shuffle. The default of 42 reproduces the
                split obtained from the previous hard-coded
                ``np.random.seed(42)``.

        Returns:
            A ``(train_set, test_set)`` tuple of DataFrames.
        """
        # A private generator keeps the split reproducible without reseeding
        # NumPy's global random state as a side effect of every call.
        rng = np.random.RandomState(seed)
        # Random permutation of the row positions (no repeats).
        shuffled_indices = rng.permutation(len(data))
        test_set_size = int(len(data) * test_radio)
        # The first test_set_size shuffled positions form the test set ...
        test_indices = shuffled_indices[:test_set_size]
        # ... and the remaining positions form the training set.
        train_indices = shuffled_indices[test_set_size:]

        return data.iloc[train_indices], data.iloc[test_indices]
    

    输出训练集和测试集长度

    train_set,test_set = split_train_test(housing, 0.2)
    
    len(train_set),len(test_set)
    

    输出:

    (16512, 4128)

    使用 hashlib

    import hashlib
    # 选择测试集,方式2.无惧新增数据
    def test_set_check(identifier, test_radio, hash):
        return hash(np.int64(identifier)).digest()[-1] < (256 * test_radio)
    
    def split_train_test_by_id(data, test_radio,id_column,hash=hashlib.md5):
        """Split ``data`` into train and test sets based on a hash of each row's id.

        Every value in ``data[id_column]`` is passed to ``test_set_check``
        together with ``test_radio`` and ``hash``; rows for which it returns
        true go to the test set, all other rows go to the training set.

        Returns:
            A ``(train_set, test_set)`` tuple selected with ``data.loc``.
        """
        def _belongs_to_test(row_id):
            return test_set_check(row_id, test_radio, hash)

        test_mask = data[id_column].apply(_belongs_to_test)
        train_part = data.loc[~test_mask]
        test_part = data.loc[test_mask]
        return train_part, test_part
    
    # 为数据添加index列
    housing_width_id = housing.reset_index()
    
    train_set, test_set = split_train_test_by_id(housing_width_id,0.2,"index")
    
    len(train_set),len(test_set)
    

    输出训练集和测试集长度如上。(16512, 4128)

    判断:取最后一个字节,是否小于(256 * 0.2)

    hashlib.md5(np.int64(0)).digest()[-1] < (256 * 0.2)
    

    返回 False。


    根据 median_income 划分收入类别(income_cat),并绘制其柱状图
    from sklearn.model_selection import train_test_split
    
    train_set , test_set = train_test_split(housing,test_size=0.2,random_state=42)
    
    # len(train_set),len(test_set)
    
    # Create the income category column before it is used: dividing the median
    # income by 1.5 limits the number of categories and np.ceil makes them
    # discrete.
    housing['income_cat'] = np.ceil(housing['median_income'] / 1.5)
    # Merge every category of 5 and above into category 5. The result is
    # assigned back instead of using inplace=True on a column selection,
    # which pandas may apply to a temporary copy.
    housing['income_cat'] = housing['income_cat'].where(housing['income_cat'] < 5, 5.0)

    housing['income_cat'].hist(bins=5,histtype='stepfilled')
    
    房价预测

    分层随机抽取测试集

    from sklearn.model_selection import StratifiedShuffleSplit
    
    # Stratified random sampling of the test set: a single split that puts 20%
    # of the rows in the test set, stratified on the income_cat column.
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # split() is iterated below for (train_index, test_index) pairs.
    result = split.split(housing,housing['income_cat'])
    for train_index,test_index in result:
        # NOTE(review): .loc is label-based; presumably housing still has its
        # default RangeIndex so labels equal positions - confirm.
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
    
    len(strat_test_set),len(strat_train_set) 
    
    查看每个类别在总数中所占比例。
    housing['income_cat'].value_counts() / len(housing)
    '''
    3.0    0.350581
    2.0    0.318847
    4.0    0.176308
    5.0    0.114438
    1.0    0.039826
    Name: income_cat, dtype: float64
    '''
    

    每个类别在测试集中的数据比例

    strat_test_set['income_cat'].value_counts()/len(strat_test_set)
    '''
    3.0    0.350533
    2.0    0.318798
    4.0    0.176357
    5.0    0.114583
    1.0    0.039729
    Name: income_cat, dtype: float64
    '''
    

    删除收入类别列

    # The loop variable is deliberately not called `set`: that name would
    # shadow the builtin and stay rebound to a DataFrame after the loop.
    for subset in (strat_test_set, strat_train_set):
        # With inplace=True the frame itself is modified and drop() returns None.
        subset.drop(['income_cat'], axis=1, inplace=True)
    strat_train_set.head()
    
    

    探索数据

    housing = strat_train_set.copy()
    
    # Visualize the geographic data: one point per row at (longitude, latitude).
    # The marker size comes from population / 100, the color from
    # median_house_value through the "jet" colormap, and alpha=0.4 makes the
    # points semi-transparent.
    housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.4,
                 s=housing['population'] / 100,label='population',c="median_house_value",
                cmap=plt.get_cmap("jet"),colorbar=True)
    plt.legend()
    plt.show()
    
    寻找特征之间的相关性

    【相关系数】:用来衡量两组变量之间的相关性,取值范围是 -1 到 1。越接近 1 表示越正相关;越接近 -1 表示越负相关;越接近 0 表示越不相关。

    计算相关系数矩阵
    # Correlate the numeric columns only: ocean_proximity holds text, and
    # recent pandas versions raise instead of silently skipping such columns.
    corr_matrix = housing.select_dtypes(include='number').corr()
    
    corr_matrix["median_house_value"]
    

    使用 Pandas 的 scatter_matrix() 函数
    绘制每个属性相对于其他属性的相关性。

    from pandas.plotting import scatter_matrix
    attributes = ["median_house_value","median_income","total_rooms","housing_median_age"]
    scatter_matrix(housing[attributes],figsize=(12,8))
    plt.show()
    
    housing["rooms_per_household"] = housing['total_rooms'] / housing["households"]
    housing["bedrooms_per_room"] =  housing['total_bedrooms'] / housing["total_rooms"]
    housing["population_per_household"] = housing['population'] / housing["households"]
    
    # Correlate the numeric columns only: ocean_proximity holds text, and
    # recent pandas versions raise instead of silently skipping such columns.
    corr_matrix = housing.select_dtypes(include='number').corr()
    # Strongest positive correlation with the house value first.
    corr_matrix["median_house_value"].sort_values(ascending=False)
    
    输出:
    '''
    median_house_value          1.000000
    median_income               0.687160
    rooms_per_household         0.146285
    total_rooms                 0.135097
    housing_median_age          0.114110
    households                  0.064506
    total_bedrooms              0.047689
    population_per_household   -0.021985
    population                 -0.026920
    longitude                  -0.047432
    latitude                   -0.142724
    bedrooms_per_room          -0.259984
    Name: median_house_value, dtype: float64
    '''
    

    数据准备

    【问】:如何处理缺失数据?

    • 放弃有缺失值的数据
    # subset= limits the missing-value check to total_bedrooms; passed
    # positionally, the list would be taken as the `axis` argument.
    housing.dropna(subset=['total_bedrooms'])
    
    • 放弃这个属性
    housing.drop("total_bedrooms",axis=1)
    
    • 将缺失值设置为某个值,例如平均值
    mean = housing['total_bedrooms'].mean()
    housing['total_bedrooms'].fillna(mean)
    
    # 获取训练数据(去除标签)
    housing = strat_train_set.drop("median_house_value",axis=1)
    # 获取训练数据的标签
    housing_labels = strat_train_set['median_house_value'].copy()
    
    ## Scikit-Learn 提供了容易的缺失值处理方式:imputer
    
    # sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
    # removed in 0.22; sklearn.impute.SimpleImputer is its replacement.
    # NOTE: the repr echoed by fit() therefore differs from the legacy
    # Imputer(...) text shown in the sample output.
    from sklearn.impute import SimpleImputer
    # Drop the text attribute ocean_proximity: a median only exists for
    # numeric columns.
    housing_num = housing.drop('ocean_proximity',axis=1)
    imputer = SimpleImputer(strategy="median")

    # Learn the fill value (the median) of every remaining attribute.
    imputer.fit(housing_num)
    
    '''输出:
    Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)
    '''
    
    每个属性要替换的值
    imputer.statistics_
    
    '''输出:
    array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
            408.    ,    3.5409])
    '''
    

    数据填充

    # 完成填充,结果是Numpy数组
    X = imputer.transform(housing_num)
    
    # 将Numpy数组转换回DataFrame格式
    housing_tr = pd.DataFrame(X,columns=housing_num.columns)
    housing_tr.info()
    
    '''输出:
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 16512 entries, 0 to 16511
    Data columns (total 8 columns):
    longitude             16512 non-null float64
    latitude              16512 non-null float64
    housing_median_age    16512 non-null float64
    total_rooms           16512 non-null float64
    total_bedrooms        16512 non-null float64
    population            16512 non-null float64
    households            16512 non-null float64
    median_income         16512 non-null float64
    dtypes: float64(8)
    memory usage: 1.0 MB
    '''
    

    Scikit-Learn API设计

    Scikit-Learn API设计具有较高的一致性,一般来说Scikit-Learn由以下模块组成:

    估算器

    【解释】:能够根据数据集对某些参数进行估算的任意对象,都可以称为估算器(例如:Imputer)。
    估算由 fit() 方法执行,该方法需要一个数据集作为参数;对于监督式学习,则需要两个参数,第二个是包含标签的数据集。创建估算器时,需要指定估算策略(超参数),例如:strategy='mean'。

    转换器:

    【解释】:可以用来转换数据集的估算器,称为转换器 。API一般包括:

    • transform() : 该方法传入一个待转换的数据集,返回一个转换后的数据集。

    所有的转换器都有一个方法:

    • fit_transform(): 相当于先调用了 fit() 方法,再调用了 transform() 方法。并且这个经过了优化,会更快。
    预测器

    【解释】:能够基于一个给定的数据集进行预测的估算器,称为预测器。
    预测器有一个predict()方法,该方法接受一个新的实例数据集,返回一个包含相应预测的数据集。 还存在一个 score() 方法,可以用来衡量给定测试集的预测质量。

    处理文本和分类属性

    # 将分类文本转换为数字形式
    from sklearn.preprocessing import LabelEncoder
    encoder = LabelEncoder()
    housing_cat = housing['ocean_proximity']
    housing_cat_encoded = encoder.fit_transform(housing_cat)
    
    # 转换后的值
    housing_cat_encoded
    
    # 查看转换器学习的映射关系,'<1H OCEAN' 是0
    encoder.classes_
    
    '''(输出)
    array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
          dtype=object)
    '''
    
    housing_cat_encoded.reshape(-1,1)
    '''(输出)
    array([[0],
           [0],
           [4],
           ...,
           [1],
           [0],
           [3]])
    '''
    

    独热编码

    # 将数字形式的编码,转换为独热编码
    from sklearn.preprocessing import OneHotEncoder
    encoder = OneHotEncoder(categories='auto')
    housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
    # 返回的结果是一个稀疏矩阵,以节省内存空间。可以调用.toarray()方法,将其转换为numpy数组。
    housing_cat_1hot
    
    了解独热编码?稀疏矩阵

    可以一次将文本转换为独热编码

    LabelBinarizer 转换器

    from sklearn.preprocessing import LabelBinarizer
    encoder = LabelBinarizer()
    housing_cat_1hot = encoder.fit_transform(housing_cat)
    housing_cat_1hot
    
    '''(输出)
    array([[1, 0, 0, 0, 0],
           [1, 0, 0, 0, 0],
           [0, 0, 0, 0, 1],
           ...,
           [0, 1, 0, 0, 0],
           [1, 0, 0, 0, 0],
           [0, 0, 0, 1, 0]])
    '''
    

    自定义转换器

    转换器需要实现三个方法:fit() (返回自身) 、transform() 、fit_transform()
    【问】:添加 TransformerMixin 作为基类有什么好处?
    【答】:可以直接获得 fit_transform() 方法,无需自己实现。
    【知识补充】:如果再将 BaseEstimator 作为基类(并且在构造方法中不使用 *args、**kwargs 参数),还可以获得 get_params() 和 set_params() 方法(用来调整超参数)。

    from sklearn.base import BaseEstimator,TransformerMixin
    rooms_ix, bedrooms_ix,population_ix, househould_ix = 3, 4, 5, 6
    
    class CombineAttributesAdder(BaseEstimator,TransformerMixin):
        """Transformer that appends ratio attributes as extra columns.

        ``transform`` indexes its input as ``X[:, i]`` and reads the columns
        at the module-level positions ``rooms_ix``, ``bedrooms_ix``,
        ``population_ix`` and ``househould_ix``.

        Args:
            add_bedrooms_per_room: When true, a bedrooms-per-room column is
                appended after the two per-household columns.
        """

        def __init__(self,add_bedrooms_per_room = True):
            self.add_bedrooms_per_room = add_bedrooms_per_room

        def fit(self, X, Y = None):
            """Nothing is learned from the data; return the transformer itself."""
            return self

        def transform(self, X, Y = None):
            """Return ``X`` with the derived ratio columns appended on the right."""
            household_col = X[:, househould_ix]
            room_col = X[:, rooms_ix]
            per_household_rooms = room_col / household_col
            per_household_population = X[:, population_ix] / household_col

            # The rooms-per-household values are echoed to stdout.
            print(per_household_rooms)
            if not self.add_bedrooms_per_room:
                return np.c_[X, per_household_rooms, per_household_population]

            per_room_bedrooms = X[:, bedrooms_ix] / room_col
            # Concatenate the new attributes column-wise after the originals.
            return np.c_[X, per_household_rooms, per_household_population, per_room_bedrooms]
            
    
    attr_adder = CombineAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)
    
    housing_extra_attribs
    

    输出:

    [4.625368731563422 6.008849557522124 4.225108225108225 ...
     6.34640522875817 5.50561797752809 4.843505477308295]
    Out[33]:
    array([[-121.89, 37.29, 38.0, ..., '<1H OCEAN', 4.625368731563422,
            2.094395280235988],
           [-121.93, 37.05, 14.0, ..., '<1H OCEAN', 6.008849557522124,
            2.7079646017699117],
           [-117.2, 32.77, 31.0, ..., 'NEAR OCEAN', 4.225108225108225,
            2.0259740259740258],
           ...,
           [-116.4, 34.09, 9.0, ..., 'INLAND', 6.34640522875817,
            2.742483660130719],
           [-118.01, 33.82, 31.0, ..., '<1H OCEAN', 5.50561797752809,
            3.808988764044944],
           [-122.45, 37.77, 52.0, ..., 'NEAR BAY', 4.843505477308295,
            1.9859154929577465]], dtype=object)
    

    相关文章

      网友评论

          本文标题:【机器学习】:房价预测(实例)

          本文链接:https://www.haomeiwen.com/subject/rqoasctx.html