第七次尝试

作者: Evas77 | 来源:发表于2017-10-28 21:11 被阅读4次

    第七课:案例分析

    import pandas as pd
    import numpy as np
    import scipy.stats
    import matplotlib.pyplot as plt
    import matplotlib
    matplotlib.style.use('ggplot')  
    
    %matplotlib inline
    %config InlineBackend.figure_format = 'retina'
    
    ls data 
    
     驱动器 C 中的卷没有标签。
     卷的序列号是 46F0-2618
    
     C:\Users\Eva's\Desktop\python课件\第七课材料\第七课材料\codes\data 的目录
    
    2017/09/14  21:51    <DIR>          .
    2017/09/14  21:51    <DIR>          ..
    2017/09/03  23:30             6,148 .DS_Store
    2017/09/03  23:30            14,142 data_75_12.csv
    2017/09/03  23:30             3,930 evolution.csv
    2017/09/03  23:30             8,568 fortis_heritability.csv
                   4 个文件         32,788 字节
                   2 个目录  6,175,817,728 可用字节
    

    随时间的演化

    数据导入和清洗

    evolution = pd.read_csv('data/evolution.csv')
    
    evolution.head()
    

    <div>
    <style>
    .dataframe thead tr:only-child th {
    text-align: right;
    }

    .dataframe thead th {
        text-align: left;
    }
    
    .dataframe tbody tr th {
        vertical-align: top;
    }
    

    </style>
    <table border="1" class="dataframe">
    <thead>
    <tr style="text-align: right;">
    <th></th>
    <th>Year</th>
    <th>Species</th>
    <th>Beak length</th>
    <th>Beak depth</th>
    <th>Beak width</th>
    <th>CI Beak length</th>
    <th>CI Beak depth</th>
    <th>CI Beak width</th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <th>0</th>
    <td>1973</td>
    <td>fortis</td>
    <td>10.76</td>
    <td>9.48</td>
    <td>8.69</td>
    <td>0.097</td>
    <td>0.13</td>
    <td>0.081</td>
    </tr>
    <tr>
    <th>1</th>
    <td>1974</td>
    <td>fortis</td>
    <td>10.72</td>
    <td>9.42</td>
    <td>8.66</td>
    <td>0.144</td>
    <td>0.17</td>
    <td>0.112</td>
    </tr>
    <tr>
    <th>2</th>
    <td>1975</td>
    <td>fortis</td>
    <td>10.57</td>
    <td>9.19</td>
    <td>8.55</td>
    <td>0.075</td>
    <td>0.084</td>
    <td>0.057</td>
    </tr>
    <tr>
    <th>3</th>
    <td>1976</td>
    <td>fortis</td>
    <td>10.64</td>
    <td>9.23</td>
    <td>8.58</td>
    <td>0.048</td>
    <td>0.053</td>
    <td>0.039</td>
    </tr>
    <tr>
    <th>4</th>
    <td>1977</td>
    <td>fortis</td>
    <td>10.73</td>
    <td>9.35</td>
    <td>8.63</td>
    <td>0.085</td>
    <td>0.092</td>
    <td>0.066</td>
    </tr>
    </tbody>
    </table>
    </div>

    evolution.info()
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 83 entries, 0 to 82
    Data columns (total 8 columns):
    Year              82 non-null object
    Species           81 non-null object
    Beak length       81 non-null object
    Beak depth        80 non-null float64
    Beak width        80 non-null float64
    CI Beak length    80 non-null object
    CI Beak depth     80 non-null object
    CI Beak width     80 non-null object
    dtypes: float64(2), object(6)
    memory usage: 5.3+ KB
    
    evolution = evolution.dropna()
    
    evolution.info()
    
    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 80 entries, 0 to 82
    Data columns (total 8 columns):
    Year              80 non-null object
    Species           80 non-null object
    Beak length       80 non-null object
    Beak depth        80 non-null float64
    Beak width        80 non-null float64
    CI Beak length    80 non-null object
    CI Beak depth     80 non-null object
    CI Beak width     80 non-null object
    dtypes: float64(2), object(6)
    memory usage: 5.6+ KB
    
    evolution['Beak length'] = pd.to_numeric(evolution['Beak length']) 
    
    evolution['CI Beak length'] = pd.to_numeric(evolution['CI Beak length'], errors='coerce')  
    evolution['CI Beak depth'] = pd.to_numeric(evolution['CI Beak depth'], errors='coerce')
    evolution['CI Beak width'] = pd.to_numeric(evolution['CI Beak width'], errors='coerce')
    
    evolution['Year'] = pd.to_datetime(evolution['Year']) 
    
    evolution.info()
    
    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 80 entries, 0 to 82
    Data columns (total 8 columns):
    Year              80 non-null datetime64[ns]
    Species           80 non-null object
    Beak length       80 non-null float64
    Beak depth        80 non-null float64
    Beak width        80 non-null float64
    CI Beak length    79 non-null float64
    CI Beak depth     79 non-null float64
    CI Beak width     79 non-null float64
    dtypes: datetime64[ns](1), float64(6), object(1)
    memory usage: 5.6+ KB
    

    数据探索

    evolution.Species.value_counts() 
    
    scandens    40
    fortis      40
    Name: Species, dtype: int64
    
    
    fortis = evolution[evolution.Species == 'fortis']     
    scandens = evolution[evolution.Species == 'scandens'] 
    
    fortis.plot(x='Year', y = ['Beak length', 'Beak depth', 'Beak width'])
    
    <matplotlib.axes._subplots.AxesSubplot at 0x113096828>
    
    output_17_1.png
    scandens.plot(x='Year', y = ['Beak length', 'Beak depth', 'Beak width'])
    
    <matplotlib.axes._subplots.AxesSubplot at 0x11574a2e8>
    
    output_18_1.png
    scandens.plot(x='Year', y = ['Beak length', 'Beak depth', 'Beak width'], subplots=True, figsize=(10, 6))
    
    array([<matplotlib.axes._subplots.AxesSubplot object at 0x11624fc50>,
           <matplotlib.axes._subplots.AxesSubplot object at 0x1164974e0>,
           <matplotlib.axes._subplots.AxesSubplot object at 0x1164a8940>], dtype=object)
    
    output_19_1.png
    fortis.plot(x='Year', y ='Beak depth', yerr='CI Beak depth', marker='o', figsize=(10,5), color='DarkBlue')
    
    <matplotlib.axes._subplots.AxesSubplot at 0x116752b38>
    
    output_20_1.png
    scandens.plot(x='Year', y='Beak depth', yerr='CI Beak depth', marker='o', figsize=(10,5), color='DarkBlue')
    
    <matplotlib.axes._subplots.AxesSubplot at 0x115703160>
    
    output_21_1.png

    1975年和2012年数据的比较

    数据探索

    data = pd.read_csv('data/data_75_12.csv')
    
    data.head()
    

    <div>
    <style>
    .dataframe thead tr:only-child th {
    text-align: right;
    }

    .dataframe thead th {
        text-align: left;
    }
    
    .dataframe tbody tr th {
        vertical-align: top;
    }
    

    </style>
    <table border="1" class="dataframe">
    <thead>
    <tr style="text-align: right;">
    <th></th>
    <th>species</th>
    <th>length</th>
    <th>depth</th>
    <th>year</th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <th>0</th>
    <td>fortis</td>
    <td>9.4</td>
    <td>8.0</td>
    <td>1975</td>
    </tr>
    <tr>
    <th>1</th>
    <td>fortis</td>
    <td>9.2</td>
    <td>8.3</td>
    <td>1975</td>
    </tr>
    <tr>
    <th>2</th>
    <td>fortis</td>
    <td>9.5</td>
    <td>7.5</td>
    <td>1975</td>
    </tr>
    <tr>
    <th>3</th>
    <td>fortis</td>
    <td>9.5</td>
    <td>8.0</td>
    <td>1975</td>
    </tr>
    <tr>
    <th>4</th>
    <td>fortis</td>
    <td>11.5</td>
    <td>9.9</td>
    <td>1975</td>
    </tr>
    </tbody>
    </table>
    </div>

    data.info()
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 651 entries, 0 to 650
    Data columns (total 4 columns):
    species    651 non-null object
    length     651 non-null float64
    depth      651 non-null float64
    year       651 non-null int64
    dtypes: float64(2), int64(1), object(1)
    memory usage: 20.4+ KB
    
    data.groupby(['species', 'year']).count() 
    

    <div>
    <style>
    .dataframe thead tr:only-child th {
    text-align: right;
    }

    .dataframe thead th {
        text-align: left;
    }
    
    .dataframe tbody tr th {
        vertical-align: top;
    }
    

    </style>
    <table border="1" class="dataframe">
    <thead>
    <tr style="text-align: right;">
    <th></th>
    <th></th>
    <th>length</th>
    <th>depth</th>
    </tr>
    <tr>
    <th>species</th>
    <th>year</th>
    <th></th>
    <th></th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <th rowspan="2" valign="top">fortis</th>
    <th>1975</th>
    <td>316</td>
    <td>316</td>
    </tr>
    <tr>
    <th>2012</th>
    <td>121</td>
    <td>121</td>
    </tr>
    <tr>
    <th rowspan="2" valign="top">scandens</th>
    <th>1975</th>
    <td>87</td>
    <td>87</td>
    </tr>
    <tr>
    <th>2012</th>
    <td>127</td>
    <td>127</td>
    </tr>
    </tbody>
    </table>
    </div>

    data.groupby(['species', 'year']).mean() 
    

    <div>
    <style>
    .dataframe thead tr:only-child th {
    text-align: right;
    }

    .dataframe thead th {
        text-align: left;
    }
    
    .dataframe tbody tr th {
        vertical-align: top;
    }
    

    </style>
    <table border="1" class="dataframe">
    <thead>
    <tr style="text-align: right;">
    <th></th>
    <th></th>
    <th>length</th>
    <th>depth</th>
    </tr>
    <tr>
    <th>species</th>
    <th>year</th>
    <th></th>
    <th></th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <th rowspan="2" valign="top">fortis</th>
    <th>1975</th>
    <td>10.565190</td>
    <td>9.171646</td>
    </tr>
    <tr>
    <th>2012</th>
    <td>10.517355</td>
    <td>8.605372</td>
    </tr>
    <tr>
    <th rowspan="2" valign="top">scandens</th>
    <th>1975</th>
    <td>14.120920</td>
    <td>8.960000</td>
    </tr>
    <tr>
    <th>2012</th>
    <td>13.421024</td>
    <td>9.186220</td>
    </tr>
    </tbody>
    </table>
    </div>

    
    fortis2 = data[data.species == 'fortis']
    scandens2 = data[data.species == 'scandens']
    
    
    fortis2.boxplot(by='year', figsize = (10,6))
    
    array([<matplotlib.axes._subplots.AxesSubplot object at 0x11745d748>,
           <matplotlib.axes._subplots.AxesSubplot object at 0x1178591d0>], dtype=object)
    
    output_30_1.png
    scandens2.boxplot(by='year', figsize = (10,6))
    
    array([<matplotlib.axes._subplots.AxesSubplot object at 0x1179b3278>,
           <matplotlib.axes._subplots.AxesSubplot object at 0x117df4048>], dtype=object)
    
    output_31_1.png

    中地雀鸟喙深度和长度的变化

    
    fortis1975 = fortis2[fortis2.year == 1975]
    fortis2012 = fortis2[fortis2.year == 2012]
    
    fortis1975.depth.mean()
    
    9.171645569620255
    
    fortis2012.depth.mean()
    
    8.605371900826446
    

    鸟喙深度95%置信区间

    def mean_ci(data):    
        '''给定样本数据,计算均值95%的置信区间'''        
        sample_size = len(data)    
        std = np.std(data, ddof=1)     
        se = std / np.sqrt(sample_size)        
        point_estimate = np.mean(data)      
        z_score = scipy.stats.norm.isf(0.025) 
        confidence_interval = (point_estimate - z_score * se, point_estimate + z_score * se)    
        
        return confidence_interval
    
    
    mean_ci(fortis1975.depth)
    
    (9.0903471711839057, 9.2529439680566039)
    
    
    mean_ci(fortis2012.depth)
    
    (8.4748436937850755, 8.7359001078678169)
    

    鸟喙深度差值的95%置信区间

    diff_mean = fortis2012.depth.mean() - fortis1975.depth.mean()
    diff_mean
    
    -0.5662736687938086
    
    def draw_bs_reps(data, func, size=1):
        '''bootstrap方法'''
    
        
        bs_replicates = np.empty(size)
    
       
        for i in range(size):
            bs_replicates[i] = func(np.random.choice(data, size=len(data)))
    
        return bs_replicates
    
    
    bs1975 = draw_bs_reps(fortis1975.depth, np.mean, 10000)
    bs2012 = draw_bs_reps(fortis2012.depth, np.mean, 10000)
    
    
    bs_diff = bs2012 - bs1975
    
    
    conf_int = np.percentile(bs_diff, [2.5, 97.5])
    
    
    print('均值的差: ', diff_mean, 'mm')
    print('95% 置信区间:', conf_int, 'mm')
    
    均值的差:  -0.5662736687938086 mm
    95% 置信区间: [-0.71819592 -0.41113047] mm
    

    鸟喙长度差值的95%置信区间

    
    diff_mean = fortis2012.length.mean() - fortis1975.length.mean()
    
    
    bs1975 = draw_bs_reps(fortis1975.length, np.mean, 10000)
    bs2012 = draw_bs_reps(fortis2012.length, np.mean, 10000)
    
    
    bs_diff = bs2012 - bs1975
    
    
    conf_int = np.percentile(bs_diff, [2.5, 97.5])
    
    
    print('均值的差: ', diff_mean, 'mm')
    print('95% 置信区间:', conf_int, 'mm')
    
    均值的差:  -0.047834501516897276 mm
    95% 置信区间: [-0.209585   0.1162503] mm
    

    假设检验判断鸟喙深度和长度是否有改变(t分布)

    
    scipy.stats.ttest_ind(fortis2012.depth, fortis1975.depth)
    
    Ttest_indResult(statistic=-7.196495054344143, pvalue=2.727142579713897e-12)
    

    原假设为1975年和2012年中地雀鸟喙深度均值相同,取$\alpha = 0.01$, 因为 $pvalue < \alpha$, 所以原假设不成立,得到结论是:2012年和1975年相比,中地雀的鸟喙深度地雀发生了变化。

    scipy.stats.ttest_ind(fortis2012.length, fortis1975.length)
    
    Ttest_indResult(statistic=-0.62821616808278724, pvalue=0.53019202035616386)
    

    取$\alpha = 0.01$, 因为 $pvalue > \alpha$, 所以无法拒绝原假设,即不能证明鸟喙长度发生了改变。

    中地雀鸟喙形状的变化

    
    ax = fortis1975.plot.scatter(x='length', y='depth', color='Blue', label='1975')
    fortis2012.plot.scatter(x='length', y='depth', color='Red', label='2012', ax=ax)
    plt.legend(loc='upper left')
    
    <matplotlib.legend.Legend at 0x1182fc240>
    
    output_52_1.png
    
    np.corrcoef(fortis1975.length, fortis1975.depth)[0,1]
    
    0.82123033856315231
    
    
    np.corrcoef(fortis2012.length, fortis2012.depth)[0,1]
    
    0.72342938117020117
    
    
    ratio1975 = fortis1975.length / fortis1975.depth
    ratio2012 = fortis2012.length / fortis2012.depth
    
    
    print(np.mean(ratio1975))
    print(np.mean(ratio2012))
    
    1.154557328563076
    1.2250642338241673
    
    
    scipy.stats.ttest_ind(ratio2012, ratio1975)
    
    Ttest_indResult(statistic=11.221158991741978, pvalue=7.7605361953834971e-26)
    

    结论,2012年相比1975年,鸟喙形状发生了改变。

    遗传数据分析

    数据探索

    
    herit = pd.read_csv('data/fortis_heritability.csv')
    
    herit.head()
    
    

    <div>
    <style>
    .dataframe thead tr:only-child th {
    text-align: right;
    }

    .dataframe thead th {
        text-align: left;
    }
    
    .dataframe tbody tr th {
        vertical-align: top;
    }
    

    </style>
    <table border="1" class="dataframe">
    <thead>
    <tr style="text-align: right;">
    <th></th>
    <th>species</th>
    <th>mid_offspr</th>
    <th>male_parent</th>
    <th>female_parent</th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <th>0</th>
    <td>fortis</td>
    <td>10.70</td>
    <td>10.90</td>
    <td>9.3</td>
    </tr>
    <tr>
    <th>1</th>
    <td>fortis</td>
    <td>9.78</td>
    <td>10.70</td>
    <td>8.4</td>
    </tr>
    <tr>
    <th>2</th>
    <td>fortis</td>
    <td>9.48</td>
    <td>10.70</td>
    <td>8.1</td>
    </tr>
    <tr>
    <th>3</th>
    <td>fortis</td>
    <td>9.60</td>
    <td>10.70</td>
    <td>9.8</td>
    </tr>
    <tr>
    <th>4</th>
    <td>fortis</td>
    <td>10.27</td>
    <td>9.85</td>
    <td>10.4</td>
    </tr>
    </tbody>
    </table>
    </div>

    herit.info()
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 413 entries, 0 to 412
    Data columns (total 4 columns):
    species          413 non-null object
    mid_offspr       413 non-null float64
    male_parent      413 non-null float64
    female_parent    413 non-null float64
    dtypes: float64(3), object(1)
    memory usage: 13.0+ KB
    
    
    np.corrcoef(herit.mid_offspr, herit.male_parent)[0,1]
    
    0.52168902179704335
    
    
    np.corrcoef(herit.mid_offspr, herit.female_parent)[0,1]
    
    0.58770631611267576
    
    
    herit['mid_parent'] = (herit.male_parent + herit.female_parent) / 2
    herit.head()
    

    <div>
    <style>
    .dataframe thead tr:only-child th {
    text-align: right;
    }

    .dataframe thead th {
        text-align: left;
    }
    
    .dataframe tbody tr th {
        vertical-align: top;
    }
    

    </style>
    <table border="1" class="dataframe">
    <thead>
    <tr style="text-align: right;">
    <th></th>
    <th>species</th>
    <th>mid_offspr</th>
    <th>male_parent</th>
    <th>female_parent</th>
    <th>mid_parent</th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <th>0</th>
    <td>fortis</td>
    <td>10.70</td>
    <td>10.90</td>
    <td>9.3</td>
    <td>10.100</td>
    </tr>
    <tr>
    <th>1</th>
    <td>fortis</td>
    <td>9.78</td>
    <td>10.70</td>
    <td>8.4</td>
    <td>9.550</td>
    </tr>
    <tr>
    <th>2</th>
    <td>fortis</td>
    <td>9.48</td>
    <td>10.70</td>
    <td>8.1</td>
    <td>9.400</td>
    </tr>
    <tr>
    <th>3</th>
    <td>fortis</td>
    <td>9.60</td>
    <td>10.70</td>
    <td>9.8</td>
    <td>10.250</td>
    </tr>
    <tr>
    <th>4</th>
    <td>fortis</td>
    <td>10.27</td>
    <td>9.85</td>
    <td>10.4</td>
    <td>10.125</td>
    </tr>
    </tbody>
    </table>
    </div>

    
    np.corrcoef(herit.mid_offspr, herit.mid_parent)[0,1]
    
    0.72834123955184848
    
    
    herit.plot.scatter(x='mid_parent', y='mid_offspr')
    
    <matplotlib.axes._subplots.AxesSubplot at 0x1179942e8>
    
    output_68_1.png

    推断是否有显著的遗传相关性

    def draw_bs_pairs(x, y, func, size=1):
        """对配对数据使用bootstrap方法"""
    
        
        inds = np.arange(len(x))
    
        
        bs_replicates = np.empty(size)
    
        
        for i in range(size):
            bs_inds = np.random.choice(inds, len(inds))
            bs_x, bs_y = x[bs_inds], y[bs_inds]
            bs_replicates[i] = func(bs_x, bs_y)
    
        return bs_replicates
    
    
    def pearson_r(x, y):
        """计算皮尔逊相关系数"""
        corr_mat = np.corrcoef(x,y)
        return corr_mat[0,1]
    
    
    bs_replicates = draw_bs_pairs(herit.mid_offspr, herit.mid_parent, pearson_r, 1000)
    
    
    conf_int = np.percentile(bs_replicates, [2.5, 97.5])
    
    
    print(conf_int)
    
    [ 0.67140317  0.77933128]
    

    皮尔逊相关系数的计算公式:

    $$ \rho = \frac{cov(x,y)}{\sigma_x \sigma_y} $$

    但是,遗传的相关性,应该只依赖于父母,而非儿女,所以要修改该公式:

    $$ \rho = \frac{cov(x,y)}{\sigma_x \sigma_x} = \frac{cov(x,y)}{var_x}$$

    
    def heritability(parents, offspring):
        """计算遗传相关性系数"""
        covariance_matrix = np.cov(parents, offspring)
        return covariance_matrix[0,1] / covariance_matrix[0,0]
    
    herit_fortis = heritability(herit.mid_parent, herit.mid_offspr)
    print(herit_fortis)
    
    0.722905191144
    
    bs_replicates = draw_bs_pairs(herit.mid_offspr, herit.mid_parent, heritability, 1000)
    conf_int = np.percentile(bs_replicates, [2.5, 97.5])
    print( conf_int)
    
    [ 0.65984576  0.8041422 ]
    
    ##完全没有懂。
    
    
    
    
    
    
    

    相关文章

      网友评论

        本文标题:第七次尝试

        本文链接:https://www.haomeiwen.com/subject/lrtepxtx.html