美文网首页
用Python 实现 POI类别与人口密度相关性

用Python 实现 POI类别与人口密度相关性

作者: 机智的柠檬 | 来源:发表于2019-07-18 11:32 被阅读0次

    基于上一篇文章,python实现TF-IDF算法https://www.jianshu.com/p/d9c303c29a67

    • 将POI的类别抽象为文档中的词
    • 划分的区域 抽象为 文档
      则可利用TFIDF的值 计算 与 人口密度的协方差 ,计算出二者的相关性,相关系数越大,则该类别对人口的影响越大 ,后续则可利用得出的结论 研究分析更多的问题。
      下面将给出用python实现TFIDF与人口密度的相关系数
    image.png

    TLij为相应的TFIDF值
    PDj为第j区域的人口密度值

    用python实现该公式:

    def relative(tf_idf,idf,area):
        #N是第i类所占的分区数
        rou = {}
        for m in idf:
            #tl_pd 是(第i类所在分区的tf_idf值)与(第i类所在分区的栅格值)的乘积 后的求和  
            #sum_2 是 (第i类所在分区的tf_idf值)的平方的求和
            #sum 是 第 (第i类所在分区的tf_idf值)的求和
            #PD 是第i类所在的分区的栅格值的求和
            #PD_2 是第i类所在的分区的栅格值的平方的求和
            N = 0 
            tl_pd = 0
            sum_2 = 0
            sum = 0
            PD = 0
            PD_2 = 0
            for l in tf_idf:
                if m in tf_idf[l].keys():
                    N += 1
            for l in tf_idf:
                if m in tf_idf[l].keys():
                    sum_2+=tf_idf[l][m]*tf_idf[l][m]
                    tl_pd+=tf_idf[l][m]*area[l]
                    sum+=tf_idf[l][m]
    
            for l in tf_idf:
                if m in tf_idf[l].keys():
                    PD += area[l]
                    PD_2 += area[l]*area[l]
            fenzi = N*tl_pd-sum*PD
            fenmu = (N*sum_2-sum*sum)**0.5- (N*PD_2-PD*PD)**0.5
            poi_md = fenzi/fenmu
            rou[m] = poi_md
         
        return rou
    
    

    完整的代码如下:
    输入数据为:

    data1.json
    {
      "id":POI的编号
      "type":POI的类别
      "number":POI所属区域的区域编号
    }
    area.json
    {
      "区域编号":区域人口值
    }
    

    输出为:

    [
      ["h", 232.7468753744591],
      ["f",511.4907105256037],
      ["e",614.9901766893532],
      ["g",697.3614562757045],
      ["d",812.2365216229458],
      ["b",878.0307964717691],
      ["a",1137.981560137959],
      ["c",1448.4753152430358]
    ]
    a b c d e f g h 分别代表不同类别的POI
    
    import csv
    import json 
    import math
    
    filename = "data1.json"
    # 处理基础数据 转换成字典 
    #     key为分区代码  相应的value为嵌套字典
    #         键为某一分区内 POI的类型代码 相应的值为该类POI出现的次数
    def data_parse(filename):
        number_counts = {}
        with open(filename,'r',encoding='utf-8') as f:
            data = json.load(f)
            for l in data:
                if l["number"] not in number_counts.keys():
                    number_counts[l["number"]] = {}
                    if l["type"] not in number_counts[l["number"]].keys():
                        number_counts[l["number"]][l["type"]] = 1             
                else:
                    if l["type"] not in number_counts[l["number"]].keys():
                        number_counts[l["number"]][l["type"]] = 1
                    else:
                        number_counts[l["number"]][l["type"]] += 1
        return number_counts
    
    #计算TF值(TF=当前分区含有的I类POI数目/当前分区含有的POI数目)
    def tf(number_counts):
        for m in number_counts:
            fenmu = 0
            for j in number_counts[m]:
                fenmu += number_counts[m][j]
            for k in number_counts[m]:
                number_counts[m][k] = number_counts[m][k]/fenmu 
        return number_counts
    
    #计算IDF的值
    def idf(number_counts):
        idf = {"a":0,"b":0,"c":0,"d":0,"e":0,"f":0,"g":0,"h":0}
        for l in idf:
            count = 0
            D = 0
            for m in number_counts:
                D += 1
                if l in number_counts[m].keys():
                    count+=1
            idf[l] = math.log(D/count)
        return idf
    
    #计算TF-IDF的值
    def TFIDF(tf,idf):
        for m in tf:
            for k in idf:
                if k in tf[m].keys():
                    tf[m][k] = tf[m][k]*idf[k]
                    # for j in tf[m]:
                    #     print(m,k)
                    #     tf[m][j] = idf[k]*tf[m][j]
                    #     print(tf[m][j])
        return tf
    
    #将最终的TF值写入成json文件
    def writetojson(number_counts,filename):
        with open(filename,'a') as json_file:
            json.dump(number_counts, json_file, indent=2,ensure_ascii=False) 
    
    def read_area(filename):
        with open(filename,'r',encoding='utf-8') as f:
            area = json.load(f)
        return area
    
    def relative(tf_idf,idf,area):
        #N是第i类所占的分区数
        #tl_pd 是(第i类所在分区的tf_idf值)与(第i类所在分区的栅格值)的乘积 后的求和  
        #sum_2 是 (第i类所在分区的tf_idf值)的平方的求和
        #sum 是 第 (第i类所在分区的tf_idf值)的求和
        #PD 是第i类所在的分区的栅格值的求和
        #PD_2 是第i类所在的分区的栅格值的平方的求和
    
        rou = {}
        for m in idf:
            N = 0
            tl_pd = 0
            sum_2 = 0
            sum = 0
            PD = 0
            PD_2 = 0                 
            for l in tf_idf:
                if m in tf_idf[l].keys():
                    N += 1 
                if m in tf_idf[l].keys():
                    sum_2+=tf_idf[l][m]*tf_idf[l][m]
                    tl_pd+=tf_idf[l][m]*area[l]
                    sum+=tf_idf[l][m]
            for l in tf_idf:
                if m in tf_idf[l].keys():
                    PD += area[l]
                    PD_2 += area[l]*area[l]
            fenzi = N*tl_pd-sum*PD
            fenmu = (N*sum_2-sum*sum)**0.5- (N*PD_2-PD*PD)**0.5
            poi_md = fenzi/fenmu
            rou[m] = poi_md
        return rou
        
    if __name__ == '__main__':
        filename = "data1.json"
        number_counts = data_parse(filename)
        tf = tf(number_counts)
        idf = idf(number_counts)
        tf_idf = TFIDF(tf,idf)
        print(tf_idf)
        area = read_area('area.json',)   
        rou = relative(tf_idf,idf,area)
        rou=sorted(rou.items(),key=lambda x:x[1],reverse=False)
        writetojson(rou,"rou.json")
    

    相关文章

      网友评论

          本文标题:用Python 实现 POI类别与人口密度相关性

          本文链接:https://www.haomeiwen.com/subject/gliklctx.html