美文网首页
spark熵权法验证

spark熵权法验证

作者: 达微 | 来源:发表于2022-07-15 10:25 被阅读0次

    指标权重计算流程

    参考:https://www.zhihu.com/question/357680646/answer/1748591262

    1、归一化

    2、指标占比

    3、计算熵

    4、计算差异系数

    5、计算权重

    6、验证:权重和为1

    样例:WeightScoreTest.scala

    
    /**
     * One row of the evaluation table: a label followed by eight indicator scores.
     *
     * @param name label of the evaluated unit (first token of a text row)
     * @param x1   score of indicator 1
     * @param x2   score of indicator 2
     * @param x3   score of indicator 3
     * @param x4   score of indicator 4
     * @param x5   score of indicator 5
     * @param x6   score of indicator 6
     * @param x7   score of indicator 7
     * @param x8   score of indicator 8
     */
    case class Room(
      name: String,
      x1: Double,
      x2: Double,
      x3: Double,
      x4: Double,
      x5: Double,
      x6: Double,
      x7: Double,
      x8: Double
    )

    object Room {

      /**
       * Parses one whitespace-delimited text row into a [[Room]].
       *
       * Columns may be separated by any run of spaces or tabs, and leading/trailing
       * whitespace is ignored. Splitting on a single space would produce empty tokens
       * for rows padded with several spaces and make the numeric conversion fail.
       * Tokens after the ninth are ignored. The compiler-generated
       * `apply(name, x1, ..., x8)` stays available next to this overload.
       *
       * @param row text row of the form `name x1 x2 x3 x4 x5 x6 x7 x8`
       * @return the parsed row
       * @throws NumberFormatException          if a score token is not a valid number
       * @throws ArrayIndexOutOfBoundsException if the row has fewer than nine tokens
       */
      def apply(row: String): Room = {
        val tokens = row.trim.split("\\s+")
        def score(index: Int): Double = tokens(index).toDouble
        new Room(
          tokens(0),
          score(1), score(2), score(3), score(4),
          score(5), score(6), score(7), score(8)
        )
      }
    }
    object WeightScoreTest   {
    
    
      /**
       * Score table for 11 departments (rows A to K); the original note describes it as
       * nine overall nursing-care evaluation indicator scores per department.
       *
       * Each row is a label followed by whitespace-separated numeric scores.
       *
       * NOTE(review): every row carries nine numeric columns, but `Room.apply` reads only
       * tokens 1..8, so the ninth score is never loaded. Confirm whether that is intended.
       */
      val samples =
        """
          |A    100 90  100 84  90  100 100 100 100
          |B    100 100 78.6    100 90  100 100 100 100
          |C    75  100 85.7    100 90  100 100 100 100
          |D    100 100 78.6    100 90  100 94.4    100 100
          |E    100 90  100 100 100 90  100 100 80
          |F    100 100 100 100 90  100 100 85.7    100
          |G    100 100 78.6    100 90  100 55.6    100 100
          |H    87.5    100 85.7    100 100 100 100 100 100
          |I    100 100 92.9    100 80  100 100 100 100
          |J    100 90  100 100 100 100 100 100 100
          |K    100 100 92.9    100 90  100 100 100 100
          |
        """.stripMargin
       
       /**
        * Runs the entropy-weight demo on the embedded `samples` table and prints every
        * intermediate result to standard output.
        *
        * Steps:
        *  1. parse the sample rows into `Room`s and load them into a DataFrame;
        *  2. min-max normalise every indicator column using the `summary` statistics;
        *  3. compute each column's diversity coefficient with `calWeight`;
        *  4. divide each coefficient by their total so that the weights sum to 1.
        *
        * The Spark session is always stopped when the run ends, including on failure.
        */
       def start(): Unit = {
         val sparkConf = new SparkConf().setAppName("WeightScoreTest")
         sparkConf.setMaster("local[*]")
         // The builder creates the underlying SparkContext from the conf (app name and master).
         val spark = SparkSession.builder()
           .enableHiveSupport()
           .config(sparkConf)
           .getOrCreate()
         try {
           spark.udf.register("sumofsquares", new Sumofsquares())

           // Accept both line-ending styles: splitting on "\r\n" alone yields a single
           // unparsable row when the source file uses "\n".
           val rooms = samples.split("\\r?\\n").filter(_.trim.nonEmpty).map(Room(_)).toList
           require(rooms.nonEmpty, "samples must contain at least one data row")
           // Pass the Scala collection directly; no Java list round-trip is needed.
           val df = spark.createDataFrame(rooms)
           df.show(20)

           // Per-column statistics. Every cell of the summary frame is a string, so the
           // values are read as strings and parsed only where a number is required.
           val summary = df.summary("count", "mean", "max", "min", "stddev").cache()
           val features = summary.columns.filterNot(c => c == "summary" || c == "name")
           // Collect the five statistic rows once instead of running one job per statistic.
           val statRows = summary.collect().map(r => r.getAs[String]("summary") -> r).toMap
           def statOf(stat: String): Map[String, String] =
             statRows(stat).getValuesMap[String](features)
           val feaMaxMap = statOf("max")
           val feaMinMap = statOf("min")
           summary.show(20)
           val feaMeanMap = statOf("mean")
           val feaStdMap = statOf("stddev")

           // A column whose max equals its min has a zero range: min-max scaling would
           // divide by zero (null in Spark SQL). Such a column carries no information.
           val constantFeatures: Set[String] =
             features.filter(f => feaMaxMap(f).toDouble == feaMinMap(f).toDouble).toSet

           // Min-max normalisation. The statistics are spliced in as their original text;
           // the extra parentheses keep a negative minimum from reading as "- -x".
           val df2 = df.selectExpr(features.map { f =>
             val maxVal = feaMaxMap(f)
             val minVal = feaMinMap(f)
             if (constantFeatures(f)) s"cast(0 as double) as $f"
             else s"($f - ($minVal))/(($maxVal) - ($minVal)) as $f"
           }: _*)
           df2.show(20)

           // Diversity coefficient per indicator; constant indicators get 0 (no information).
           val diverse = features.map(f => if (constantFeatures(f)) 0.0 else calWeight(df2, f))
           val total = diverse.sum
           // Divide by the exact total so the weights sum to 1. If no indicator carries
           // any information, fall back to equal weights instead of dividing by zero.
           val weights =
             if (total > 0.0) diverse.map(_ / total)
             else diverse.map(_ => 1.0 / diverse.length)
           println(s"weights,list:${JSON.toJSONString(weights,false)}")
           println(s"weights,sum:${weights.sum}")

           println("features:"+JSON.toJSONString(features,false))
           println("summary = 'max':"+JSON.toJSONString(feaMaxMap.asJava,false))
           println("summary = 'min':"+JSON.toJSONString(feaMinMap.asJava,false))
           println("summary = 'mean':"+JSON.toJSONString(feaMeanMap.asJava,false))
           println("summary = 'stddev':"+JSON.toJSONString(feaStdMap.asJava,false))
         } finally {
           // Release the local Spark resources (and the cached summary) even on failure.
           spark.stop()
         }
       }
    
      /**
       * Computes the diversity coefficient (1 - entropy) of one column, the un-normalised
       * entropy-method weight of that indicator.
       *
       * With n values v_i and p_i = |v_i| / sum(|v|), the entropy is
       * E = -(1 / ln n) * sum(p_i * ln p_i), using the convention 0 * ln 0 = 0, and the
       * result is 1 - E. Prints the entropy, and a warning when the coefficient is negative.
       *
       * A column with at most one value, or whose values are all zero, has no usable
       * distribution and yields 0.0 rather than NaN/Infinity or a spurious maximum.
       *
       * @param dataDF frame holding the (already normalised) indicator columns
       * @param field  name of the column to evaluate; each cell is read through its string
       *               form and must parse as a number
       * @return the diversity coefficient of the column
       */
      def calWeight(dataDF: DataFrame, field: String): Double = {
        // Magnitudes are used for both numerator and denominator so the proportions sum to 1.
        // The RDD is traversed three times below, so cache it to avoid recomputing the lineage.
        val magnitudes = dataDF.rdd
          .map(row => math.abs(row.getAs[Any](field).toString.toDouble))
          .cache()
        try {
          val n = magnitudes.count()
          val total = magnitudes.sum()
          if (n <= 1 || total <= 0.0) {
            // ln(1) = 0 and a zero total both make the entropy undefined.
            0.0
          } else {
            val scalar = -1.0 / math.log(n.toDouble)
            val Ej = magnitudes.map { v =>
              val p = v / total
              // 0 * ln 0 is taken as 0; no epsilon is added, so non-zero terms are unbiased.
              if (p > 0.0) p * math.log(p) else 0.0
            }.sum() * scalar
            println(s"sum:${Ej}")
            if (1 - Ej < 0) println("差异系数为负数")
            1 - Ej // diversity coefficient
          }
        } finally {
          magnitudes.unpersist()
        }
      }
    
      /** Program entry point: ignores the command-line arguments and runs the demo via `start()`. */
      def main(args: Array[String]): Unit = start()
    

    计算结果

     /**
      * Dataset exploration: captured summary table with the count, mean, max, min and
      * stddev of every column (the `name` column has no mean/stddev, hence `null`).
      * Presumably the console output of `summary.show(20)` in `start()`; verify against a run.
      */
     val summary=
          """
            |+-------+----+----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-----------------+
            ||summary|name|              x1|               x2|               x3|               x4|               x5|                x6|                x7|               x8|
            |+-------+----+----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-----------------+
            ||  count|  11|              11|               11|               11|               11|               11|                11|                11|               11|
            ||   mean|null|96.5909090909091|97.27272727272727|90.27272727272727|98.54545454545455|91.81818181818181|  99.0909090909091| 95.45454545454545|             98.7|
            ||    max|   K|           100.0|            100.0|            100.0|            100.0|            100.0|             100.0|             100.0|            100.0|
            ||    min|   A|            75.0|             90.0|             78.6|             84.0|             80.0|              90.0|              55.6|             85.7|
            || stddev|null|8.08337238353579|4.670993664969138|9.180750613004465|4.824181513244218|6.030226891555272|3.0151134457776365|13.324591073377345|4.311612227462018|
            |+-------+----+----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+-----------------+
            |
          """.stripMargin
         /**
          * Standardised indicator matrix: captured table of the min-max normalised
          * columns x1..x8, one row per sample, every value within [0, 1] up to rounding.
          * Presumably the console output of `df2.show(20)` in `start()`; verify against a run.
          */
         val standString =
           """
             |+---+---+-------------------+---+---+---+-----------------+------------------+
             || x1| x2|                 x3| x4| x5| x6|               x7|                x8|
             |+---+---+-------------------+---+---+---+-----------------+------------------+
             ||1.0|0.0| 1.0000000000000002|0.0|0.5|1.0|              1.0|0.9999999999999998|
             ||1.0|1.0|                0.0|1.0|0.5|1.0|              1.0|0.9999999999999998|
             ||0.0|1.0|0.33177570093457986|1.0|0.5|1.0|              1.0|0.9999999999999998|
             ||1.0|1.0|                0.0|1.0|0.5|1.0|0.873873873873874|0.9999999999999998|
             ||1.0|0.0| 1.0000000000000002|1.0|1.0|0.0|              1.0|0.9999999999999998|
             ||1.0|1.0| 1.0000000000000002|1.0|0.5|1.0|              1.0|               0.0|
             ||1.0|1.0|                0.0|1.0|0.5|1.0|              0.0|0.9999999999999998|
             ||0.5|1.0|0.33177570093457986|1.0|1.0|1.0|              1.0|0.9999999999999998|
             ||1.0|1.0| 0.6682242990654211|1.0|0.0|1.0|              1.0|0.9999999999999998|
             ||1.0|0.0| 1.0000000000000002|1.0|1.0|1.0|              1.0|0.9999999999999998|
             ||1.0|1.0| 0.6682242990654211|1.0|0.5|1.0|              1.0|0.9999999999999998|
             |+---+---+-------------------+---+---+---+-----------------+------------------+
           """.stripMargin
    
          /**
           * Captured console output of the weight computation: the eight indicator weights,
           * their sum, the feature names, and the max/min/mean/stddev maps keyed by feature.
           * The recorded sum is slightly below 1 because the run that produced it added
           * 1e-6 to the normalising denominator.
           */
          val weightString =
            """
              |weights,list:[0.08110818342879658,0.23453511631130167,0.2904122876143106,0.07019980217824295,0.11258311168727239,0.07019980217824295,0.07076012846443831,0.07019980217824295]
              |weights,sum:0.9999982340408485
              |features:["x1","x2","x3","x4","x5","x6","x7","x8"]
              |summary = 'max':{"x8":"100.0","x3":"100.0","x7":"100.0","x2":"100.0","x5":"100.0","x6":"100.0","x1":"100.0","x4":"100.0"}
              |summary = 'min':{"x8":"85.7","x3":"78.6","x7":"55.6","x2":"90.0","x5":"80.0","x6":"90.0","x1":"75.0","x4":"84.0"}
              |summary = 'mean':{"x8":"98.7","x3":"90.27272727272727","x7":"95.45454545454545","x2":"97.27272727272727","x5":"91.81818181818181","x6":"99.0909090909091","x1":"96.5909090909091","x4":"98.54545454545455"}
              |summary = 'stddev':{"x8":"4.311612227462018","x3":"9.180750613004465","x7":"13.324591073377345","x2":"4.670993664969138","x5":"6.030226891555272","x6":"3.0151134457776365","x1":"8.08337238353579","x4":"4.824181513244218"}
            """.stripMargin
    

    打分原理

    根据分布情况求累积概率
    累积概率(cumulative probability)指随机变量取值不超过给定值 x 的概率,即 P(X <= x)。
    正向指标 发生概率越大,分数越高
    反向指标 发生概率越小,分数越高
    
    ```
    import org.apache.commons.math3.distribution.{ExponentialDistribution, NormalDistribution}
    NormalDist => new NormalDistribution(平均值, 标准差)
    ExponDist => new ExponentialDistribution(平均值)
    {if(-1 == effect) (1 - dist.cumulativeProbability(indexVal)) * 100
    else 100 * dist.cumulativeProbability(indexVal)}.formatted("%.2f").toFloat
    ```
    

    打分说明

    反向指标打分:
    求疲劳驾驶发生次数小于等于10次/100km的概率P(X <= 10) ,该指标分数 (1-p)*100
    正向指标打分:
    经济速度占比、经济负载占比等:求指标取值小于等于当前值的概率 p = P(X <= x),该指标分数 p*100

    总分以及二级指标处理

    总分:各个指标的分数*指标权重 相加
    二级指标:分数 * ( 权重占比 即 权重/二级权重之和 ) 相加

    相关文章

      网友评论

          本文标题:spark熵权法验证

          本文链接:https://www.haomeiwen.com/subject/ufbqirtx.html