美文网首页
【2019-12-10】 spark ALS 训练数据可视化数据

【2019-12-10】 spark ALS 训练数据可视化数据

作者: 6g3y | 来源:发表于2019-12-10 14:13 被阅读0次

    知乎关注主题列表

    package A
    
    import java.io.File
    import java.util
    
    import org.apache.commons.io.FileUtils
    import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
    import org.apache.spark.rdd.RDD
    import org.apache.spark.{SparkConf, SparkContext}
    
    object asd {

      /**
       * Trains an ALS recommendation model on a "user,item,rate" CSV of
       * Zhihu topic follow counts and saves it to "ALS.model".
       */
      def main(args: Array[String]): Unit = {
        val conf: SparkConf = new SparkConf().setMaster("local").setAppName("My App")
        val sc: SparkContext = new SparkContext(conf)

        val inputFile1: String = "topics/topic.csv"

        // Parse raw lines once into (user, item, rate) triples so the user-id
        // mapping and the training set share the same parse. Malformed rows
        // (not exactly 3 fields) are dropped instead of throwing MatchError.
        val triples = sc.textFile(inputFile1)
          .flatMap(_.split("\n"))
          .map(_.split(","))
          .collect { case Array(user, item, rate) => (user, item.toInt, rate.toDouble) }

        // BUG FIX: the original assigned user ids by mutating a driver-side
        // java.util.HashMap and a captured `var` inside an RDD closure. That
        // only happens to work in local mode — on a cluster each executor
        // mutates its own deserialized copy of the map, yielding inconsistent
        // ids. Build the mapping deterministically and broadcast it instead.
        val userIds = triples.map(_._1).distinct().zipWithIndex()
          .mapValues(idx => idx.toInt + 1) // keep ids 1-based like the original
          .collectAsMap()
        val userIdsB = sc.broadcast(userIds)

        // log(rate + e) compresses the heavy-tailed counts; rate = 0 maps to
        // log(e) = 1, so every rating stays strictly positive.
        val train = triples.map { case (user, item, rate) =>
          Rating(userIdsB.value(user), item, math.log(rate + math.E))
        }

        val model = new ALS()
          .setIterations(20)
          .setRank(60)
          .setLambda(0.005)
          .run(train)
        // NOTE: save fails if "ALS.model" already exists — remove it first.
        model.save(sc, "ALS.model")
        //    evaluateMode(train, model)
        sc.stop() // release the local Spark context before exiting
      }

      /**
       * Evaluates the model on the training ratings: prints the MSE in the
       * log-transformed space and dumps "user,item,actual,predicted" rows
       * (back-transformed to raw counts) to file "s" for plotting.
       *
       * @param ratings training ratings (rate already log(raw + e) transformed)
       * @param model   the trained matrix-factorization model
       */
      private def evaluateMode(ratings: RDD[Rating], model: MatrixFactorizationModel): Unit = {

        // (user, product) pairs to score.
        val userProducts = ratings.map {
          case Rating(user, product, rate) => (user, product)
        }

        // Predicted score keyed by (user, product).
        val predictions = model.predict(userProducts).map {
          case Rating(user, product, rate) => ((user, product), rate)
        }

        // Join actual and predicted scores on (user, product).
        val ratesAndPreds = ratings.map {
          case Rating(user, product, rate) =>
            ((user, product), rate)
        }.join(predictions)

        // Mean squared error in the log-transformed space.
        val MSE = ratesAndPreds.map {
          case ((user, product), (r1, r2)) =>
            val err = r1 - r2
            err * err
        }.mean()

        // BUG FIX: the forward transform is log(raw + e), so the inverse is
        // e^r - e, not e^r - 1 as the original wrote.
        val sb = new StringBuilder("0,1,2,3\n")
        ratesAndPreds.map {
          case ((user, product), (r1, r2)) =>
            user + "," + product + "," + (math.pow(math.E, r1) - math.E) + "," + (math.pow(math.E, r2) - math.E) + "\n"
        }.collect().foreach(sb ++= _) // collect(), not take(999999999)
        // Explicit charset: the no-charset FileUtils.write overload is deprecated.
        FileUtils.write(new File("s"), sb, "UTF-8", false)

        println(s"MSE = ${MSE}")
      }

    }
    
    

    回答数目预测 X是原回答数目
    Y是预计回答数目

    还是可以勉强看成线性回归的，拿来预测应该还是可以的。 Figure_1.png
    Figure_2.png
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd

    # CSV dumped by the Scala evaluateMode step; header row is "0,1,2,3"
    # (user, item, actual answer count, predicted answer count).
    CSV_FILE_PATH = 's'
    df = pd.read_csv(CSV_FILE_PATH)
    plt.figure(figsize=(7, 7))
    print(df)

    # Column '2' = actual answer count (X), column '3' = predicted (Y).
    x = df['2']
    y = df['3']
    # s is the marker size; alpha is the RGBA transparency component.
    # BUG FIX: give the series a label — the original called
    # plt.legend(loc='best') with no labeled artist, which warns
    # "No handles with labels found" and draws an empty legend.
    plt.scatter(x, y, s=2, alpha=0.3, label='actual vs predicted')
    plt.title('Scatter')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()
    
    
    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <groupId>asd</groupId>
        <artifactId>asd</artifactId>
        <version>1.0-SNAPSHOT</version>
    
    
        <repositories>
            <repository>
                <id>aliyunmaven</id>
                <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
            </repository>
        </repositories>
    
        <properties>
            <spark.version>2.4.4</spark.version>
            <scala.version>2.12</scala.version>
        </properties>
    
    
    
        <dependencies>
    
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-core_${scala.version}</artifactId>
                <version>${spark.version}</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-mllib_${scala.version}</artifactId>
                <version>${spark.version}</version>
            </dependency>
    
        </dependencies>
    
    </project>
    

    相关文章

      网友评论

          本文标题:【2019-12-10】 spark ALS 训练数据可视化数据

          本文链接:https://www.haomeiwen.com/subject/uycggctx.html