Spark Dataset learning on udemy


Author: 赐我理由在披甲上阵 | Published 2019-06-20 17:26

    DataSet and DataFrame

    https://www.cnblogs.com/starwater/p/6841807.html

    DataFrame

    A DataFrame is basically a collection of Row objects; that's really all it is.
    Each Row can in turn contain any number of columns of information,
    which may be of whatever type you want.
    It behaves much like a database table:
    it has a schema, which lets Spark represent and process the data more efficiently.
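
    For instance, a minimal sketch of what "having a schema" means in practice, assuming a SparkSession named spark and a hypothetical people.json file with name and age fields:

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder.master("local[*]").getOrCreate()
    val df = spark.read.json("people.json")   // hypothetical input file
    df.printSchema()
    // root
    //  |-- age: long (nullable = true)
    //  |-- name: string (nullable = true)
    df.select("name").show()                  // column-oriented access, like SQL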
    

    DataSet

    A DataFrame is also a Dataset:
    DataFrame = Dataset[Row]

    A Dataset is, more generally, a set of structured data, not necessarily Rows. It can be of a specific type, e.g.
    Dataset[Person], Dataset[(String, Double)]

    A DataFrame's schema is inferred at run time;
    a Dataset's type is known and checked at compile time.
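
    A quick sketch of that difference, assuming a SparkSession named spark and the Student case class defined in the code below:

    import spark.implicits._

    val ds = Seq(Student("a", 100.0)).toDS()
    val doubled = ds.map(s => s.score * 2)   // field name and type checked at compile time

    val df = ds.toDF()
    df.select("scor")   // a typo here compiles fine and only fails at run time
                        // (AnalysisException when Spark analyzes the query)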
    
    import org.apache.spark.sql.{DataFrame, SparkSession}
    import org.apache.spark.sql.functions.max

    object SparkTest {

      case class Student(name: String, score: Double)

      // Parse one CSV line of the form "name,score" into a Student
      def parseLine(line: String): Student = {
        val fields = line.split(",")
        Student(fields(0), fields(1).toDouble)
      }

      // Print the highest score in the DataFrame
      def showTopGradeStudent(df: DataFrame): Unit = {
        val topGrade = df.agg(max("score")).collect()
        topGrade.foreach(println)
      }

      // Print the inferred schema and register the DataFrame as a SQL view
      def showOut(df: DataFrame): Unit = {
        df.printSchema()
        df.createOrReplaceTempView("students")
      }

      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder
          .appName("TopGrade")
          .master("local[*]")
          .getOrCreate()
        import spark.implicits._

        val lines = spark.sparkContext.textFile("../student.csv")
        val df = lines.map(parseLine).toDF().cache()
        showTopGradeStudent(df)
        showOut(df)

        spark.stop()
      }
    }
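
    Note on the aggregation: DataFrame.agg accepts either a typed column expression (with functions like max imported from org.apache.spark.sql.functions) or a Map from column name to aggregate function name. These two calls are equivalent:

    df.agg(max("score"))
    df.agg(Map("score" -> "max"))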
    
    

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.max

    object SparkTest {

      case class Student(name: String, score: Double)

      def parseLine(line: String): Student = {
        val fields = line.split(",")
        Student(fields(0), fields(1).toDouble)
      }

      def main(args: Array[String]): Unit = {

        val spark = SparkSession.builder
          .appName("TopGrade")
          .master("local[*]")
          .getOrCreate()
        import spark.implicits._

        val lines = spark.sparkContext.textFile("../student.csv")
        val df = lines.map(parseLine).toDF()
        val topGrade = df.agg(max("score")).collect()
        topGrade.foreach(println)

        // In-memory test data instead of a file:
        //   val data = Array("a,100", "b,100", "c,98", "d,78")
        //   val lines = spark.sparkContext.parallelize(data)

        // Untyped access on a DataFrame goes through Row:
        //   df.foreach(item => println(item.getAs[String]("name")))

        // Typed Dataset alternative -- fields are plain case-class members:
        //   val stdDS = lines.map(parseLine).toDS().cache()
        //   stdDS.foreach(item => println(item.name))

        spark.stop()
      }
    }
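
    The commented-out lines above sketch the typed Dataset route. A minimal runnable version of that path, under the same assumptions (in-memory test data, the same Student case class; the object name DatasetTest is made up for illustration):

    import org.apache.spark.sql.SparkSession

    object DatasetTest {

      case class Student(name: String, score: Double)

      def parseLine(line: String): Student = {
        val fields = line.split(",")
        Student(fields(0), fields(1).toDouble)
      }

      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder
          .appName("TopGradeDS")
          .master("local[*]")
          .getOrCreate()
        import spark.implicits._

        val data = Array("a,100", "b,100", "c,98", "d,78")
        val stdDS = spark.sparkContext.parallelize(data)
          .map(parseLine)
          .toDS()
          .cache()

        // Typed access: no Row, no getAs -- just case-class fields
        stdDS.foreach(item => println(item.name))

        // Top grade computed on the typed score column
        val top = stdDS.map(_.score).reduce((a, b) => math.max(a, b))
        println(s"Top grade: $top")

        spark.stop()
      }
    }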
    
