Common DataFrame Operations

Author: sparkle123 | Published 2018-03-06 02:01
    • View the built-in SQL functions from the spark-shell:
      spark.sql("show functions").show(1000)
      For example: SUBSTR(columnName,0,1)='B'
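      To check what a particular built-in function does, DESCRIBE FUNCTION works as well; a minimal sketch run in the same spark-shell session:
      spark.sql("show functions like 'sub*'").show(false)
      spark.sql("describe function substr").show(false)
      spark.sql("describe function extended substr").show(false)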

    • show, take, first, head (see the sketch below for what each returns)

    df.show(30, false)
    df.take(10)
    df.first()
    df.head(3)
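    What each of these returns, as a minimal sketch (assuming df is a DataFrame such as the one built in the source below):
    df.show(30, false)     // prints up to 30 rows without truncating columns, returns Unit
    val rows = df.take(10) // Array[Row], at most 10 rows collected to the driver
    val one  = df.first()  // Row, the first row (same as head())
    val top3 = df.head(3)  // Array[Row], the first 3 rows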
    
    • Display a selected column
      df.select("column").show(30,false)
    • Filter by condition
      df.filter("name='' OR name='NULL'").show
    • Sort by one or more columns in ascending or descending order (Column-expression variants of select, filter, and sort are sketched below)
      df.sort(df("name").desc).show
      df.sort(df("name").asc, df("id").desc).show
    
    • Alias a column
      df.select(df("name").as("student_name")).show
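      To rename a column while keeping the rest of the DataFrame, withColumnRenamed can be used instead; a minimal sketch:
      df.withColumnRenamed("name", "student_name").show()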

    • join
      df1.join(df2, df1.col("id") === df2.col("id")).show
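      Joining on the column name instead avoids a duplicated id column in the result, and other join types can be requested explicitly; a minimal sketch:
      df1.join(df2, Seq("id")).show()                  // inner join, single id column in the output
      df1.join(df2, Seq("id"), "left_outer").show()    // keep all rows of df1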

    • Source code:

    import org.apache.spark.sql.SparkSession
    
    object DataFrameCase {
    
      def main(args: Array[String]): Unit = {
    
        val spark = SparkSession.builder()
          .appName("DataFrameCase")
          .master("local[2]")
          .getOrCreate()
    
        val rdd = spark.sparkContext.textFile("C:\\Users\\Administrator\\IdeaProjects\\SparkSQLProject\\spark-warehouse\\student.data")
    
        import spark.implicits._
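        // each line of student.data is expected to be pipe-delimited: id|name|phone|email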
        val studentDF = rdd.map(_.split("\\|"))
          .map(line => Student(line(0).toInt,line(1),line(2),line(3)))
          .toDF()
    
        studentDF.show
        studentDF.show(30,false)
    
        studentDF.take(10)
        studentDF.first()
        studentDF.head(3)
    
        studentDF.select("email").show(30,false)
        
        studentDF.filter("name='' OR name='NULL'").show
        
        // students whose name starts with 'B'
        studentDF.filter("SUBSTR(name,0,1)='B'").show
        
        // sort (ascending by default)
        studentDF.sort(studentDF("name")).show
        studentDF.sort(studentDF("name").desc).show
    
        studentDF.sort("name","id").show
        
        studentDF.sort(studentDF("name").asc, studentDF("id").desc).show
        
        // rename a column with as (alias)
        studentDF.select(studentDF("name").as("student_name")).show
        
        
        val studentDF2 = rdd.map(_.split("\\|")).map(line => Student(line(0).toInt, line(1), line(2), line(3))).toDF()
    
        // inner join on id, using the === column-equality operator
        studentDF.join(studentDF2, studentDF.col("id") === studentDF2.col("id")).show
    
        spark.stop()
    
      }
    
       case class Student(id: Int, name: String, phone: String, email: String)
    
    }
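
    The SQL-string expressions used above (the filter conditions and SUBSTR) can also be run through spark.sql by registering the DataFrame as a temporary view; a minimal sketch, assuming the studentDF built in the source above:

    studentDF.createOrReplaceTempView("student")
    spark.sql("SELECT id, name FROM student WHERE SUBSTR(name,0,1)='B'").show()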
    
    
