package bl.test.spark
import org.apache.spark.sql.SparkSession
/**
 * Demonstrates common DataFrame operations: building a DataFrame from an RDD of
 * text lines, displaying rows, selecting, filtering, sorting, aliasing and joining.
 *
 * Each input line is expected to have the form `id|name|phone|email`.
 */
object DataFrameCase {

  import org.apache.spark.sql.DataFrame
  import scala.util.Try

  /** Input file used when no path is passed on the command line. */
  private val DefaultInputPath = "file:///Users/rocky/data/student.data"

  /**
   * Runs the demo.
   *
   * @param args optional; `args(0)` overrides the input file path
   *             (defaults to [[DefaultInputPath]])
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val spark = SparkSession.builder().appName("DataFrameRDDApp").master("local[2]").getOrCreate()

    // Stop the session even when one of the actions below throws.
    try {
      // RDD ==> DataFrame
      val studentDF = loadStudents(spark, inputPath)

      // show() displays only the first 20 rows by default
      studentDF.show()
      studentDF.show(30)
      // second argument false: do not truncate long cell values
      studentDF.show(30, truncate = false)

      // take/first/head return rows to the driver; print them so the result is visible
      // first 10 rows
      studentDF.take(10).foreach(println)
      // first row
      println(studentDF.first())
      // first 3 rows
      studentDF.head(3).foreach(println)

      // first 30 rows of the email column, untruncated
      studentDF.select("email").show(30, truncate = false)

      // rows whose name is empty, or is empty / the literal text 'NULL'
      studentDF.filter("name=''").show()
      studentDF.filter("name='' OR name='NULL'").show()

      // rows whose name starts with 'M' (SUBSTR positions are 1-based)
      studentDF.filter("SUBSTR(name,1,1)='M'").show()

      // sort by name: ascending by default, then descending
      studentDF.sort(studentDF("name")).show()
      studentDF.sort(studentDF("name").desc).show()

      // sort by name, then id
      studentDF.sort("name", "id").show()

      // sort by name ascending, id descending
      studentDF.sort(studentDF("name").asc, studentDF("id").desc).show()

      // select with a column alias
      studentDF.select(studentDF("name").as("student_name")).show()

      // inner join of two independently built DataFrames on id
      val studentDF2 = loadStudents(spark, inputPath)
      studentDF.join(studentDF2, studentDF.col("id") === studentDF2.col("id")).show()
    } finally {
      spark.stop()
    }
  }

  /**
   * Reads the text file at `path` and converts every well-formed line into a
   * [[Student]] row. Lines that cannot be parsed are skipped.
   */
  private def loadStudents(spark: SparkSession, path: String): DataFrame = {
    // Implicit conversions are required for toDF()
    import spark.implicits._
    spark.sparkContext.textFile(path).flatMap(line => parseStudent(line)).toDF()
  }

  /**
   * Parses one `id|name|phone|email` line.
   *
   * @return `Some(student)` when the line has at least four fields and a numeric id,
   *         `None` otherwise
   */
  private def parseStudent(line: String): Option[Student] =
    // limit -1 keeps trailing empty fields, so a record with an empty email still has 4 parts
    line.split("\\|", -1) match {
      case Array(id, name, phone, email, _*) =>
        Try(id.trim.toInt).toOption.map(Student(_, name, phone, email))
      case _ => None
    }

  /** One student record; field order matches the columns of the input file. */
  case class Student(id: Int, name: String, phone: String, email: String)
}
// Reader comments (appears to be a page footer left over from the web source; not part of the program)