spark dataframe的基本统计.
做数据探索性分析的时候,有几个比较重要的数值,它们能简要的概括数据的分布情况,它们包括分位数、均值、最值、分位数等。
//加载数据 val data_detail_this = sqlContext.read.parquet("/fe/cateyes/tdid_poi/monthly/201902/*").sample(0.1,11L).coalesce(512)
import sqlContext.implicits._
val colNames =Seq("hours_7_9_1_days","hours_9_12_1_days","hours_12_14_1_days","hours_14_18_1_days","hours_18_20_1_days","hours_20_22_1_days",
"hours_22_7_1_days","hours_0_24_1_hours","hours_0_24_1_week","hours_0_24_1_days","hours_0_24_2_weeks","hours_0_24_2_days")
val cols = colNames.map(f =>col(f).cast(DoubleType))
val dataDF = sqlRdd.select(cols: _*)
val describe = dataDF.describe()
// 常用的几个数值型变量类型
val numType =Array(DoubleType)
val tuples = dataDF.schema.map(sf => {
(sf.name, sf.dataType)
}).filter(tp => {
numType.contains(tp._2)
}).map(scm => {
val col = scm._1
val quantiles = dataDF.select(s"$col").distinct().stat.approxQuantile(col, Array(0.25, 0.5, 0.75), 0)
(col, quantiles)
})
val quantileDF = tuples.toDF("attribute", "quantiles")
.withColumn("q1", $"quantiles".getItem(0))
.withColumn("median", $"quantiles".getItem(1))
.withColumn("q3", $"quantiles".getItem(2))
.withColumn("IOR", $"q3" -$"q1")// 计算四分位差
.drop("quantiles")
quantileDF.show()
val schema = describe.schema
val longForm = describe.flatMap(row => {
val metric = row.getString(0)
(1 until row.length).map(i => {
(metric, schema(i).name, row.getString(i).toDouble)
})
}).toDF("summary", "attribute", "values")
// 列联表
val dataset:DataFrame = longForm.groupBy($"attribute").pivot("summary").agg(first("values"))
quantileDF.join(dataset, "attribute").drop("count").write.csv(s"/fe/user/zccao/out/$month/sql20190320")

网友评论