Straight to the code. The goal is to group records by key and compute the average value for each key.
import org.apache.spark.sql.{Encoder, Encoders, Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val spark = SparkSession
  .builder
  .appName("InterfaceMonitor")
  //.master("local[2]")
  .getOrCreate()
// Session time zone (a Java zone ID; "Asia/Shanghai" would also work here)
spark.conf.set("spark.sql.session.timeZone", "UTC+8")

// Read the JSON source; .json() already implies format("json")
val df = spark.read.json("file:///e:/a.json")

// Build an Encoder[Row] for an all-string output schema
def encoder(columns: Seq[String]): Encoder[Row] =
  RowEncoder(StructType(columns.map(StructField(_, StringType, nullable = false))))

val outputCols = Seq("ckey", "avg")

// Group by the string key in column 0, then average the Long value in column 1
// (assumes that ordering in the inferred schema of a.json).
// Encoders.STRING is the built-in string encoder and is preferable to Encoders.kryo[String] here.
val result = df
  .groupByKey(_.getString(0))(Encoders.STRING)
  .flatMapGroups { (key, rowsForEach) =>
    var sum = 0L
    var count = 0
    for (elem <- rowsForEach) {
      sum += elem.getLong(1)
      count += 1
    }
    // floating-point division so the average is not truncated
    val avg = sum.toDouble / count
    Seq(Row(key, avg.toString))
  }(encoder(outputCols))
  .toDF()

result.show(10)
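For comparison, the same per-key average can also be expressed with Spark's built-in DataFrame aggregation, which avoids the custom Row encoder and the manual loop. This is a minimal sketch, assuming the key column in a.json is named ckey and the numeric column is named cost (the original code only accesses them by position):

import org.apache.spark.sql.functions.avg

// Built-in aggregation: group by the key column and average the assumed "cost" column
val builtIn = df.groupBy("ckey").agg(avg("cost").as("avg"))
builtIn.show(10)

This version keeps the average as a numeric (double) column rather than a string, and lets Spark run the aggregation through its optimized execution plan instead of executing user code per group.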