Spark reads a GBK file as garbled text
Spark's textFile method is hard-coded to decode input as UTF-8, so reading a file in any other encoding produces garbled text, as the code below shows:
import org.apache.spark.sql.SparkSession

object Test2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PowerLeo")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    val sc = spark.sparkContext
    // textFile always decodes lines as UTF-8, so this GBK file prints as garbled text
    val fileRdd = sc.textFile("file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT")
    fileRdd.foreach(println(_))
    spark.stop()
  }
}
Result screenshot:
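The root cause: textFile is roughly a hadoopFile call whose Hadoop Text values are turned into Strings with Text.toString, and Text.toString always decodes the backing bytes as UTF-8. A minimal sketch of that equivalent shape (reusing the sc and path from the code above):

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat

// Roughly what sc.textFile(path) does internally: the Text value carries the raw line bytes,
// but toString decodes them as UTF-8, which is where GBK content gets mangled.
val utf8Lines = sc
  .hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text])
  .map(pair => pair._2.toString)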
Spark solution
The idea is to stop relying on textFile's implicit UTF-8 decoding: read the file with hadoopFile so the raw line bytes stay inside the Hadoop Text value, then build Strings from those bytes with the charset the file actually uses (GBK here).
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileSplit, InputSplit, TextInputFormat}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.{HadoopRDD, RDD}
import org.apache.spark.sql.SparkSession

object Test2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PowerLeo")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    val sc = spark.sparkContext
    //val fileRdd = sc.textFile("file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT")
    //val fileRdd = transfer(sc, "file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT")
    val fileRdd = loadFileToRdd(sc, "file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT")
    fileRdd.foreach(println(_))
    spark.stop()
  }

  // Simple variant: read via hadoopFile and decode each line's raw bytes as GBK
  def transfer(sc: SparkContext, path: String): RDD[String] = {
    sc.hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], 1)
      .map(p => new String(p._2.getBytes, 0, p._2.getLength, "GBK"))
  }

  /**
   * Load a text file with an explicit encoding, keeping the source file name for each line.
   *
   * @param sc       SparkContext
   * @param path     path of the file(s) to read
   * @param encoding charset of the file, GBK by default
   * @return RDD of (file name, decoded line content, 1)
   */
  def loadFileToRdd(sc: SparkContext, path: String, encoding: String = "GBK"): RDD[(String, String, Int)] = {
    sc.hadoopFile[LongWritable, Text, TextInputFormat](path)
      .asInstanceOf[HadoopRDD[LongWritable, Text]]
      .mapPartitionsWithInputSplit((inputSplit: InputSplit, iterator: Iterator[(LongWritable, Text)]) => {
        val file = inputSplit.asInstanceOf[FileSplit]
        // Text holds the raw bytes of each line; decode them with the requested charset
        iterator.filter(x => x._2 != null).map(x => {
          (file.getPath.getName, new String(x._2.getBytes, 0, x._2.getLength, encoding), 1)
        })
      })
  }
}
Run screenshot:
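If only the decoded line text is needed, the (file name, line, 1) tuples returned by loadFileToRdd can be projected down; the trailing 1 also makes per-file line counts easy. A minimal usage sketch, placed inside main after sc is created (the names gbkLines and lineCounts are just for illustration, and the path is the same sample path used above):

// Keep just the decoded GBK lines
val gbkLines: RDD[String] =
  loadFileToRdd(sc, "file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT")
    .map(_._2)
gbkLines.take(10).foreach(println)

// Or count lines per file using the (fileName, 1) pairs
val lineCounts = loadFileToRdd(sc, "file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT")
  .map(t => (t._1, t._3))
  .reduceByKey(_ + _)
lineCounts.collect().foreach(println)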