
Reading GBK-encoded files in Spark

Author: NikolasNull | Published 2019-08-22 14:39

Garbled output when Spark reads a GBK file

Spark's textFile method is hard-coded to decode files as UTF-8; reading a file in any other encoding produces garbled text, as the code below shows.

    
    import org.apache.spark.sql.SparkSession

    object Test2 {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession
          .builder
          .appName("PowerLeo")
          .master("local[*]")
          .enableHiveSupport()
          .getOrCreate()
        spark.sparkContext.setLogLevel("WARN")

        val sc = spark.sparkContext

        // textFile decodes every line as UTF-8, so a GBK file comes out garbled
        val fileRdd = sc.textFile("file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT")
        fileRdd.foreach(println(_))

        spark.stop()
      }
    }

Result (screenshot omitted): the console prints garbled characters instead of the file's contents.
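The garbling is baked in: textFile builds on TextInputFormat, and Hadoop's Text.toString always decodes a line's bytes as UTF-8 regardless of the file's real encoding. A minimal plain-Scala sketch of the same mis-decoding, with no Spark involved:

    object EncodingDemo {
      def main(args: Array[String]): Unit = {
        // "中文" encoded with GBK takes two bytes per character
        val gbkBytes = "中文".getBytes("GBK")
        // Decoding those bytes as UTF-8 -- effectively what Text.toString does -- garbles them
        println(new String(gbkBytes, "UTF-8")) // prints replacement characters
        // Decoding with the matching charset recovers the text
        println(new String(gbkBytes, "GBK"))   // prints 中文
      }
    }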

The fix

Instead of textFile, call hadoopFile directly. That returns each line as a Hadoop Text whose raw bytes are untouched, so they can be decoded with whatever charset the file actually uses:

    
    import org.apache.hadoop.io.{LongWritable, Text}
    import org.apache.hadoop.mapred.{FileSplit, InputSplit, TextInputFormat}
    import org.apache.spark.SparkContext
    import org.apache.spark.rdd.{HadoopRDD, RDD}
    import org.apache.spark.sql.SparkSession

    object Test2 {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession
          .builder
          .appName("PowerLeo")
          .master("local[*]")
          .enableHiveSupport()
          .getOrCreate()
        spark.sparkContext.setLogLevel("WARN")

        val sc = spark.sparkContext

        //val fileRdd = sc.textFile("file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT")
        //val fileRdd = transfer(sc, "file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT")
        val fileRdd = loadFileToRdd(sc, "file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT")

        fileRdd.foreach(println(_))

        spark.stop()
      }

      /** Simple variant: decode each line's raw bytes as GBK. */
      def transfer(sc: SparkContext, path: String): RDD[String] = {
        sc.hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], 1)
          .map(p => new String(p._2.getBytes, 0, p._2.getLength, "GBK"))
      }

      /**
        * @param sc       the SparkContext
        * @param path     path of the file(s) to read
        * @param encoding charset of the file, GBK by default
        * @return (file name, decoded line content, 1)
        */
      def loadFileToRdd(sc: SparkContext, path: String, encoding: String = "GBK"): RDD[(String, String, Int)] = {
        sc.hadoopFile[LongWritable, Text, TextInputFormat](path)
          .asInstanceOf[HadoopRDD[LongWritable, Text]]
          .mapPartitionsWithInputSplit((inputSplit: InputSplit, iterator: Iterator[(LongWritable, Text)]) => {
            // The input split tells us which file this partition came from
            val file = inputSplit.asInstanceOf[FileSplit]
            iterator.filter(x => x._2 != null).map(x => {
              // Decode the raw bytes of the Hadoop Text with the requested charset
              (file.getPath.getName, new String(x._2.getBytes, 0, x._2.getLength, encoding), 1)
            })
          })
      }
    }
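loadFileToRdd tags each line with the name of the file it came from, which is useful when path points at a directory or a glob, and appends a constant 1 that a later count or reduce can consume. If only the text is needed, a small sketch of dropping the extra fields:

    // Keep only the decoded line content from the (fileName, line, 1) triples
    val lines = loadFileToRdd(sc, "file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT")
      .map(_._2)

The simpler transfer variant does the same decoding but returns plain lines without the file name; use it when the provenance of each line does not matter.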

Run result (screenshot omitted): the lines now print correctly.
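For delimited files there is also a DataFrame route: Spark's CSV reader accepts an encoding option, though read-side support depends on the Spark version and on options such as multiLine. A hedged sketch, with a hypothetical field delimiter:

    // Sketch only: assumes a Spark version whose CSV reader honors "encoding" on read
    val df = spark.read
      .option("encoding", "GBK")  // decode the input bytes as GBK
      .option("delimiter", "|")   // hypothetical field separator for the .DAT file
      .csv("file:///C:\\Users\\leo\\Desktop\\20190704\\bigdata_buy2_fq_2019_07_04.DAT")
    df.show(false)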
