Spark MLlib Feature Extraction

Author: Codlife | Published 2016-08-16 09:38

Reference: the official Spark MLlib documentation.

package lean_mllib

//import breeze.linalg.PCA
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.Vectors

/**
  * Created by wjf on 2016/8/15.
  */
object TestCountVector {

  // Shared SparkSession provided by a project helper object (a sketch follows the code).
  val spark = MLLibConf.spark

  def main(args: Array[String]): Unit = {
  /*  val df = spark.createDataFrame(Seq(
      (0, Array("a", "b", "C", "c")),
      (1, Array("a", "a", "b", "b", "c", "C"))
    )).toDF("id", "words")

    // Fit a CountVectorizer: keep at most 3 vocabulary terms, each of which
    // must appear in at least 2 documents.
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    // Alternatively, build a model directly from an a-priori vocabulary.
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    cvModel.transform(df).select("features", "words").show(false)  */

    // Each demo below exercises one MLlib feature transformer; uncomment to run.
//    removeStopWord()
//    nGram()
//    testPCA()
//    testPolynomialExpansion()
//    testDiscreteCosineTransform()
//    testStringIndexer()
    testOneHotEncoder()

  }


  // StopWordsRemover filters common stop words ("the", "a", "had", ...) out of a token sequence.
  def removeStopWord(): Unit = {
    val remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered")
    val dataSet = spark.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb"))
    )).toDF("id", "raw")
    remover.transform(dataSet).show()
  }

  // NGram turns a token sequence into a sequence of space-joined n-grams.
  def nGram(): Unit = {
    val wordDataFrame = spark.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat"))
    )).toDF("label", "words")

    val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
    // n must not exceed the sentence length, or the output is empty; n = 2 yields bigrams.
    val ngramDataFrame = ngram.setN(2).transform(wordDataFrame)
    // The ngrams column is an array column, so read it back as a Seq[String].
    ngramDataFrame.take(3).map(_.getAs[Seq[String]]("ngrams").toList).foreach(println)
  }
  // PCA projects the 5-dimensional vectors onto their first principal component (k = 1).
  def testPCA(): Unit = {
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    // Tuple1 yields a single-column DataFrame, so only one column name can be supplied.
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    df.take(10).foreach(println)
    val pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(1).fit(df)
    val pcaDF = pca.transform(df)
    pcaDF.take(10).foreach(println)
    val result = pcaDF.select("pcaFeatures")
    result.show()
  }
  // PolynomialExpansion expands each 2-dimensional vector into all monomials up to degree 3.
  def testPolynomialExpansion(): Unit = {
    val data = Array(
      Vectors.dense(-2.0, 2.3),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(0.6, -1.1)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    val polynomialExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)
    val polyDF = polynomialExpansion.transform(df)
    polyDF.select("polyFeatures").take(3).foreach(println)
  }

  // DCT applies a discrete cosine transform to each vector; setInverse(false) selects
  // the forward transform.
  def testDiscreteCosineTransform(): Unit = {
    val data = Seq(
      Vectors.dense(0.0, 1.0, -2.0, 3.0),
      Vectors.dense(-1.0, 2.0, 4.0, -7.0),
      Vectors.dense(14.0, -2.0, -5.0, 1.0)
    )
    data.foreach(println)

    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    // take(n) collects rows to the driver; if n is too large the driver may run out of memory.
    df.take(10).foreach(println)
    val dct = new DCT().setInputCol("features").setOutputCol("featuresDCT").setInverse(false)

    val dctDF = dct.transform(df)

    dctDF.select("featuresDCT").show(3)
  }


  // StringIndexer maps string labels to numeric indices ordered by frequency;
  // IndexToString reverses the mapping using the metadata on the indexed column.
  def testStringIndexer(): Unit = {
    val df = spark.createDataFrame(Seq(
      (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")
    )).toDF("id", "category")

    df.take(6).foreach(println)
    val indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex").fit(df)

    val indexed = indexer.transform(df)
    indexed.take(6).foreach(println)

    val converter = new IndexToString().setInputCol("categoryIndex").setOutputCol("originalCategory")

    val converted = converter.transform(indexed)
    converted.select("id", "categoryIndex", "originalCategory").show()
  }
  // OneHotEncoder maps a category index to a sparse one-hot vector; by default the
  // last category is dropped, so three categories become 2-dimensional vectors.
  def testOneHotEncoder(): Unit = {
    val df = spark.createDataFrame(Seq(
      (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"), (6, "b")
    )).toDF("id", "category")
    val indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex").fit(df)

    val indexed = indexer.transform(df)
    val encoder = new OneHotEncoder().setInputCol("categoryIndex").setOutputCol("categoryVec")

    val encoded = encoder.transform(indexed)
    encoded.select("id", "categoryVec").show()
  }

}
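
The demos above reference a helper object MLLibConf that the original post does not show. A minimal sketch of what it might look like, assuming nothing more than a local-mode session (the object layout and app name here are assumptions, not the author's code):

package lean_mllib

import org.apache.spark.sql.SparkSession

// Hypothetical helper assumed by the demos above: one lazily created,
// local-mode SparkSession shared across all of the feature-extraction examples.
object MLLibConf {
  lazy val spark: SparkSession = SparkSession.builder()
    .appName("TestCountVector")
    .master("local[*]")
    .getOrCreate()
}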
