Binarization (二值化) — thresholding a continuous feature into 0.0/1.0 with Spark ML's Binarizer:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.Binarizer

/** Demonstrates Spark ML's [[Binarizer]]: maps a continuous `feature`
  * column to 0.0/1.0 using a fixed threshold of 0.5 and prints the result.
  */
object Test {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Test")
      .master("local[4]")
      .getOrCreate()

    // 10 rows of (id, random feature in [0, 1)).
    // Array.tabulate replaces the original mutable-array + index loop.
    val data = Array.tabulate(10)(i => (i, scala.util.Random.nextDouble()))
    val dataFrame = spark.createDataFrame(data).toDF("id", "feature")

    // Per Binarizer semantics: values strictly greater than the threshold
    // become 1.0, all others become 0.0.
    val binarizer: Binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(0.5)

    val binarizedDataFrame = binarizer.transform(dataFrame)
    binarizedDataFrame.show()

    // Stop the session explicitly instead of leaking it at JVM exit.
    spark.stop()
  }
}
+---+--------------------+-----------------+
| id| feature|binarized_feature|
+---+--------------------+-----------------+
| 0|0.040647633164892394| 0.0|
| 1| 0.884659994780227| 1.0|
| 2| 0.6001477317587052| 1.0|
| 3| 0.6894334347305856| 1.0|
| 4|0.043729244458723615| 0.0|
| 5| 0.9618516358924446| 1.0|
| 6| 0.9630185650445161| 1.0|
| 7| 0.4671422367665342| 0.0|
| 8| 0.11077010598457682| 0.0|
| 9| 0.5390550839376956| 1.0|
+---+--------------------+-----------------+