Spark教程
作者: 时海 风自在
创建DataFrame

1、从SparkSession.sqlContext创建

    // Example 1: create a DataFrame through SparkSession.sqlContext.
    val spark = SparkSession
      .builder()
      .appName("DataFrameTest")
      .master("local[4]")
      .getOrCreate()

    val sqlContext = spark.sqlContext

    // Vertex data, one tuple per row: (id, name, age, sex).
    val vertices = Seq(
      ("a", "Alice", 34, "w"),
      ("b", "Bob", 36, "m"),
      ("c", "Charlie", 30, "w"),
      ("d", "David", 29, "m"),
      ("e", "Esther", 32, "m"),
      ("f", "Fanny", 36, "m"),
      ("g", "Alice", 60, "w")
    )

    // Build the DataFrame, then give the columns meaningful names.
    val person = sqlContext
      .createDataFrame(vertices)
      .toDF("id", "name", "age", "sex")

2、直接从Seq、Array、List等创建


 // Example 2: create a DataFrame directly from a Scala collection
 // using the implicit toDF conversion.
 import spark.sqlContext.implicits._

 // One tuple per row: (name, age, register_time).
 val rows = List(
   ("Sam", 24, java.sql.Date.valueOf("2018-06-06")),
   ("Tome", 26, java.sql.Date.valueOf("2018-08-08"))
 )
 val df1 = rows.toDF("name", "age", "register_time")

 df1.printSchema()
 df1.show()

3、使用Row RDD和Schema创建

   // Example 3: create a DataFrame from an RDD[Row] plus an explicit schema.
   // Row lives in org.apache.spark.sql, NOT in the types package — the import
   // below is required or this snippet does not compile.
   import org.apache.spark.sql.Row
   import org.apache.spark.sql.types._

    val spark = SparkSession
      .builder()
      .appName("DataFrameTest")
      .master("local[4]")
      .getOrCreate()

    val sc = spark.sparkContext

    // Explicit schema: column name, Spark SQL type, and nullability for each field.
    val schema = StructType(List(
      StructField("name", StringType, nullable = false),
      StructField("age", IntegerType, nullable = true),
      StructField("register_time", DateType, nullable = true)
    ))

    // val, not var: the RDD reference is never reassigned.
    // Each Row's field order and types must match the schema above.
    val rdd3 = sc.parallelize(Seq(
      Row("Sam", 24, java.sql.Date.valueOf("2018-06-06")),
      Row("Tome", 26, java.sql.Date.valueOf("2018-08-08"))
    ))

    // Pair the Row RDD with the schema to obtain a typed DataFrame.
    val df3 = spark.createDataFrame(rdd3, schema)

    df3.printSchema()
    df3.show()

4、从json字符串或json文件创建


// Example 4: create a DataFrame from JSON strings (or a JSON file).
// Each RDD element must be one complete JSON object on a single line.
val jsonRdd = sc.makeRDD(Array(
   "{\"name\":\"Same\",\"age\":24}",
   "{\"name\":\"Tom\",\"age\":26}"
 ))

 // spark.read.json(RDD[String]) is deprecated since Spark 2.2;
 // wrap the RDD in a Dataset[String] and use the supported overload.
 import spark.implicits._
 // val, not var: df4 is never reassigned.
 val df4 = spark.read.json(spark.createDataset(jsonRdd))

 // Reading from a JSON file works the same way:
 //    val df4 = spark.read.json("example/data/data.json")

 df4.printSchema()
 df4.show()
标签: val、sqlcontext、df4、spark、age
一个创业中的苦逼程序员
  • 回复
隐藏