Spark教程
1、从SparkSession.sqlContext创建DataFrame
// Obtain (or create) the SparkSession that drives this example,
// running locally with four worker threads.
val spark = SparkSession
  .builder()
  .appName("DataFrameTest")
  .master("local[4]")
  .getOrCreate()

// SQLContext handle used for the legacy createDataFrame entry point.
val sqlContext = spark.sqlContext

// Vertex data as plain tuples: (id, name, age, sex).
val vertexRows = List(
  ("a", "Alice", 34, "w"),
  ("b", "Bob", 36, "m"),
  ("c", "Charlie", 30, "w"),
  ("d", "David", 29, "m"),
  ("e", "Esther", 32, "m"),
  ("f", "Fanny", 36, "m"),
  ("g", "Alice", 60, "w")
)

// Build the vertex DataFrame and assign column names.
val person = sqlContext
  .createDataFrame(vertexRows)
  .toDF("id", "name", "age", "sex")
2、直接从Seq、Array、List等集合创建DataFrame
// Bring the toDF / Encoder implicits of this session's SQLContext into scope.
import spark.sqlContext.implicits._

// Sample rows as tuples; the third element becomes a DATE column.
val sampleRows = Seq(
  ("Sam", 24, java.sql.Date.valueOf("2018-06-06")),
  ("Tome", 26, java.sql.Date.valueOf("2018-08-08"))
)

// Convert the local collection directly into a named-column DataFrame.
val df1 = sampleRows.toDF("name", "age", "register_time")

df1.printSchema()
df1.show()
3、使用Row RDD和Schema创建DataFrame
import org.apache.spark.sql.types._

// Session setup for this standalone example (local mode, four threads).
val spark = SparkSession
  .builder()
  .appName("DataFrameTest")
  .master("local[4]")
  .getOrCreate()
val sc = spark.sparkContext

// Explicit schema: name is required; age and register_time may be null.
val schema = StructType(List(
  StructField("name", StringType, nullable = false),
  StructField("age", IntegerType, nullable = true),
  StructField("register_time", DateType, nullable = true)
))

// Row RDD whose field order and types match the schema above.
// val (was var): the RDD is never reassigned, so it should be immutable.
val rdd3 = sc.parallelize(Seq(
  Row("Sam", 24, java.sql.Date.valueOf("2018-06-06")),
  Row("Tome", 26, java.sql.Date.valueOf("2018-08-08"))
))

// Combine the Row RDD with the schema to obtain a typed DataFrame.
val df3 = spark.createDataFrame(rdd3, schema)
df3.printSchema()
df3.show()
4、从JSON字符串或JSON文件创建DataFrame
// Build an RDD of JSON strings, one document per element.
// Triple-quoted literals avoid escaping every inner quote
// (runtime string content is identical to the escaped form).
val jsonRdd = sc.makeRDD(Array(
  """{"name":"Same","age":24}""",
  """{"name":"Tom","age":26}"""
))

// Reading from a JSON file works the same way:
// val df4 = spark.read.json("example/data/data.json")

// val (was var): df4 is never reassigned.
// NOTE(review): json(RDD[String]) is deprecated since Spark 2.2 —
// prefer spark.read.json(spark.createDataset(jsonRdd)(Encoders.STRING));
// kept here to preserve the tutorial's original API. Verify Spark version.
val df4 = spark.read.json(jsonRdd)
df4.printSchema()
df4.show()