GraphFrames 的定位:GraphX 基于 Spark RDD,GraphFrames 基于 Spark DataFrame
Maven依赖:
<dependency> <groupId>org.graphframes</groupId> <artifactId>graphframes</artifactId> <version>0.5.0-spark2.1-s_2.11</version> </dependency>
import org.apache.spark.sql.SparkSession
import org.graphframes.GraphFrame

/** Minimal GraphFrames example: builds a small social graph from two in-memory DataFrames.
  *
  * Runs on a local 4-thread Spark master, so no cluster is needed.
  */
object GraphFrameTest {

  /** Creates the vertex and edge DataFrames, wraps them in a [[GraphFrame]] and shuts Spark down.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("GraphFrameTest")
      .master("local[4]")
      .getOrCreate()

    // try/finally so the session is stopped even when graph construction fails.
    try {
      // Vertex DataFrame; column names follow the GraphFrames convention (`id` identifies a vertex).
      val v = spark.createDataFrame(List(
        ("a", "Alice", 34),
        ("b", "Bob", 36),
        ("c", "Charlie", 30),
        ("d", "David", 29),
        ("e", "Esther", 32),
        ("f", "Fanny", 36),
        ("g", "Gabby", 60)
      )).toDF("id", "name", "age")

      // Edge DataFrame; `src` and `dst` hold vertex ids, `relationship` is an edge attribute.
      val e = spark.createDataFrame(List(
        ("a", "b", "friend"),
        ("b", "c", "follow"),
        ("c", "b", "follow"),
        ("f", "c", "follow"),
        ("e", "f", "follow"),
        ("e", "d", "friend"),
        ("d", "a", "friend"),
        ("a", "e", "friend")
      )).toDF("src", "dst", "relationship")

      // Build the GraphFrame graph object.
      val g = GraphFrame(v, e)

      // TODO: run graph queries / algorithms on `g` here.
    } finally {
      spark.stop()
    }
  }
}
// In-degree: number of incoming edges per vertex.
// Vertices without incoming edges (here "g") do not appear in the result.
val inDeg = g.inDegrees

inDeg.printSchema()
// root
//  |-- id: string (nullable = true)
//  |-- inDegree: integer (nullable = false)

inDeg.show()
// +---+--------+
// | id|inDegree|
// +---+--------+
// |  f|       1|
// |  e|       1|
// |  d|       1|
// |  c|       2|
// |  b|       2|
// |  a|       1|
// +---+--------+

// Out-degree: number of outgoing edges per vertex.
val outDeg = g.outDegrees

outDeg.printSchema()
// root
//  |-- id: string (nullable = true)
//  |-- outDegree: integer (nullable = false)

outDeg.show()
// +---+---------+
// | id|outDegree|
// +---+---------+
// |  f|        1|
// |  e|        2|
// |  d|        1|
// |  c|        1|
// |  b|        1|
// |  a|        2|
// +---+---------+
计算PageRank值
待续