Spark Tutorial
1. Using in-memory data directly
import org.apache.spark.{SparkConf, SparkContext}

object MyWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("wordcount")
    val sc = new SparkContext(conf)
    // Build an RDD from an in-memory collection
    val rdd1 = sc.parallelize(Seq("hello world", "hello spark", "hello nice"))
    // Split each line into words and map every word to a (word, 1) pair
    val rdd2 = rdd1.flatMap(x => x.split(" ")).map(x => (x, 1))
    // Sum the counts for each word
    val rdd3 = rdd2.reduceByKey((x, y) => x + y)
    // Collect the results to the driver and print them
    val result = rdd3.collect()
    result.foreach(x => println(x))
  }
}
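Before submitting, the source above has to be compiled and packaged into the wordcount.jar used below. A minimal build.sbt sketch for doing that with sbt (the Scala and Spark versions here are assumptions, match them to your cluster):

name := "wordcount"
version := "1.0"
scalaVersion := "2.12.15"   // assumption: a Spark build for Scala 2.12
libraryDependencies += "org.apache.spark" %% "spark-core" % "3.1.2" % "provided"

Running sbt package then writes the jar under target/scala-2.12/.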
Run in local mode:
bin/spark-submit --master "local[4]" --class "MyWordCount" wordcount.jar > result.txt

Output:
(spark,1)
(nice,1)
(hello,3)
(world,1)
2. Reading data from HDFS and writing the result back to HDFS
import org.apache.spark.{SparkConf, SparkContext}

object MyWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("wordcount")
    val sc = new SparkContext(conf)
    // Read the input text file(s) from HDFS
    val rdd1 = sc.textFile("hdfs://<masterIP>:9000/tmp/input")
    val rdd2 = rdd1.flatMap(x => x.split(" ")).map(x => (x, 1))
    val rdd3 = rdd2.reduceByKey((x, y) => x + y)
    // Write the word counts back to HDFS as text files
    rdd3.saveAsTextFile("hdfs://<masterIP>:9000/tmp/output")
  }
}
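Submitting this version works the same way as in section 1. One possible invocation against a standalone cluster, followed by inspecting the result files on HDFS (the master URL spark://<masterIP>:7077 is an assumption, adjust it to your setup):

bin/spark-submit --master spark://<masterIP>:7077 --class "MyWordCount" wordcount.jar
hdfs dfs -cat hdfs://<masterIP>:9000/tmp/output/part-*

Note that saveAsTextFile fails if the output directory already exists, so remove /tmp/output (hdfs dfs -rm -r /tmp/output) before re-running the job.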