Spark Tutorial
Author: 时海 风自在
WordCount

1. Using in-memory data directly

import org.apache.spark.{SparkConf, SparkContext}

object MyWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wordcount")
    val sc = new SparkContext(conf)
    // Build an RDD from an in-memory collection
    val rdd1 = sc.parallelize(Seq("hello world", "hello spark", "hello nice"))
    // Split each line into words and map every word to a (word, 1) pair
    val rdd2 = rdd1.flatMap(x => x.split(" ")).map(x => (x, 1))
    // Sum the counts for each word
    val rdd3 = rdd2.reduceByKey((x, y) => x + y)
    // Bring the results back to the driver and print them
    val result = rdd3.collect()
    result.foreach(x => println(x))
    sc.stop()
  }
}
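Before submitting, the program has to be packaged into a jar (wordcount.jar in the command below). A minimal build.sbt sketch for doing this with sbt; the Scala and Spark versions here are assumptions and should be matched to your cluster:

name := "wordcount"

version := "1.0"

// Scala and Spark versions are assumptions; use the ones your cluster runs
scalaVersion := "2.11.12"

// "provided" because the Spark jars are supplied by spark-submit at runtime
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.8" % "provided"

Running sbt package then produces a jar under target/scala-2.11/ that can be passed to spark-submit.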

Run in local mode (local[4] runs Spark locally with 4 worker threads):

bin/spark-submit --master "local[4]" --class "MyWordCount" wordcount.jar > result.txt

Output:


(spark,1)
(nice,1)
(hello,3)
(world,1)

2. Reading data from HDFS and writing the results back to HDFS


import org.apache.spark.{SparkConf, SparkContext}

object MyWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wordcount")
    val sc = new SparkContext(conf)
    // Read the input text file(s) from HDFS
    val rdd1 = sc.textFile("hdfs://<masterIP>:9000/tmp/input")
    // Split each line into words and map every word to a (word, 1) pair
    val rdd2 = rdd1.flatMap(x => x.split(" ")).map(x => (x, 1))
    // Sum the counts for each word
    val rdd3 = rdd2.reduceByKey((x, y) => x + y)
    // Write the result back to HDFS (the output directory must not already exist)
    rdd3.saveAsTextFile("hdfs://<masterIP>:9000/tmp/output")
    sc.stop()
  }
}
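This version reads from and writes to HDFS, so nothing is printed on the driver. A sketch of submitting it to a cluster and checking the output; the standalone master URL spark://<masterIP>:7077 and the jar name are assumptions:

bin/spark-submit --master spark://<masterIP>:7077 --class "MyWordCount" wordcount.jar

hdfs dfs -cat /tmp/output/part-*

saveAsTextFile writes one part-NNNNN file per partition, so the output is read with a wildcard. Note that the job fails if /tmp/output already exists.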

