The Java version (creating an RDD from an in-memory collection and from a text file):

```java
package com.jh.java;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class My_version1 {
    public static void main(String[] args) {
        // Tell Spark where Hadoop lives; this can be omitted if the
        // Hadoop environment variables are already configured
        System.setProperty("hadoop.home.dir", "G:\\untar\\hadoop-3.2.1\\hadoop-3.2.1");
        // Spark configuration
        SparkConf sparkConf = new SparkConf();
        // Give the application a name
        sparkConf.setAppName("My_java_spark");
        // Run mode: local
        sparkConf.setMaster("local[*]");
        // Initialize Spark
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        //test01(javaSparkContext);
        test02(javaSparkContext);
        // Release resources
        javaSparkContext.stop();
    }

    public static void test01(JavaSparkContext sc) {
        System.out.println("java-test01");
//        // Prepare a collection
//        List<Integer> asList = Arrays.asList(14, 56, 89, 42, 56, 33);
//        // Convert it into an RDD
//        JavaRDD<Integer> javaRDD = sc.parallelize(asList);
//        System.out.println(javaRDD);
//        // Iterate over the elements
//        javaRDD.foreach(t -> System.out.println(t));

        // Prepare a collection
        List<Integer> asList = Arrays.asList(14, 56, 89, 42, 56, 33);
        // The second argument is the number of partitions
        JavaRDD<Integer> javaRDD = sc.parallelize(asList, 2);
        System.out.println(javaRDD);
        javaRDD.foreachPartition(t -> {
            // Check the type: it is an iterator
            System.out.println("asd" + t.getClass().getSimpleName());
            // Iterate over the elements of this partition
            t.forEachRemaining(f -> System.out.print(f + ","));
            System.out.println("end");
        });
        // Number of partitions
        System.out.println("num:" + javaRDD.getNumPartitions());
    }

    public static void test02(JavaSparkContext sc) {
        System.out.println("java-test02");
        String path = "E:\\eclipse201912_daima\\dev31th\\test01";
        // Read the text file into an RDD with two partitions
        JavaRDD<String> textFile = sc.textFile(path, 2);
        // Collect the RDD into a list
        List<String> collect = textFile.collect();
        // Inspect the elements
        System.out.println(collect);
        // Inspect the number of partitions
        System.out.println("num" + textFile.getNumPartitions());
        String path1 = "C:\\Users\\admin\\Desktop\\2";
        // Write the data back out to disk
        textFile.saveAsTextFile(path1);
        System.out.println("GG");
    }
}
```
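As an aside, partition contents can also be inspected without `foreachPartition`: `glom()` turns each partition into a `List`, so the partition boundaries can be collected and printed on the driver in one call. A minimal sketch, assuming the same `local[*]` setup as above (the class name `GlomSketch` is illustrative, not from the original):

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class GlomSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("glom-sketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Same data and partition count as test01 above
        JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(14, 56, 89, 42, 56, 33), 2);

        // glom() yields one List<Integer> per partition
        List<List<Integer>> partitions = rdd.glom().collect();
        for (int i = 0; i < partitions.size(); i++) {
            System.out.println("partition " + i + ": " + partitions.get(i));
        }
        sc.stop();
    }
}
```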
The same examples in Scala:

```scala
package com.jh.scala

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import java.util.Arrays

object Teacher_version1 {
  def main(args: Array[String]): Unit = {
    // Tell Spark where Hadoop lives; this can be omitted if the
    // Hadoop environment variables are already configured
    System.setProperty("hadoop.home.dir", "G:\\untar\\hadoop-3.2.1\\hadoop-3.2.1")
    var conf = new SparkConf()
    // Set the master: run in local mode
    conf.setMaster("local[*]")
    // Give the application a name
    conf.setAppName("Teacher-version")
    var sc = new SparkContext(conf)
    test01(sc)
    //test02(sc)
    sc.stop()
  }

  def test01(c: SparkContext) {
    println("test01")
    var arr = Array(10, 20, 30, 40)
    // Create an RDD
    var arrRdd = c.parallelize(arr)
    // Inspect its elements
    var collectArr = arrRdd.collect()
    println("collectArr:" + Arrays.toString(collectArr))
    // Any of these forms iterates over the elements
//    arrRdd.foreach(println)
//    arrRdd.foreach(println(_))
//    arrRdd.foreach(t => println(t))

    println("partitions")
    // The second argument is the number of partitions
    var arrParaRdd = c.parallelize(arr, 2)
    // Inspect the elements
    collectArr = arrParaRdd.collect()
    println("collectArr:" + Arrays.toString(collectArr))
    // Inspect each partition of the partitioned collection
    arrParaRdd.foreachPartition(f => {
      // Check the type
      println("111" + f.getClass.getSimpleName)
      f.foreach(t => {
        print("loop" + t + "\t")
      })
      println("end")
    })
    // Number of partitions
    println("num:" + arrParaRdd.getNumPartitions)
  }

  def test02(sc: SparkContext) {
    // File path
    var path = "E:\\eclipse201912_daima\\dev31th\\test01"
    // Arguments: file path, number of partitions
    var textFileRdd = sc.textFile(path, 2)
    println("textFileRdd:" + textFileRdd)
    // Inspect the number of partitions
    println("getNumPartitions" + textFileRdd.getNumPartitions)
    // Collect the elements into an array
    var textArr = textFileRdd.collect()
    println("textArr" + textArr)
    textFileRdd.foreach(println)
    var tarPath = "C:\\Users\\admin\\Desktop\\1"
    // Save the RDD to disk as text files
    textFileRdd.saveAsTextFile(tarPath)
  }
}
```
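One caveat for both `test02` methods: `saveAsTextFile` refuses to write into a directory that already exists, so re-running either example fails until the target directory is removed. A minimal helper sketch using the Hadoop FileSystem API via the same Java API used above (the class and method names here are illustrative, not from the original):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaSparkContext;

public class SaveHelper {
    // Delete the output directory (recursively) if it already exists,
    // so a re-run of saveAsTextFile does not fail.
    public static void clearOutputDir(JavaSparkContext sc, String dir) throws Exception {
        Configuration hadoopConf = sc.hadoopConfiguration();
        FileSystem fs = FileSystem.get(hadoopConf);
        Path out = new Path(dir);
        if (fs.exists(out)) {
            fs.delete(out, true); // true = delete recursively
        }
    }
}
```

Called as `SaveHelper.clearOutputDir(sc, path1)` just before `textFile.saveAsTextFile(path1)`, this makes the examples rerunnable.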