1. Preparing the Hadoop Environment

        Hadoop's batch-processing framework, MapReduce, makes even WordCount somewhat laborious to implement, and in execution efficiency it still lags well behind Flink and Spark. Below, WordCount is implemented with MapReduce.
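
The original section does not spell out the environment setup. As a minimal sketch, assuming a Maven project, the single hadoop-client artifact pulls in everything the code below needs (the version shown is only an example; match it to your cluster):

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.3.6</version>
</dependency>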

 2. Writing the Map Class

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reuse the output key/value objects across calls to avoid per-record allocation
    Text k = new Text();
    IntWritable v = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read one line of input
        String line = value.toString();
        // 2. Split the line into words
        String[] words = line.split(" ");
        // 3. Emit (word, 1) for each word
        for (String word : words) {
            k.set(word);
            context.write(k, v);
        }
    }
}
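
One caveat: String.split(" ") yields empty tokens when words are separated by consecutive spaces, so blank "words" would be counted. A small sketch of the difference, using the whitespace regex \s+ instead:

String line = "hello  world";               // two spaces between the words
String[] bySpace = line.split(" ");         // ["hello", "", "world"] - note the empty token
String[] byWhitespace = line.split("\\s+"); // ["hello", "world"]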

 3. Writing the Reduce Class

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    int sum;
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // 1. Accumulate the counts for this word
        sum = 0;
        for (IntWritable count : values) {
            sum += count.get();
        }
        // 2. Emit (word, total)
        v.set(sum);
        context.write(key, v);
    }
}
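
Because this reduce function is just an associative, commutative sum, the same class can also serve as a combiner that pre-aggregates counts on the map side and cuts shuffle traffic. This is an optional tweak, not part of the original code; it would be one extra line in the driver shown in the next section:

job.setCombinerClass(WordCountReduce.class); // pre-aggregate (word, 1) pairs on the map side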

 4. Submitting the Job

    // Required imports for the driver (at the top of the enclosing class, e.g. HadoopUtils):
    // import org.apache.hadoop.conf.Configuration;
    // import org.apache.hadoop.fs.Path;
    // import org.apache.hadoop.mapreduce.Job;
    // import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    // import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    /**
     * Builds and submits the WordCount MapReduce job (runs locally or on YARN,
     * depending on the Configuration on the classpath).
     */
    public static void mapReduceDriver(String... args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Load the configuration and create the job
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        // 2. Locate the job jar via a class it contains
        job.setJarByClass(HadoopApp.class);
        // 3. Set the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReduce.class);
        // 4. Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7. Submit the job and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
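
The Javadoc above mentions YARN: once these classes are packaged into a jar, the job can be submitted to a cluster with the standard hadoop jar command. The jar name, main class, and HDFS paths below are placeholders, not values from the original post:

hadoop jar wordcount.jar com.example.HadoopApp /input/hadoop_word_count.txt /output/word_count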

        Test

        String inPath = "D:/workplace/java-item/res/file";
        String outPath = inPath + "/hadoop";
        try {
            // Note: the second argument is an output *directory* that MapReduce creates
            // (results land in files like part-r-00000 inside it); it must not already exist.
            HadoopUtils.mapReduceDriver(inPath + "/hadoop_word_count.txt", outPath + "/out_word_count.txt");
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            e.printStackTrace();
            System.out.println("Failed to run the MapReduce job");
        }
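
Re-running this test fails with a FileAlreadyExistsException, because MapReduce refuses to write into an existing output directory. A minimal sketch of clearing it first, using the org.apache.hadoop.fs.FileSystem API (outPath matches the variable in the test above):

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path out = new Path(outPath + "/out_word_count.txt");
if (fs.exists(out)) {
    fs.delete(out, true); // recursively delete the old output directory
}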
