JAVA API操作小文件合并至HDFS笔记

相关文件请自行创建！！！

package com.hadoop.hdfs;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IOUtils;
/**
 * Merges small local text files into larger files on HDFS.
 *
 * <p>Every entry found directly under the local source directory (for example a date
 * directory named {@code 2019-10-31}) is concatenated into one HDFS file whose name is the
 * entry name with the dashes removed (for example {@code 20191031.txt}).
 */
public class MergeSmallFilesToHDFS {
    private static FileSystem fs = null;
    private static FileSystem local = null;

    /**
     * Program entry point; runs the merge with the hard-coded source and target locations.
     *
     * @param args ignored
     * @throws IOException if reading the local files or writing to HDFS fails
     * @throws URISyntaxException if the HDFS URI is malformed
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        list();
    }

    /**
     * Merges the data set and uploads it to HDFS.
     *
     * <p>For each entry under the local source directory (entries whose path ends in
     * {@code .svn} are skipped), all {@code .txt} files directly inside it are appended, in
     * the order returned by the glob, to a single HDFS output file.
     *
     * @throws IOException if reading the local files or writing to HDFS fails
     * @throws URISyntaxException if the HDFS URI is malformed
     */
    public static void list() throws IOException, URISyntaxException {
        // Load the Hadoop file system configuration.
        Configuration conf = new Configuration();
        // File system endpoint. NOTE: change hdfs://master:9000 to your own HDFS address.
        URI uri = new URI("hdfs://master:9000");
        fs = FileSystem.get(uri, conf);
        local = FileSystem.getLocal(conf);

        // List everything under the source directory except .svn entries. The trailing "/*"
        // is required: without it the glob only returns the directory itself.
        // NOTE: change E://Hadoop/73/ to your own path.
        FileStatus[] dirStatus = local.globStatus(
                new Path("E://Hadoop/73/*"), new RegexExcludePathFilter("^.*\\.svn$"));
        if (dirStatus == null) {
            // globStatus may return null when nothing matches; there is nothing to merge.
            return;
        }

        for (Path dir : FileUtil.stat2Paths(dirStatus)) {
            // e.g. "2019-10-31" -> "20191031"
            String fileName = dir.getName().replace("-", "");

            // Only accept the .txt files directly inside this directory.
            FileStatus[] localStatus = local.globStatus(
                    new Path(dir, "*"), new RegexAcceptPathFilter("^.*\\.txt$"));
            Path[] listedPaths =
                    localStatus == null ? new Path[0] : FileUtil.stat2Paths(localStatus);

            // Output path. NOTE: change hdfs://master:9000/20191031/ to your own HDFS directory.
            Path block = new Path("hdfs://master:9000/20191031/" + fileName + ".txt");
            System.out.println("合并后的文件名称：" + fileName + ".txt");

            // try-with-resources guarantees both streams are closed even if a copy fails.
            try (FSDataOutputStream out = fs.create(block)) {
                for (Path p : listedPaths) {
                    try (FSDataInputStream in = local.open(p)) {
                        // close=false: the output stream must stay open for the next file.
                        IOUtils.copyBytes(in, out, 4096, false);
                    }
                }
            }
        }
    }

    /**
     * Path filter that rejects every path whose full string form matches a regular expression.
     */
    public static class RegexExcludePathFilter implements PathFilter {
        private final Pattern pattern;

        /**
         * @param regex regular expression that must match the entire path string for the
         *     path to be rejected
         */
        public RegexExcludePathFilter(String regex) {
            this.pattern = Pattern.compile(regex);
        }

        /**
         * @param path candidate path
         * @return {@code true} if the path does NOT match the expression
         */
        @Override
        public boolean accept(Path path) {
            return !pattern.matcher(path.toString()).matches();
        }
    }

    /**
     * Path filter that accepts only paths whose full string form matches a regular expression.
     */
    public static class RegexAcceptPathFilter implements PathFilter {
        private final Pattern pattern;

        /**
         * @param regex regular expression that must match the entire path string for the
         *     path to be accepted
         */
        public RegexAcceptPathFilter(String regex) {
            this.pattern = Pattern.compile(regex);
        }

        /**
         * @param path candidate path
         * @return {@code true} if the path matches the expression
         */
        @Override
        public boolean accept(Path path) {
            return pattern.matcher(path.toString()).matches();
        }
    }
}

JAVA API操作小文件合并至HDFS笔记

相关推荐