Accessing HDFS from Java: configuration instructions for the Hadoop Distributed File System

Author：Eve Cole Update Time：2025-03-26 19:32:01

Configuration File

In the configuration files below, replace m103 with the address of your own HDFS service.
To access files on HDFS from a Java client, the configuration file hadoop-0.20.2/conf/core-site.xml must be set up correctly. I got this wrong at first and paid for it: I could not connect to HDFS, and files could be neither created nor read.

 <?xml version="1.0"?>
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 <!-- core-site.xml: settings shared by the Hadoop daemons and by clients. -->
 <configuration>

   <!-- Global properties -->
   <property>
     <name>hadoop.tmp.dir</name>
     <value>/home/zhangzk/hadoop</value>
     <description>A base for other temporary directories.</description>
   </property>

   <!-- File system properties -->
   <property>
     <!-- URI of the default file system. -->
     <name>fs.default.name</name>
     <value>hdfs://linux-zzk-113:9000</value>
   </property>

 </configuration>

Configuration item: hadoop.tmp.dir is the directory where the NameNode stores its metadata; on a DataNode, it is the directory where that node stores file data.

Configuration item: fs.default.name is the address (host name or IP address, plus port number) of the NameNode. The default value is file:///. The Java API must use the URL configured here to connect to HDFS, and DataNodes also reach the NameNode through this URL.

hdfs-site.xml

 <?xml version="1.0" encoding="UTF-8"?>
 <!--Autogenerated by Cloudera Manager-->
 <configuration>

   <!-- NameNode storage and service endpoints (host m103) -->
   <property>
     <name>dfs.namenode.name.dir</name>
     <value>file:///mnt/sdc1/dfs/nn</value>
   </property>
   <property>
     <name>dfs.namenode.servicerpc-address</name>
     <value>m103:8022</value>
   </property>
   <property>
     <name>dfs.https.address</name>
     <value>m103:50470</value>
   </property>
   <property>
     <name>dfs.https.port</name>
     <value>50470</value>
   </property>
   <property>
     <name>dfs.namenode.http-address</name>
     <value>m103:50070</value>
   </property>

   <!-- Block replication and block size (134217728 bytes = 128 MiB) -->
   <property>
     <name>dfs.replication</name>
     <value>3</value>
   </property>
   <property>
     <name>dfs.blocksize</name>
     <value>134217728</value>
   </property>

   <!-- Client connection behaviour, permissions and ACLs -->
   <property>
     <name>dfs.client.use.datanode.hostname</name>
     <value>false</value>
   </property>
   <property>
     <name>fs.permissions.umask-mode</name>
     <value>022</value>
   </property>
   <property>
     <name>dfs.namenode.acls.enabled</name>
     <value>false</value>
   </property>

   <!-- Local-path access and short-circuit read settings -->
   <property>
     <name>dfs.block.local-path-access.user</name>
     <value>cloudera-scm</value>
   </property>
   <property>
     <name>dfs.client.read.shortcircuit</name>
     <value>false</value>
   </property>
   <property>
     <name>dfs.domain.socket.path</name>
     <value>/var/run/hdfs-sockets/dn</value>
   </property>
   <property>
     <name>dfs.client.read.shortcircuit.skip.checksum</name>
     <value>false</value>
   </property>
   <property>
     <name>dfs.client.domain.socket.data.traffic</name>
     <value>false</value>
   </property>
   <property>
     <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
     <value>true</value>
   </property>

   <!-- Implementation class registered for the "http" file system scheme -->
   <property>
     <name>fs.http.impl</name>
     <value>com.scistor.datavision.fs.HTTPFileSystem</value>
   </property>

 </configuration>

mapred-site.xml

 <?xml version="1.0" encoding="UTF-8"?>
 <!--Autogenerated by Cloudera Manager-->
 <!--
   Each property is declared exactly once. The earlier revision repeated
   mapreduce.reduce.speculative and mapreduce.map.cpu.vcores with identical
   values; the redundant copies have been removed.
 -->
 <configuration>

   <!-- Job limits -->
   <property>
     <name>mapreduce.job.split.metainfo.maxsize</name>
     <value>100000000</value>
   </property>
   <property>
     <name>mapreduce.job.counters.max</name>
     <value>120</value>
   </property>

   <!-- Compression of job output and intermediate map output -->
   <property>
     <name>mapreduce.output.fileoutputformat.compress</name>
     <value>true</value>
   </property>
   <property>
     <name>mapreduce.output.fileoutputformat.compress.type</name>
     <value>BLOCK</value>
   </property>
   <property>
     <name>mapreduce.output.fileoutputformat.compress.codec</name>
     <value>org.apache.hadoop.io.compress.SnappyCodec</value>
   </property>
   <property>
     <name>mapreduce.map.output.compress.codec</name>
     <value>org.apache.hadoop.io.compress.SnappyCodec</value>
   </property>
   <property>
     <name>mapreduce.map.output.compress</name>
     <value>true</value>
   </property>
   <property>
     <name>zlib.compress.level</name>
     <value>DEFAULT_COMPRESSION</value>
   </property>

   <!-- Sort, spill and shuffle tuning -->
   <property>
     <name>mapreduce.task.io.sort.factor</name>
     <value>64</value>
   </property>
   <property>
     <name>mapreduce.map.sort.spill.percent</name>
     <value>0.8</value>
   </property>
   <property>
     <name>mapreduce.reduce.shuffle.parallelcopies</name>
     <value>10</value>
   </property>
   <property>
     <name>mapreduce.task.io.sort.mb</name>
     <value>256</value>
   </property>
   <property>
     <name>mapreduce.shuffle.max.connections</name>
     <value>80</value>
   </property>

   <!-- Task scheduling -->
   <property>
     <name>mapreduce.task.timeout</name>
     <value>600000</value>
   </property>
   <property>
     <name>mapreduce.client.submit.file.replication</name>
     <value>1</value>
   </property>
   <property>
     <name>mapreduce.job.reduces</name>
     <value>24</value>
   </property>
   <property>
     <name>mapreduce.map.speculative</name>
     <value>false</value>
   </property>
   <property>
     <name>mapreduce.reduce.speculative</name>
     <value>false</value>
   </property>
   <property>
     <name>mapreduce.job.reduce.slowstart.completedmaps</name>
     <value>0.8</value>
   </property>
   <property>
     <name>mapreduce.job.ubertask.enable</name>
     <value>false</value>
   </property>

   <!-- Job history server endpoints (host m103) -->
   <property>
     <name>mapreduce.jobhistory.address</name>
     <value>m103:10020</value>
   </property>
   <property>
     <name>mapreduce.jobhistory.webapp.address</name>
     <value>m103:19888</value>
   </property>
   <property>
     <name>mapreduce.jobhistory.webapp.https.address</name>
     <value>m103:19890</value>
   </property>
   <property>
     <name>mapreduce.jobhistory.admin.address</name>
     <value>m103:10033</value>
   </property>

   <!-- Execution framework and application master settings -->
   <property>
     <name>mapreduce.framework.name</name>
     <value>yarn</value>
   </property>
   <property>
     <name>yarn.app.mapreduce.am.staging-dir</name>
     <value>/user</value>
   </property>
   <property>
     <name>mapreduce.am.max-attempts</name>
     <value>2</value>
   </property>
   <property>
     <name>yarn.app.mapreduce.am.resource.mb</name>
     <value>2048</value>
   </property>
   <property>
     <name>yarn.app.mapreduce.am.resource.cpu-vcores</name>
     <value>1</value>
   </property>
   <property>
     <name>yarn.app.mapreduce.am.command-opts</name>
     <value>-Djava.net.preferIPv4Stack=true -Xmx1717986918</value>
   </property>
   <property>
     <name>yarn.app.mapreduce.am.admin.user.env</name>
     <value>LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:$JAVA_LIBRARY_PATH</value>
   </property>

   <!-- Map and reduce task resources and JVM options -->
   <property>
     <name>mapreduce.map.java.opts</name>
     <value>-Djava.net.preferIPv4Stack=true -Xmx1717986918</value>
   </property>
   <property>
     <name>mapreduce.reduce.java.opts</name>
     <value>-Djava.net.preferIPv4Stack=true -Xmx2576980378</value>
   </property>
   <property>
     <name>mapreduce.map.memory.mb</name>
     <value>2048</value>
   </property>
   <property>
     <name>mapreduce.map.cpu.vcores</name>
     <value>1</value>
   </property>
   <property>
     <name>mapreduce.reduce.memory.mb</name>
     <value>3072</value>
   </property>
   <property>
     <name>mapreduce.reduce.cpu.vcores</name>
     <value>1</value>
   </property>

   <!-- Classpath and environment for MapReduce applications -->
   <property>
     <name>mapreduce.application.classpath</name>
     <value>$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,$MR2_CLASSPATH,$CDH_HCAT_HOME/share/hcatalog/*,$CDH_HIVE_HOME/lib/*,/etc/hive/conf,/opt/cloudera/parcels/CDH/lib/udps/*</value>
   </property>
   <property>
     <name>mapreduce.admin.user.env</name>
     <value>LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:$JAVA_LIBRARY_PATH</value>
   </property>

 </configuration>

Use Java API to access HDFS files and directories

 package com.demo.hdfs;import java.io.BufferedInputStream;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.OutputStream;import java.net.URI;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FSDataInputStream;import org.apache.hadoop.fs.FSDataOutputStream;import org.apache.hadoop.fs.FileStatus;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IOUtils;import org.apache.hadoop.util.Progressable;/** * @author zhangzk * */public class FileCopyToHdfs { public static void main(String[] args) throws Exception { try { //uploadToHdfs(); //deleteFromHdfs(); //getDirectoryFromHdfs(); appendToHdfs(); readFromHdfs(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { System.out.println("SUCCESS"); } } /**Upload the file to HDFS*/ private static void uploadToHdfs() throws FileNotFoundException,IOException { String localSrc = "d://qq.txt"; String dst = "hdfs://192.168.0.113:9000/user/zhangzk/qq.txt"; InputStream in = new BufferedInputStream(new FileInputStream(localSrc)); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(dst), conf); OutputStream out = fs.create(new Path(dst), new Progressable() { public void progress() { System.out.print("."); } }); IOUtils.copyBytes(in, out, 4096, true); } /**Read file from HDFS*/ private static void readFromHdfs() throws FileNotFoundException,IOException { String dst = "hdfs://192.168.0.113:9000/user/zhangzk/qq.txt"; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(dst), conf); FSDataInputStream hdfsInStream = fs.open(new Path(dst)); OutputStream out = new FileOutputStream("d:/qq-hdfs.txt"); byte[] ioBuffer = new byte[1024]; int readLen = hdfsInStream.read(ioBuffer); while(-1 != readLen){ out.write(ioBuffer, 0, readLen); readLen = 
hdfsInStream.read(ioBuffer); } out.close(); hdfsInStream.close(); fs.close(); } /**Add content to the end of the file on HDFS in append; Note: when file updates, you need to add <property><name>dfs.append.support</name><value>true</value></property>*/ private static void appendToHdfs() throws FileNotFoundException,IOException { String dst = "hdfs://192.168.0.113:9000/user/zhangzk/qq.txt"; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(dst), conf); FSDataOutputStream out = fs.append(new Path(dst)); int readLen = "zhangzk add by hdfs java api".getBytes().length; while(-1 != readLen){ out.write("zhangzk add by hdfs java api".getBytes(), 0, readLen); } out.close(); fs.close(); } /**Delete the file from HDFS*/ private static void deleteFromHdfs() throws FileNotFoundException,IOException { String dst = "hdfs://192.168.0.113:9000/user/zhangzk/qq-bak.txt"; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(dst), conf); fs.deleteOnExit(new Path(dst)); fs.close(); } /**Transfer files and directories on HDFS*/ private static void getDirectoryFromHdfs() throws FileNotFoundException,IOException { String dst = "hdfs://192.168.0.113:9000/user/zhangzk"; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(dst), conf); FileStatus fileList[] = fs.listStatus(new Path(dst)); int size = fileList.length; for(int i = 0; i < size; i++){ System.out.println("name:" + fileList[i].getPath().getName() + "/t/tsize:" + fileList[i].getLen()); } fs.close(); } }

Note: The append operation has not been supported since hadoop-0.21. For details on append operations, please refer to the related document on Javaeye.