Commit ecce24a

Author: Aswin M Prabhu (committed)
HDFS-13538. HDFS DiskChecker should handle disk full situation
1 parent 631939f commit ecce24a

File tree

4 files changed (+47, -12 lines)


hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DiskChecker.java

Lines changed: 4 additions & 1 deletion
@@ -267,7 +267,10 @@ private static void doDiskIo(File dir) throws DiskErrorException {
           ioe = e;
         }
       }
-      throw ioe; // Just rethrow the last exception to signal failure.
+      // Throw the exception only if it's not about disk being full.
+      if (!ioe.getMessage().contains("No space left")) {
+        throw ioe; // Just rethrow the last exception to signal failure.
+      }
     } catch(IOException e) {
       throw new DiskErrorException("Error checking directory " + dir, e);
     }
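
Note (illustration, not part of the commit): a self-contained sketch of the pattern this hunk implements, i.e. retry a small probe write a few times and swallow only "disk full" failures. The probe-file handling, iteration count, and class/method names below are simplified assumptions rather than DiskChecker's actual helpers.

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

public class DiskFullTolerantCheck {
  // Stand-in for DiskChecker.DISK_IO_MAX_ITERATIONS.
  private static final int MAX_ITERATIONS = 3;

  public static void check(File dir) throws IOException {
    IOException ioe = null;
    for (int i = 0; i < MAX_ITERATIONS; ++i) {
      File probe = new File(dir, "probe-" + i + ".tmp");
      try (FileOutputStream fos = new FileOutputStream(probe)) {
        fos.write(new byte[1]);  // minimal real disk I/O, like the actual check
        return;                  // one successful attempt passes the check
      } catch (IOException e) {
        ioe = e;                 // remember the last failure and retry
      } finally {
        probe.delete();
      }
    }
    // A full disk is still a healthy disk: swallow ENOSPC, rethrow anything else.
    if (ioe != null) {
      String msg = ioe.getMessage();
      if (msg == null || !msg.contains("No space left")) {
        throw ioe;
      }
    }
  }

  public static void main(String[] args) throws IOException {
    check(new File(System.getProperty("java.io.tmpdir")));
    System.out.println("directory check passed");
  }
}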

hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDiskCheckerWithDiskIo.java

Lines changed: 25 additions & 8 deletions
@@ -31,8 +31,7 @@
 import java.nio.file.attribute.PosixFilePermissions;
 import java.util.concurrent.atomic.AtomicInteger;
 
-import static org.junit.jupiter.api.Assertions.assertThrows;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;
 
 
 /**
@@ -48,7 +47,7 @@ public final class TestDiskCheckerWithDiskIo {
   @Test
   public final void testDiskIoIgnoresTransientCreateErrors() throws Throwable {
     DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider(
-        DiskChecker.DISK_IO_MAX_ITERATIONS - 1, 0));
+        DiskChecker.DISK_IO_MAX_ITERATIONS - 1, 0, false));
     checkDirs(true);
   }
 
@@ -59,7 +58,7 @@ public final void testDiskIoIgnoresTransientCreateErrors() throws Throwable {
   public final void testDiskIoDetectsCreateErrors() throws Throwable {
     assertThrows(DiskErrorException.class, () -> {
       DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider(
-          DiskChecker.DISK_IO_MAX_ITERATIONS, 0));
+          DiskChecker.DISK_IO_MAX_ITERATIONS, 0, false));
       checkDirs(false);
     });
   }
@@ -70,7 +69,7 @@ public final void testDiskIoDetectsCreateErrors() throws Throwable {
   @Test
   public final void testDiskIoIgnoresTransientWriteErrors() throws Throwable {
     DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider(
-        0, DiskChecker.DISK_IO_MAX_ITERATIONS - 1));
+        0, DiskChecker.DISK_IO_MAX_ITERATIONS - 1, false));
     checkDirs(true);
   }
 
@@ -81,7 +80,7 @@ public final void testDiskIoIgnoresTransientWriteErrors() throws Throwable {
   public final void testDiskIoDetectsWriteErrors() throws Throwable {
     assertThrows(DiskErrorException.class, ()->{
       DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider(
-          0, DiskChecker.DISK_IO_MAX_ITERATIONS));
+          0, DiskChecker.DISK_IO_MAX_ITERATIONS, false));
       checkDirs(false);
     });
   }
@@ -104,6 +103,18 @@ public void testDiskIoFileNaming() {
         "File name does not match expected pattern: " + guidFile);
   }
 
+  /**
+   * Verify DiskChecker doesn't fail on ENOSPC errors.
+   */
+  @Test
+  public void testDiskIoDetectsENOSPCWriteErrors() {
+    assertDoesNotThrow(()->{
+      DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider(
+          0, DiskChecker.DISK_IO_MAX_ITERATIONS, true));
+      checkDirs(true);
+    });
+  }
+
   /**
    * A dummy {@link DiskChecker#FileIoProvider} that can throw a programmable
    * number of times.
@@ -114,11 +125,13 @@ private static class TestFileIoProvider implements FileIoProvider {
 
     private final int numTimesToThrowOnCreate;
     private final int numTimesToThrowOnWrite;
+    private final boolean throwENOSPCError;
 
     public TestFileIoProvider(
-        int numTimesToThrowOnCreate, int numTimesToThrowOnWrite) {
+        int numTimesToThrowOnCreate, int numTimesToThrowOnWrite, boolean throwENOSPCError) {
       this.numTimesToThrowOnCreate = numTimesToThrowOnCreate;
       this.numTimesToThrowOnWrite = numTimesToThrowOnWrite;
+      this.throwENOSPCError = throwENOSPCError;
     }
 
     /**
@@ -139,7 +152,11 @@ public FileOutputStream get(File f) throws FileNotFoundException {
     @Override
     public void write(FileOutputStream fos, byte[] data) throws IOException {
       if (numWriteCalls.getAndIncrement() < numTimesToThrowOnWrite) {
-        throw new IOException("Dummy exception for testing");
+        if (!throwENOSPCError) {
+          throw new IOException("Dummy exception for testing");
+        } else {
+          throw new IOException("No space left on device");
+        }
       }
       fos.write(data);
     }

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java

Lines changed: 4 additions & 0 deletions
@@ -190,6 +190,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final long DFS_DN_CACHED_DFSUSED_CHECK_INTERVAL_DEFAULT_MS =
       600000;
 
+  public static final String DFS_DATANODE_CHECK_DIR_WITH_DISKIO =
+      "dfs.datanode.check.dir.with.diskio";
+  public static final boolean DFS_DATANODE_CHECK_DIR_WITH_DISKIO_DEFAULT = false;
+
   public static final String DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT =
       "dfs.namenode.path.based.cache.block.map.allocation.percent";
   public static final float DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT_DEFAULT = 0.25f;
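
The new key defaults to false, so existing deployments keep the metadata-only directory check. A minimal, illustrative way to flip it on programmatically (equivalent to setting dfs.datanode.check.dir.with.diskio=true in hdfs-site.xml); the small main class here is only an example, not part of the commit:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;

public class EnableDiskIoCheck {
  public static void main(String[] args) {
    // Enable the datanode directory check that performs real disk I/O.
    Configuration conf = new HdfsConfiguration();
    conf.setBoolean(DFSConfigKeys.DFS_DATANODE_CHECK_DIR_WITH_DISKIO, true);

    // Read it back the same way BlockPoolSlice does below.
    boolean enabled = conf.getBoolean(
        DFSConfigKeys.DFS_DATANODE_CHECK_DIR_WITH_DISKIO,
        DFSConfigKeys.DFS_DATANODE_CHECK_DIR_WITH_DISKIO_DEFAULT);
    System.out.println("check.dir.with.diskio enabled: " + enabled);
  }
}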

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java

Lines changed: 14 additions & 3 deletions
@@ -120,6 +120,7 @@ public class BlockPoolSlice {
   private final long cachedDfsUsedCheckTime;
   private final Timer timer;
   private final int maxDataLength;
+  private final boolean checkDirWithDiskIo;
   private final FileIoProvider fileIoProvider;
   private final Configuration config;
   private final File bpDir;
@@ -179,6 +180,10 @@ public int compare(File f1, File f2) {
         CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH,
         CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH_DEFAULT);
 
+    this.checkDirWithDiskIo = conf.getBoolean(
+        DFSConfigKeys.DFS_DATANODE_CHECK_DIR_WITH_DISKIO,
+        DFSConfigKeys.DFS_DATANODE_CHECK_DIR_WITH_DISKIO_DEFAULT);
+
     this.timer = timer;
 
     // Files that were being written when the datanode was last shutdown
@@ -484,9 +489,15 @@ ReplicaInfo activateSavedReplica(ReplicaInfo replicaInfo,
   }
 
   void checkDirs() throws DiskErrorException {
-    DiskChecker.checkDir(finalizedDir);
-    DiskChecker.checkDir(tmpDir);
-    DiskChecker.checkDir(rbwDir);
+    if (checkDirWithDiskIo) {
+      DiskChecker.checkDirWithDiskIo(finalizedDir);
+      DiskChecker.checkDirWithDiskIo(tmpDir);
+      DiskChecker.checkDirWithDiskIo(rbwDir);
+    } else {
+      DiskChecker.checkDir(finalizedDir);
+      DiskChecker.checkDir(tmpDir);
+      DiskChecker.checkDir(rbwDir);
+    }
   }
 
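
For reference, the two checks BlockPoolSlice now chooses between: DiskChecker.checkDir() validates that the directory exists and is accessible, while checkDirWithDiskIo() additionally exercises real disk I/O, which is where the ENOSPC tolerance added above matters. A small illustrative caller (assumes hadoop-common on the classpath; not part of the commit):

import java.io.File;

import org.apache.hadoop.util.DiskChecker;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;

public class CheckDirsExample {
  public static void main(String[] args) throws DiskErrorException {
    File dir = new File(System.getProperty("java.io.tmpdir"));
    DiskChecker.checkDir(dir);            // metadata-only check (previous behaviour)
    DiskChecker.checkDirWithDiskIo(dir);  // same check plus a probe-file write
    System.out.println("both checks passed for " + dir);
  }
}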
