
Commit 77828a7

HDFS-17815. Fix upload fsimage failure when checkpoint takes a long time (#7845). Contributed by caozhiqiang.
Reviewed-by: Tao Li <[email protected]>
Signed-off-by: Ayush Saxena <[email protected]>
Signed-off-by: He Xiaoqiao <[email protected]>
1 parent 072b2d6 commit 77828a7

File tree (4 files changed, +63 -4 lines):

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

Lines changed: 5 additions & 0 deletions
@@ -7174,6 +7174,11 @@ public synchronized void verifyToken(DelegationTokenIdentifier identifier,
   public EditLogTailer getEditLogTailer() {
     return editLogTailer;
   }
+
+  @VisibleForTesting
+  public long getStandbyLastCheckpointTime() {
+    return standbyCheckpointer.getLastCheckpointTime();
+  }
 
   @VisibleForTesting
   public void setEditLogTailerForTests(EditLogTailer tailer) {

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java

Lines changed: 2 additions & 1 deletion
@@ -472,6 +472,7 @@ void writeTransactionIdFile(StorageDirectory sd, long txid)
    * @param time time of the last checkpoint, in millis since the epoch
    */
   void setMostRecentCheckpointInfo(long txid, long time) {
+    LOG.info("setMostRecentCheckpointInfo txid is {}, time is {}", txid, time);
     this.mostRecentCheckpointTxId = txid;
     this.mostRecentCheckpointTime = time;
   }
@@ -486,7 +487,7 @@ public long getMostRecentCheckpointTxId() {
   /**
    * @return the time of the most recent checkpoint in millis since the epoch.
    */
-  long getMostRecentCheckpointTime() {
+  public long getMostRecentCheckpointTime() {
     return mostRecentCheckpointTime;
   }

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java

Lines changed: 10 additions & 3 deletions
@@ -375,6 +375,11 @@ static int getCanceledCount() {
     return canceledCount;
   }
 
+  @VisibleForTesting
+  public long getLastCheckpointTime() {
+    return lastCheckpointTime;
+  }
+
   private long countUncheckpointedTxns() {
     FSImage img = namesystem.getFSImage();
     return img.getCorrectLastAppliedOrWrittenTxId() -
@@ -461,7 +466,8 @@ private void doWork() {
       } else if (secsSinceLast >= checkpointConf.getPeriod()) {
         LOG.info("Triggering checkpoint because it has been {} seconds " +
             "since the last checkpoint, which exceeds the configured " +
-            "interval {}", secsSinceLast, checkpointConf.getPeriod());
+            "interval {}, And now is {}, lastCheckpointTime is {}.",
+            secsSinceLast, checkpointConf.getPeriod(), now, lastCheckpointTime);
         needCheckpoint = true;
       }
 
@@ -487,8 +493,9 @@ private void doWork() {
           namesystem.setCreatedRollbackImages(true);
           namesystem.setNeedRollbackFsImage(false);
         }
-        lastCheckpointTime = now;
-        LOG.info("Checkpoint finished successfully.");
+        lastCheckpointTime = monotonicNow();
+        LOG.info("Checkpoint finished successfully, the lastCheckpointTime is:{}.",
+            lastCheckpointTime);
       }
     } catch (SaveNamespaceCancelledException ce) {
       LOG.info("Checkpoint was cancelled: {}", ce.getMessage());

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java

Lines changed: 46 additions & 0 deletions
@@ -649,6 +649,52 @@ public void testCheckpointSucceedsWithLegacyOIVException() throws Exception {
     HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(12));
   }
 
+  /**
+   * Test that lastCheckpointTime is correctly updated at each checkpoint.
+   */
+  @Test(timeout = 300000)
+  public void testLastCheckpointTime() throws Exception {
+    for (int i = 1; i < NUM_NNS; i++) {
+      cluster.shutdownNameNode(i);
+
+      // Make true checkpoint for DFS_NAMENODE_CHECKPOINT_PERIOD_KEY
+      cluster.getConfiguration(i).setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 3);
+      cluster.getConfiguration(i).setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 1000);
+    }
+    doEdits(0, 10);
+    cluster.transitionToStandby(0);
+
+    // Standby NNs do checkpoint without active NN available.
+    for (int i = 1; i < NUM_NNS; i++) {
+      cluster.restartNameNode(i, false);
+    }
+    cluster.waitClusterUp();
+    setNNs();
+
+    for (int i = 0; i < NUM_NNS; i++) {
+      // Once the standby catches up, it should do a checkpoint
+      // and save to local directories.
+      HATestUtil.waitForCheckpoint(cluster, i, ImmutableList.of(12));
+    }
+
+    long snnCheckpointTime1 = nns[1].getNamesystem().getStandbyLastCheckpointTime();
+    long annCheckpointTime1 = nns[0].getNamesystem().getLastCheckpointTime();
+    cluster.transitionToActive(0);
+    cluster.transitionToObserver(2);
+
+    doEdits(11, 20);
+    nns[0].getRpcServer().rollEditLog();
+    HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(23));
+
+    long snnCheckpointTime2 = nns[1].getNamesystem().getStandbyLastCheckpointTime();
+    long annCheckpointTime2 = nns[0].getNamesystem().getLastCheckpointTime();
+
+    // Make sure that both standby and active NNs' lastCheckpointTime intervals are larger
+    // than 3 DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY.
+    assertTrue(snnCheckpointTime2 - snnCheckpointTime1 >= 3000
+        && annCheckpointTime2 - annCheckpointTime1 >= 3000);
+  }
+
   private void doEdits(int start, int stop) throws IOException {
     for (int i = start; i < stop; i++) {
       Path p = new Path("/test" + i);
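The new case should be runnable on its own with the standard Maven Surefire single-test selector, e.g. mvn test -Dtest=TestStandbyCheckpoints#testLastCheckpointTime from the hadoop-hdfs-project/hadoop-hdfs module shown in the file tree above (exact profiles and options depend on the local build setup).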
