Skip to content

Commit 4ddbf5d

Browse files
rmdmattingly authored and
Hernan Gelaf-Romer committed
HBase Backport: Modern backup failures can cause backup system to lock up
1 parent 2d42f1a commit 4ddbf5d

File tree

16 files changed

+375
-6
lines changed

16 files changed

+375
-6
lines changed

hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/impl/BackupSystemTable.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1403,9 +1403,7 @@ public static void restoreFromSnapshot(Connection conn) throws IOException {
14031403
try (Admin admin = conn.getAdmin()) {
14041404
String snapshotName = BackupSystemTable.getSnapshotName(conf);
14051405
if (snapshotExists(admin, snapshotName)) {
1406-
admin.disableTable(BackupSystemTable.getTableName(conf));
1407-
admin.restoreSnapshot(snapshotName);
1408-
admin.enableTable(BackupSystemTable.getTableName(conf));
1406+
admin.restoreBackupSystemTable(snapshotName);
14091407
LOG.debug("Done restoring backup system table");
14101408
} else {
14111409
// Snapshot does not exist, i.e. completeBackup failed after
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.backup.master;
19+
20+
import static org.junit.Assert.assertEquals;
21+
22+
import java.util.HashMap;
23+
import java.util.HashSet;
24+
import java.util.Map;
25+
import java.util.Set;
26+
import org.apache.hadoop.hbase.HBaseTestingUtility;
27+
import org.apache.hadoop.hbase.TableName;
28+
import org.apache.hadoop.hbase.backup.impl.BackupSystemTable;
29+
import org.apache.hadoop.hbase.client.Admin;
30+
import org.apache.hadoop.hbase.testclassification.MasterTests;
31+
import org.apache.hadoop.hbase.testclassification.MediumTests;
32+
import org.junit.AfterClass;
33+
import org.junit.BeforeClass;
34+
import org.junit.Test;
35+
import org.junit.experimental.categories.Category;
36+
37+
@Category({ MasterTests.class, MediumTests.class })
38+
public class TestRestoreBackupSystemTable {
39+
private static final String BACKUP_ROOT = "root";
40+
private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
41+
42+
@BeforeClass
43+
public static void setUp() throws Exception {
44+
UTIL.startMiniCluster();
45+
}
46+
47+
@Test
48+
public void itRestoresFromSnapshot() throws Exception {
49+
BackupSystemTable table = new BackupSystemTable(UTIL.getConnection());
50+
Set<TableName> tables = new HashSet<>();
51+
52+
tables.add(TableName.valueOf("test1"));
53+
tables.add(TableName.valueOf("test2"));
54+
tables.add(TableName.valueOf("test3"));
55+
56+
Map<String, Long> rsTimestampMap = new HashMap<>();
57+
rsTimestampMap.put("rs1:100", 100L);
58+
rsTimestampMap.put("rs2:100", 101L);
59+
rsTimestampMap.put("rs3:100", 103L);
60+
61+
table.writeRegionServerLogTimestamp(tables, rsTimestampMap, BACKUP_ROOT);
62+
BackupSystemTable.snapshot(UTIL.getConnection());
63+
64+
Admin admin = UTIL.getAdmin();
65+
TableName backupSystemTn = BackupSystemTable.getTableName(UTIL.getConfiguration());
66+
admin.disableTable(backupSystemTn);
67+
admin.truncateTable(backupSystemTn, true);
68+
69+
BackupSystemTable.restoreFromSnapshot(UTIL.getConnection());
70+
Map<TableName, Map<String, Long>> results = table.readLogTimestampMap(BACKUP_ROOT);
71+
72+
assertEquals(results.size(), tables.size());
73+
74+
for (TableName tableName : tables) {
75+
Map<String, Long> resultMap = results.get(tableName);
76+
assertEquals(resultMap, rsTimestampMap);
77+
}
78+
}
79+
80+
@AfterClass
81+
public static void tearDown() throws Exception {
82+
UTIL.shutdownMiniCluster();
83+
}
84+
}

hbase-client/src/main/java/org/apache/hadoop/hbase/client/Admin.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
package org.apache.hadoop.hbase.client;
1919

2020
import static org.apache.hadoop.hbase.util.FutureUtils.get;
21-
2221
import java.io.Closeable;
2322
import java.io.IOException;
2423
import java.util.Collection;
@@ -68,7 +67,6 @@
6867
import org.apache.hadoop.hbase.util.Bytes;
6968
import org.apache.hadoop.hbase.util.Pair;
7069
import org.apache.yetus.audience.InterfaceAudience;
71-
7270
import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableList;
7371

7472
/**
@@ -3356,4 +3354,15 @@ List<LogEntry> getLogEntries(Set<ServerName> serverNames, String logType, Server
33563354
* Get the list of cached files
33573355
*/
33583356
List<String> getCachedFilesList(ServerName serverName) throws IOException;
3357+
3358+
@InterfaceAudience.Private
3359+
default void restoreBackupSystemTable(String snapshotName) throws IOException {
3360+
SnapshotDescription snapshot = listSnapshots().stream()
3361+
.filter(s -> s.getName().equals(snapshotName)).findFirst().orElseThrow(() -> new
3362+
IOException("Snapshot " + snapshotName + " not found"));
3363+
TableName tn = snapshot.getTableName();
3364+
disableTable(tn);
3365+
restoreSnapshot(snapshotName);
3366+
enableTable(tn);
3367+
}
33593368
}

hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncAdmin.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1721,4 +1721,7 @@ CompletableFuture<List<LogEntry>> getLogEntries(Set<ServerName> serverNames, Str
17211721
* Get the list of cached files
17221722
*/
17231723
CompletableFuture<List<String>> getCachedFilesList(ServerName serverName);
1724+
1725+
/**
 * Restore the backup system table from the given snapshot.
 * @param snapshotName name of the snapshot to restore from
 * @return a {@link CompletableFuture} for the restore operation
 */
@InterfaceAudience.Private
1726+
CompletableFuture<Void> restoreBackupSystemTable(String snapshotName);
17241727
}

hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncHBaseAdmin.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -921,4 +921,9 @@ public CompletableFuture<Void> flushMasterStore() {
921921
public CompletableFuture<List<String>> getCachedFilesList(ServerName serverName) {
922922
return wrap(rawAdmin.getCachedFilesList(serverName));
923923
}
924+
925+
@Override
926+
public CompletableFuture<Void> restoreBackupSystemTable(String snapshotName) {
927+
return wrap(rawAdmin.restoreBackupSystemTable(snapshotName));
928+
}
924929
}

hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2082,6 +2082,14 @@ public FlushMasterStoreResponse flushMasterStore(RpcController controller,
20822082
return stub.flushMasterStore(controller, request);
20832083
}
20842084

2085+
@Override
2086+
public MasterProtos.RestoreBackupSystemTableResponse restoreBackupSystemTable(
2087+
RpcController rpcController,
2088+
MasterProtos.RestoreBackupSystemTableRequest restoreBackupSystemTableRequest)
2089+
throws ServiceException {
2090+
return stub.restoreBackupSystemTable(rpcController, restoreBackupSystemTableRequest);
2091+
}
2092+
20852093
@Override
20862094
public ReplicationPeerModificationSwitchResponse replicationPeerModificationSwitch(
20872095
RpcController controller, ReplicationPeerModificationSwitchRequest request)

hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseAdmin.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2095,6 +2095,22 @@ public List<String> getCachedFilesList(ServerName serverName) throws IOException
20952095
this.connection.getAdmin(serverName));
20962096
}
20972097

2098+
@Override
2099+
public void restoreBackupSystemTable(String snapshotName) throws IOException {
2100+
long pid =
2101+
executeCallable(new MasterCallable<Long>(getConnection(), getRpcControllerFactory()) {
2102+
@Override
2103+
protected Long rpcCall() throws Exception {
2104+
return master.restoreBackupSystemTable(getRpcController(),
2105+
MasterProtos.RestoreBackupSystemTableRequest.newBuilder().setSnapshotName(snapshotName)
2106+
.build())
2107+
.getProcId();
2108+
}
2109+
});
2110+
ProcedureFuture<Void> future = new ProcedureFuture<>(this, pid);
2111+
get(future, getProcedureTimeout, TimeUnit.MILLISECONDS);
2112+
}
2113+
20982114
private MasterCallable<MasterProtos.TruncateRegionResponse>
20992115
getTruncateRegionCallable(TableName tableName, RegionInfo hri) {
21002116
return new MasterCallable<MasterProtos.TruncateRegionResponse>(getConnection(),

hbase-client/src/main/java/org/apache/hadoop/hbase/client/RawAsyncHBaseAdmin.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2716,6 +2716,19 @@ void onError(Throwable error) {
27162716
}
27172717
}
27182718

2719+
private static class RestoreBackupSystemTableProcedureBiConsumer extends ProcedureBiConsumer {
2720+
2721+
@Override
2722+
void onFinished() {
2723+
LOG.info("RestoreBackupSystemTableProcedure completed");
2724+
}
2725+
2726+
@Override
2727+
void onError(Throwable error) {
2728+
LOG.info("RestoreBackupSystemTableProcedure failed with {}", error.getMessage());
2729+
}
2730+
}
2731+
27192732
private static class CreateTableProcedureBiConsumer extends TableProcedureBiConsumer {
27202733

27212734
CreateTableProcedureBiConsumer(TableName tableName) {
@@ -4299,4 +4312,16 @@ List<String>> adminCall(controller, stub, request.build(),
42994312
resp -> resp.getCachedFilesList()))
43004313
.serverName(serverName).call();
43014314
}
4315+
4316+
@Override
4317+
public CompletableFuture<Void> restoreBackupSystemTable(String snapshotName) {
4318+
MasterProtos.RestoreBackupSystemTableRequest request =
4319+
MasterProtos.RestoreBackupSystemTableRequest.newBuilder().setSnapshotName(snapshotName)
4320+
.build();
4321+
return this.<MasterProtos.RestoreBackupSystemTableRequest,
4322+
MasterProtos.RestoreBackupSystemTableResponse> procedureCall(request,
4323+
MasterService.Interface::restoreBackupSystemTable,
4324+
MasterProtos.RestoreBackupSystemTableResponse::getProcId,
4325+
new RestoreBackupSystemTableProcedureBiConsumer());
4326+
}
43024327
}

hbase-client/src/main/java/org/apache/hadoop/hbase/client/ShortCircuitMasterConnection.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -796,6 +796,13 @@ public FlushMasterStoreResponse flushMasterStore(RpcController controller,
796796
return stub.flushMasterStore(controller, request);
797797
}
798798

799+
@Override
800+
public MasterProtos.RestoreBackupSystemTableResponse restoreBackupSystemTable(
801+
RpcController controller, MasterProtos.RestoreBackupSystemTableRequest request)
802+
throws ServiceException {
803+
return stub.restoreBackupSystemTable(controller, request);
804+
}
805+
799806
@Override
800807
public ReplicationPeerModificationSwitchResponse replicationPeerModificationSwitch(
801808
RpcController controller, ReplicationPeerModificationSwitchRequest request)

hbase-protocol-shaded/src/main/protobuf/Master.proto

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,6 +1228,9 @@ service MasterService {
12281228

12291229
rpc FlushMasterStore(FlushMasterStoreRequest)
12301230
returns(FlushMasterStoreResponse);
1231+
1232+
/** Restore the backup system table from the snapshot named in the request. */
rpc RestoreBackupSystemTable(RestoreBackupSystemTableRequest)
1233+
returns(RestoreBackupSystemTableResponse);
12311234
}
12321235

12331236
// HBCK Service definitions.
@@ -1313,6 +1316,13 @@ message FixMetaRequest {}
13131316

13141317
message FixMetaResponse {}
13151318

1319+
// Request of the MasterService RestoreBackupSystemTable rpc.
message RestoreBackupSystemTableRequest {
1320+
// Name of the snapshot to restore the backup system table from.
required string snapshot_name = 1;
1321+
}
1322+
// Response of the MasterService RestoreBackupSystemTable rpc.
message RestoreBackupSystemTableResponse {
1323+
// Procedure id that clients use to track the restore.
optional uint64 proc_id = 1;
1324+
}
1325+
13161326
service HbckService {
13171327
/** Update state of the table in meta only*/
13181328
rpc SetTableStateInMeta(SetTableStateInMetaRequest)

0 commit comments

Comments
 (0)