From e726210ae0f9897ff30701bff624ac9594e886a1 Mon Sep 17 00:00:00 2001 From: Berenguer Blasi Date: Fri, 29 Aug 2025 10:28:28 +0200 Subject: [PATCH 1/4] HCD-181 AssertionError from CassandraStreamReader --- .../apache/cassandra/db/streaming/CassandraStreamReader.java | 2 +- .../org/apache/cassandra/repair/consistent/LocalSessions.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReader.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReader.java index ad5e4cb6b9a9..3a8a9b363f79 100644 --- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReader.java +++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReader.java @@ -86,7 +86,7 @@ public CassandraStreamReader(StreamMessageHeader header, CassandraStreamHeader s { // we should only ever be streaming pending repair // sstables if the session has a pending repair id - assert session.getPendingRepair().equals(header.pendingRepair); + assert session.getPendingRepair().equals(header.pendingRepair) : session.getPendingRepair() + " != " + header.pendingRepair; } this.session = session; this.tableId = header.tableId; diff --git a/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java b/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java index 082dc37f64a4..11611cb84466 100644 --- a/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java +++ b/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java @@ -791,8 +791,8 @@ void setStateAndSave(LocalSession session, ConsistentSession.State state) synchronized (session) { Preconditions.checkArgument(session.getState().canTransitionTo(state), - "Invalid state transition %s -> %s", - session.getState(), state); + "Invalid state transition %s -> %s for session %s", + session.getState(), state, session.sessionID); logger.trace("Changing LocalSession state from {} -> {} for {}", session.getState(), state, session.sessionID); boolean wasCompleted = session.isCompleted(); session.setState(state); From 88a1b57d0f61231b3f7dd8d6073417e1f4b7a84a Mon Sep 17 00:00:00 2001 From: Berenguer Blasi Date: Wed, 3 Sep 2025 14:16:33 +0200 Subject: [PATCH 2/4] Enhanced diags --- .../apache/cassandra/repair/RepairMessageVerbHandler.java | 6 +++++- .../cassandra/repair/consistent/CoordinatorSession.java | 3 ++- .../apache/cassandra/repair/consistent/LocalSessions.java | 6 ++++-- .../org/apache/cassandra/service/ActiveRepairService.java | 3 +++ 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java index 8a16089c3cf7..8d532096948b 100644 --- a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java +++ b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java @@ -223,9 +223,13 @@ else if (message.verb() == STATUS_RSP) } catch (Exception e) { - logger.error("Got error processing {}, removing parent repair session", message.verb()); if (desc != null && desc.parentSessionId != null) + { + logger.error("Got error processing {}, removing parent repair session {}", message.verb(), desc.parentSessionId); ActiveRepairService.instance.removeParentRepairSession(desc.parentSessionId); + } + else + logger.error("Got error processing {}, removing parent repair session", message.verb()); throw new RuntimeException(e); } } diff --git a/src/java/org/apache/cassandra/repair/consistent/CoordinatorSession.java b/src/java/org/apache/cassandra/repair/consistent/CoordinatorSession.java index 5ddac3f74535..3b7971fdd916 100644 --- a/src/java/org/apache/cassandra/repair/consistent/CoordinatorSession.java +++ b/src/java/org/apache/cassandra/repair/consistent/CoordinatorSession.java @@ -37,6 +37,7 @@ import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.MoreExecutors; import com.google.common.util.concurrent.SettableFuture; +import org.apache.commons.lang3.exception.ExceptionUtils; import org.apache.commons.lang3.time.DurationFormatUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -274,7 +275,7 @@ public synchronized void fail() logger.error("Can't transition endpoints {} to FAILED", cantFail, new RuntimeException()); return; } - logger.info("Incremental repair session {} failed", sessionID); + logger.info("Incremental repair session {} failed \n {}", sessionID, ExceptionUtils.getStackTrace(new Exception())); sendFailureMessageToParticipants(); setAll(State.FAILED); diff --git a/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java b/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java index 11611cb84466..18ef44e8e2b6 100644 --- a/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java +++ b/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java @@ -55,6 +55,8 @@ import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.MoreExecutors; +import org.apache.commons.lang3.exception.ExceptionUtils; + import org.apache.cassandra.cql3.PageSize; import org.apache.cassandra.db.compaction.AbstractCompactionTask; import org.apache.cassandra.db.compaction.CleanupTask; @@ -417,7 +419,7 @@ public boolean canCleanup(UUID sessionID) */ public void cancelSession(UUID sessionID, boolean force) { - logger.info("Cancelling local repair session {}", sessionID); + logger.info("Cancelling local repair session {} \n {}", sessionID, ExceptionUtils.getStackTrace(new Exception())); LocalSession session = getSession(sessionID); Preconditions.checkArgument(session != null, "Session {} does not exist", sessionID); Preconditions.checkArgument(force || session.coordinator.equals(getBroadcastAddressAndPort()), @@ -831,7 +833,7 @@ public void failSession(LocalSession session, boolean sendMessage) } else if (session.getState() != FAILED) { - logger.info("Failing local repair session {}", session.sessionID); + logger.info("Failing local repair session {} \n {}", session.sessionID, ExceptionUtils.getStackTrace(new Exception())); setStateAndSave(session, FAILED); } } diff --git a/src/java/org/apache/cassandra/service/ActiveRepairService.java b/src/java/org/apache/cassandra/service/ActiveRepairService.java index 22925958e30d..2f48359501d0 100644 --- a/src/java/org/apache/cassandra/service/ActiveRepairService.java +++ b/src/java/org/apache/cassandra/service/ActiveRepairService.java @@ -37,6 +37,8 @@ import com.google.common.util.concurrent.ListeningExecutorService; import com.google.common.util.concurrent.MoreExecutors; +import org.apache.commons.lang3.exception.ExceptionUtils; + import org.apache.cassandra.concurrent.DebuggableThreadPoolExecutor; import org.apache.cassandra.db.compaction.CompactionManager; import org.apache.cassandra.locator.EndpointsByRange; @@ -413,6 +415,7 @@ public void run() public synchronized void terminateSessions() { + logger.info("Terminating repair sessions \n {}", ExceptionUtils.getStackTrace(new Exception())); Throwable cause = new IOException("Terminate session is called"); for (RepairSession session : sessions.values()) { From a7f77fcd9d2402382164b62d8b709dea02cd747d Mon Sep 17 00:00:00 2001 From: Berenguer Blasi Date: Wed, 8 Oct 2025 13:56:03 +0200 Subject: [PATCH 3/4] HCD-200 tracking in-flight compactions debug --- src/java/org/apache/cassandra/db/lifecycle/View.java | 4 ++++ .../cassandra/repair/consistent/CoordinatorSession.java | 2 +- .../org/apache/cassandra/repair/consistent/LocalSessions.java | 4 ++-- .../org/apache/cassandra/service/ActiveRepairService.java | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/cassandra/db/lifecycle/View.java b/src/java/org/apache/cassandra/db/lifecycle/View.java index c41703def1bf..4a0916f1a9be 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/View.java +++ b/src/java/org/apache/cassandra/db/lifecycle/View.java @@ -27,6 +27,7 @@ import com.google.common.base.Predicate; import com.google.common.collect.*; +import org.apache.commons.lang3.exception.ExceptionUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -105,6 +106,9 @@ public class View this.sstablesByFilename = Maps.newHashMapWithExpectedSize(sstables.size()); for (SSTableReader sstable : this.sstables) this.sstablesByFilename.put(sstable.getDataFile().name(), sstable); + + if (!this.compacting.isEmpty()) + logger.debug("New View {} at:\n{}", this, ExceptionUtils.getStackTrace(new Exception("HCD-200 debug - Ignore me"))); } public Memtable getCurrentMemtable() diff --git a/src/java/org/apache/cassandra/repair/consistent/CoordinatorSession.java b/src/java/org/apache/cassandra/repair/consistent/CoordinatorSession.java index 3b7971fdd916..67040acbbf32 100644 --- a/src/java/org/apache/cassandra/repair/consistent/CoordinatorSession.java +++ b/src/java/org/apache/cassandra/repair/consistent/CoordinatorSession.java @@ -275,7 +275,7 @@ public synchronized void fail() logger.error("Can't transition endpoints {} to FAILED", cantFail, new RuntimeException()); return; } - logger.info("Incremental repair session {} failed \n {}", sessionID, ExceptionUtils.getStackTrace(new Exception())); + logger.info("Incremental repair session {} failed \n {}", sessionID, ExceptionUtils.getStackTrace(new Exception("HCD-181 debug - Ignore me"))); sendFailureMessageToParticipants(); setAll(State.FAILED); diff --git a/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java b/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java index 18ef44e8e2b6..1c6f1a5dcca5 100644 --- a/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java +++ b/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java @@ -419,7 +419,7 @@ public boolean canCleanup(UUID sessionID) */ public void cancelSession(UUID sessionID, boolean force) { - logger.info("Cancelling local repair session {} \n {}", sessionID, ExceptionUtils.getStackTrace(new Exception())); + logger.info("Cancelling local repair session {} \n {}", sessionID, ExceptionUtils.getStackTrace(new Exception("HCD-181 debug - Ignore me"))); LocalSession session = getSession(sessionID); Preconditions.checkArgument(session != null, "Session {} does not exist", sessionID); Preconditions.checkArgument(force || session.coordinator.equals(getBroadcastAddressAndPort()), @@ -833,7 +833,7 @@ public void failSession(LocalSession session, boolean sendMessage) } else if (session.getState() != FAILED) { - logger.info("Failing local repair session {} \n {}", session.sessionID, ExceptionUtils.getStackTrace(new Exception())); + logger.info("Failing local repair session {} \n {}", session.sessionID, ExceptionUtils.getStackTrace(new Exception("HCD-181 debug - Ignore me"))); setStateAndSave(session, FAILED); } } diff --git a/src/java/org/apache/cassandra/service/ActiveRepairService.java b/src/java/org/apache/cassandra/service/ActiveRepairService.java index 2f48359501d0..dce0b7e7a562 100644 --- a/src/java/org/apache/cassandra/service/ActiveRepairService.java +++ b/src/java/org/apache/cassandra/service/ActiveRepairService.java @@ -415,7 +415,7 @@ public void run() public synchronized void terminateSessions() { - logger.info("Terminating repair sessions \n {}", ExceptionUtils.getStackTrace(new Exception())); + logger.info("Terminating repair sessions \n {}", ExceptionUtils.getStackTrace(new Exception("HCD-181 debug - Ignore me"))); Throwable cause = new IOException("Terminate session is called"); for (RepairSession session : sessions.values()) { From 9f5c223ac1cd96264c308d3e05e8fd3750ed3ef7 Mon Sep 17 00:00:00 2001 From: Berenguer Blasi Date: Wed, 8 Oct 2025 14:54:52 +0200 Subject: [PATCH 4/4] HCD-200 extended diag --- src/java/org/apache/cassandra/db/lifecycle/View.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/java/org/apache/cassandra/db/lifecycle/View.java b/src/java/org/apache/cassandra/db/lifecycle/View.java index 4a0916f1a9be..68b6530accdc 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/View.java +++ b/src/java/org/apache/cassandra/db/lifecycle/View.java @@ -282,6 +282,9 @@ public static Function> selectLive(AbstractBounds< @VisibleForTesting public static Function updateCompacting(final Set unmark, final Iterable mark) { + if (unmark != null && !unmark.isEmpty()) + logger.debug("HCD-200 updateCompacting unmark {} at\n{}", unmark, ExceptionUtils.getStackTrace(new Exception("HCD-200 debug - Ignore me" ))); + if (unmark.isEmpty() && Iterables.isEmpty(mark)) return Functions.identity(); return new Function()