Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 27 additions & 11 deletions lib/checker/checkercomponent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ void CheckerComponent::OnConfigLoaded()
Checkable::OnNextCheckChanged.connect([this](const Checkable::Ptr& checkable, const Value&) {
NextCheckChangedHandler(checkable);
});
Checkable::OnRescheduleCheck.connect([this](const Checkable::Ptr& checkable, double nextCheck) {
NextCheckChangedHandler(checkable, nextCheck);
});
}

void CheckerComponent::Start(bool runtimeCreated)
Expand All @@ -68,7 +71,7 @@ void CheckerComponent::Start(bool runtimeCreated)
m_Thread = std::thread([this]() { CheckThreadProc(); });

m_ResultTimer = Timer::Create();
m_ResultTimer->SetInterval(5);
m_ResultTimer->SetInterval(m_ResultTimerInterval);
m_ResultTimer->OnTimerExpired.connect([this](const Timer * const&) { ResultTimerHandler(); });
m_ResultTimer->Start();
}
Expand Down Expand Up @@ -135,7 +138,6 @@ void CheckerComponent::CheckThreadProc()

bool forced = checkable->GetForceNextCheck();
bool check = true;
bool notifyNextCheck = false;
double nextCheck = -1;

if (!forced) {
Expand All @@ -144,7 +146,6 @@ void CheckerComponent::CheckThreadProc()
<< "Skipping check for object '" << checkable->GetName() << "': Dependency failed.";

check = false;
notifyNextCheck = true;
}

Host::Ptr host;
Expand Down Expand Up @@ -181,7 +182,6 @@ void CheckerComponent::CheckThreadProc()
<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck);

check = false;
notifyNextCheck = true;
}
}
}
Expand All @@ -200,11 +200,6 @@ void CheckerComponent::CheckThreadProc()
checkable->UpdateNextCheck();
}

if (notifyNextCheck) {
// Trigger update event for Icinga DB
Checkable::OnNextCheckUpdated(checkable);
}

lock.lock();

continue;
Expand Down Expand Up @@ -341,7 +336,7 @@ CheckableScheduleInfo CheckerComponent::GetCheckableScheduleInfo(const Checkable
return csi;
}

void CheckerComponent::NextCheckChangedHandler(const Checkable::Ptr& checkable)
void CheckerComponent::NextCheckChangedHandler(const Checkable::Ptr& checkable, double nextCheck)
{
std::unique_lock<std::mutex> lock(m_Mutex);

Expand All @@ -356,7 +351,13 @@ void CheckerComponent::NextCheckChangedHandler(const Checkable::Ptr& checkable)

idx.erase(checkable);

CheckableScheduleInfo csi = GetCheckableScheduleInfo(checkable);
CheckableScheduleInfo csi;
if (nextCheck < 0) {
csi = GetCheckableScheduleInfo(checkable);
} else {
csi.NextCheck = nextCheck;
csi.Object = checkable;
}
idx.insert(csi);

m_CV.notify_all();
Expand All @@ -375,3 +376,18 @@ unsigned long CheckerComponent::GetPendingCheckables()

return m_PendingCheckables.size();
}

/**
* Sets the interval in seconds for the result timer.
*
* The result timer periodically logs the number of pending and idle checkables
* as well as the checks per second rate. The default interval is 5 seconds.
*
* Note, this method must be called before the component is started to have an effect on the timer.
*
* @param interval Interval in seconds for the result timer.
*/
void CheckerComponent::SetResultTimerInterval(double interval)
{
m_ResultTimerInterval = interval;
}
38 changes: 21 additions & 17 deletions lib/checker/checkercomponent.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,25 @@ struct CheckableScheduleInfo
{
Checkable::Ptr Object;
double NextCheck;
};

/**
* @ingroup checker
*/
struct CheckableNextCheckExtractor
{
typedef double result_type;

/**
* @threadsafety Always.
* Get the index value for ordering in the multi-index container.
*
* This function returns a very large value for checkables that have a running check, effectively pushing
* them to the end of the ordering. This ensures that checkables with running checks are not prioritized
* for scheduling ahead of others. Rescheduling of such checkables is unnecessary because the checkable
* is going to reject this anyway if it notices that a check is already running, so avoiding unnecessary
* CPU load. Once the running check is finished, the checkable will be re-inserted into the set with its
* actual next check time as the index value.
*
* @return The index value for ordering in the multi-index container.
*/
double operator()(const CheckableScheduleInfo& csi)
double Index() const
{
return csi.NextCheck;
if (Object->HasRunningCheck()) {
return std::numeric_limits<double>::max();
}
return NextCheck;
}
};

Expand All @@ -57,7 +61,7 @@ class CheckerComponent final : public ObjectImpl<CheckerComponent>
CheckableScheduleInfo,
boost::multi_index::indexed_by<
boost::multi_index::ordered_unique<boost::multi_index::member<CheckableScheduleInfo, Checkable::Ptr, &CheckableScheduleInfo::Object> >,
boost::multi_index::ordered_non_unique<CheckableNextCheckExtractor>
boost::multi_index::ordered_non_unique<boost::multi_index::const_mem_fun<CheckableScheduleInfo, double, &CheckableScheduleInfo::Index>>
>
> CheckableSet;

Expand All @@ -69,12 +73,16 @@ class CheckerComponent final : public ObjectImpl<CheckerComponent>
unsigned long GetIdleCheckables();
unsigned long GetPendingCheckables();

void SetResultTimerInterval(double interval);

private:
std::mutex m_Mutex;
std::condition_variable m_CV;
bool m_Stopped{false};
std::thread m_Thread;

double m_ResultTimerInterval{5.0}; // Interval in seconds for the result timer.

CheckableSet m_IdleCheckables;
CheckableSet m_PendingCheckables;

Expand All @@ -86,12 +94,8 @@ class CheckerComponent final : public ObjectImpl<CheckerComponent>

void ExecuteCheckHelper(const Checkable::Ptr& checkable);

void AdjustCheckTimer();

void ObjectHandler(const ConfigObject::Ptr& object);
void NextCheckChangedHandler(const Checkable::Ptr& checkable);

void RescheduleCheckTimer();
void NextCheckChangedHandler(const Checkable::Ptr& checkable, double nextCheck = -1);

static CheckableScheduleInfo GetCheckableScheduleInfo(const Checkable::Ptr& checkable);
};
Expand Down
2 changes: 1 addition & 1 deletion lib/db_ido/dbevents.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ void DbEvents::StaticInitialize()
DbEvents::RemoveAcknowledgement(checkable);
});

Checkable::OnNextCheckUpdated.connect([](const Checkable::Ptr& checkable) { NextCheckUpdatedHandler(checkable); });
Checkable::OnNextCheckChanged.connect([](const Checkable::Ptr& checkable, const Value&) { NextCheckUpdatedHandler(checkable); });
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems you missed to rename NextCheckUpdatedHandler.

Checkable::OnFlappingChanged.connect([](const Checkable::Ptr& checkable, const Value&) { FlappingChangedHandler(checkable); });
Checkable::OnNotificationSentToAllUsers.connect([](const Notification::Ptr& notification, const Checkable::Ptr& checkable,
const std::set<User::Ptr>&, const NotificationType&, const CheckResult::Ptr&, const String&, const String&,
Expand Down
3 changes: 0 additions & 3 deletions lib/icinga/apiactions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,6 @@ Dictionary::Ptr ApiActions::RescheduleCheck(const ConfigObject::Ptr& object,

checkable->SetNextCheck(nextCheck);

/* trigger update event for DB IDO */
Checkable::OnNextCheckUpdated(checkable);

return ApiActions::CreateResult(200, "Successfully rescheduled check for object '" + checkable->GetName() + "'.");
}

Expand Down
98 changes: 65 additions & 33 deletions lib/icinga/checkable-check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, co
boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, StateType, const MessageOrigin::Ptr&)> Checkable::OnStateChange;
boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, std::set<Checkable::Ptr>, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged;
boost::signals2::signal<void (const Checkable::Ptr&, NotificationType, const CheckResult::Ptr&, const String&, const String&, const MessageOrigin::Ptr&)> Checkable::OnNotificationsRequested;
boost::signals2::signal<void (const Checkable::Ptr&)> Checkable::OnNextCheckUpdated;
boost::signals2::signal<void (const Checkable::Ptr&, double)> Checkable::OnRescheduleCheck;

Atomic<uint_fast64_t> Checkable::CurrentConcurrentChecks (0);

Expand All @@ -45,12 +45,22 @@ void Checkable::SetSchedulingOffset(long offset)
m_SchedulingOffset = offset;
}

long Checkable::GetSchedulingOffset()
long Checkable::GetSchedulingOffset() const
{
return m_SchedulingOffset;
}

void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin)
/**
* Update the next check time of this checkable based on its check interval and last check time.
*
* If onlyReschedule is true, the next check time is not actually updated, but the @c Checkable::OnRescheduleCheck
* signal is emitted with the new calculated next check time. Otherwise, the next check time is updated
* and the @c Checkable::OnNextCheckChanged signal is emitted accordingly.
*
* @param origin The origin of the message triggering this update, can be nullptr.
* @param onlyReschedule If true, only emit @c OnRescheduleCheck without updating the next check time.
*/
void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin, bool onlyReschedule)
{
double interval;

Expand Down Expand Up @@ -78,14 +88,26 @@ void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin)
<< " (" << lastCheck << ") to next check time at "
<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ").";

SetNextCheck(nextCheck, false, origin);
if (onlyReschedule) {
// Someone requested to only reschedule the next check without actually changing it.
// So, just tell the checker about this new timestamp and return.
OnRescheduleCheck(this, nextCheck);
} else {
// Otherwise, set the next check to the newly calculated timestamp and inform all its listeners.
SetNextCheck(nextCheck, false, origin);
}
}

bool Checkable::HasBeenChecked() const
{
return GetLastCheckResult() != nullptr;
}

bool Checkable::HasRunningCheck() const
{
return m_CheckRunning;
}

double Checkable::GetLastCheck() const
{
CheckResult::Ptr cr = GetLastCheckResult();
Expand All @@ -105,7 +127,7 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr
VERIFY(producer);

ObjectLock olock(this);
m_CheckRunning = false;
m_CheckRunning.store(false);

double now = Utility::GetTime();

Expand Down Expand Up @@ -513,12 +535,15 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr
if (recovery) {
for (auto& child : children) {
if (child->GetProblem() && child->GetEnableActiveChecks()) {
auto nextCheck (now + Utility::Random() % 60);

ObjectLock oLock (child);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure the lock is no longer needed? Below you still have a get and kind of a set operation.


if (nextCheck < child->GetNextCheck()) {
child->SetNextCheck(nextCheck);
if (auto nextCheck (now + Utility::Random() % 60); nextCheck < child->GetNextCheck()) {
/**
* We only want to enforce the checker to pick this up sooner, and no need to actually change
* the timesatmp. Plus, no other listeners should be informed about this other than the checker,
* so we emit the OnRescheduleCheck signal directly. In case our checker isn't responsible for
* this child object, we've already broadcasted the `CheckResult` event which will cause on the
* responsible node to enter this exact branch and do the rescheduling for its own checker.
*/
OnRescheduleCheck(child, nextCheck);
}
}
}
Expand All @@ -534,8 +559,8 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr
continue;

if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) {
ObjectLock olock(parent);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here. While on it, I guess it makes sense for the lock to cover both operations, get and set. Just as in the child case above.

parent->SetNextCheck(now);
// See comment above for children. We want to just enforce an immediate check by our checker.
OnRescheduleCheck(parent, now);
}
}
}
Expand All @@ -561,29 +586,21 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer)
{
CONTEXT("Executing check for object '" << GetName() << "'");

/* don't run another check if there is one pending */
if (m_CheckRunning.exchange(true))
return; // Should never happen as the checker already takes care of this.

/* keep track of scheduling info in case the check type doesn't provide its own information */
double scheduled_start = GetNextCheck();
double before_check = Utility::GetTime();

SetLastCheckStarted(Utility::GetTime());

/* This calls SetNextCheck() which updates the CheckerComponent's idle/pending
* queues and ensures that checks are not fired multiple times. ProcessCheckResult()
* is called too late. See #6421.
*/
UpdateNextCheck();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this no longer necessary, especially in the local check case? Sure, now you return early if !!m_CheckRunning. But where is this checkable re-indexed inside the scheduler queue?


bool reachable = IsReachable();

{
ObjectLock olock(this);

/* don't run another check if there is one pending */
if (m_CheckRunning)
return;

m_CheckRunning = true;

SetLastStateRaw(GetStateRaw());
SetLastStateType(GetLastStateType());
SetLastReachable(reachable);
Expand Down Expand Up @@ -640,11 +657,16 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer)
if (listener)
listener->SyncSendMessage(endpoint, message);

/* Re-schedule the check so we don't run it again until after we've received
* a check result from the remote instance. The check will be re-scheduled
* using the proper check interval once we've received a check result.
/*
* Let the checker use a dummy next check time until we actually receive the check result from the
* remote endpoint. This should be sufficiently far in the future to avoid excessive CPU load by
* constantly re-running the check, but not too far in the future to avoid that the check is not
* re-run for too long in case the remote endpoint never responds. We add a small grace period
* to the check command timeout to account for network latency and processing time on the remote
* endpoint. So, we only need to silently update this without notifying any listeners, and once
* this function returns, the checker is going access it via GetNextCheck() again.
*/
SetNextCheck(Utility::GetTime() + checkTimeout + 30);
SetNextCheck(Utility::GetTime() + checkTimeout + 30, true);

/*
* Let the user know that there was a problem with the check if
Expand All @@ -667,12 +689,22 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer)
cr->SetOutput(output);

ProcessCheckResult(cr, producer);
} else {
/**
* The endpoint is currently either syncing its state or not connected yet and we are within
* the magical 5min cold startup window. In both cases, we just don't do anything and wait for
* the next check interval to re-try the check again. So, this check is effectively skipped.
*/
UpdateNextCheck();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Too bad if my check runs once a day. (The else if case at least schedules the next check just one retry interval in the future.)

}

{
ObjectLock olock(this);
m_CheckRunning = false;
}
/**
* If this is a remote check, we don't know when the check result will be received and processed.
* Therefore, we must mark the check as no longer running here, otherwise, no further checks
* would be executed for this checkable as it would always appear as having a running check
* (see the check at the start of this function).
*/
m_CheckRunning.store(false);
}
}

Expand Down
Loading
Loading