From 0fd1dffbfb25a2432f1b028909d2ec968b6f380b Mon Sep 17 00:00:00 2001 From: Hui Hu Date: Wed, 11 Jun 2025 12:16:47 +0800 Subject: [PATCH 1/2] feat: Support configuring rule.Reason as a Sprintf format string So we can define one templated Reason to generate many Reasons, e.g. generating Reasons for Nvidia GPU Xid errors in dmesg log. --- pkg/systemlogmonitor/log_monitor.go | 32 ++++- pkg/systemlogmonitor/log_monitor_test.go | 144 +++++++++++++++++++++++ 2 files changed, 172 insertions(+), 4 deletions(-) diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go index 2fc5dd0c6..f79f5e2d0 100644 --- a/pkg/systemlogmonitor/log_monitor.go +++ b/pkg/systemlogmonitor/log_monitor.go @@ -20,6 +20,8 @@ import ( "encoding/json" "fmt" "os" + "regexp" + "strings" "time" "k8s.io/klog/v2" @@ -158,6 +160,9 @@ func (l *logMonitor) parseLog(log *systemlogtypes.Log) { continue } status := l.generateStatus(matched, rule) + if status == nil { + continue + } klog.Infof("New status generated: %+v", status) l.output <- status } @@ -170,12 +175,31 @@ func (l *logMonitor) generateStatus(logs []*systemlogtypes.Log, rule systemlogty message := generateMessage(logs, rule.PatternGeneratedMessageSuffix) var events []types.Event var changedConditions []*types.Condition + + // Support configuring rule.Reason as a Sprintf format string and formatting it with the matched capturing groups in rule.Pattern. + re := regexp.MustCompile(rule.Pattern) + matches := re.FindStringSubmatch(message) + reason := rule.Reason + formatArgs := make([]interface{}, 0) + if len(matches) > 1 { + // Use the matched capturing groups as the arguments for Sprintf. + for _, value := range matches[1:] { + formatArgs = append(formatArgs, value) + } + } + reason = fmt.Sprintf(rule.Reason, formatArgs...) + // If fmt.Sprintf fails, it will add "%!" for each failed template in the result string. + if strings.Contains(reason, "%!") { + klog.Errorf("Got wrong string %q for reason %q with pattern %q", reason, rule.Reason, rule.Pattern) + return nil + } + if rule.Type == types.Temp { // For temporary error only generate event events = append(events, types.Event{ Severity: types.Warn, Timestamp: timestamp, - Reason: rule.Reason, + Reason: reason, Message: message, }) } else { @@ -186,19 +210,19 @@ func (l *logMonitor) generateStatus(logs []*systemlogtypes.Log, rule systemlogty // Update transition timestamp and message when the condition // changes. Condition is considered to be changed only when // status or reason changes. - if condition.Status == types.False || condition.Reason != rule.Reason { + if condition.Status == types.False || condition.Reason != reason { condition.Transition = timestamp condition.Message = message events = append(events, util.GenerateConditionChangeEvent( condition.Type, types.True, - rule.Reason, + reason, message, timestamp, )) } condition.Status = types.True - condition.Reason = rule.Reason + condition.Reason = reason changedConditions = append(changedConditions, condition) break } diff --git a/pkg/systemlogmonitor/log_monitor_test.go b/pkg/systemlogmonitor/log_monitor_test.go index 230e9c659..2a4762a7d 100644 --- a/pkg/systemlogmonitor/log_monitor_test.go +++ b/pkg/systemlogmonitor/log_monitor_test.go @@ -408,6 +408,150 @@ func TestGenerateStatusForMetrics(t *testing.T) { } } +func TestGenerateStatusForEvents(t *testing.T) { + for c, test := range []struct { + name string + rule logtypes.Rule + expected *types.Status + logs []*logtypes.Log + }{ + { + name: "without matching group", + rule: logtypes.Rule{ + Type: types.Temp, + Reason: "NvidiaGPUXid", + Pattern: "NVRM: Xid \\(PCI:[^)]+\\): \\d+,.*", + }, + logs: []*logtypes.Log{ + { + Timestamp: time.Unix(1000, 1000), + Message: "[...] NVRM: Xid (PCI:0000:00:00.0): 45, Ch 00000010", + }, + }, + expected: &types.Status{ + Source: testSource, + Events: []types.Event{{ + Severity: types.Warn, + Timestamp: time.Unix(1000, 1000), + Reason: "NvidiaGPUXid", + Message: "[...] NVRM: Xid (PCI:0000:00:00.0): 45, Ch 00000010", + }}, + }, + }, + { + name: "one matching group", + rule: logtypes.Rule{ + Type: types.Temp, + Reason: "NvidiaGPUXid%s", + Pattern: "NVRM: Xid \\(PCI:[^)]+\\): (\\d+),.*", + }, + logs: []*logtypes.Log{ + { + Timestamp: time.Unix(1000, 1000), + Message: "[...] NVRM: Xid (PCI:0000:00:00.0): 45, Ch 00000010", + }, + }, + expected: &types.Status{ + Source: testSource, + Events: []types.Event{{ + Severity: types.Warn, + Timestamp: time.Unix(1000, 1000), + Reason: "NvidiaGPUXid45", + Message: "[...] NVRM: Xid (PCI:0000:00:00.0): 45, Ch 00000010", + }}, + }, + }, + { + name: "two matching groups", + rule: logtypes.Rule{ + Type: types.Temp, + Reason: "NvidiaGPUXid%s, Ch%s", + Pattern: "NVRM: Xid \\(PCI:[^)]+\\): (\\d+), Ch (\\d+).*", + }, + logs: []*logtypes.Log{ + { + Timestamp: time.Unix(1000, 1000), + Message: "[...] NVRM: Xid (PCI:0000:00:00.0): 45, Ch 00000010", + }, + }, + expected: &types.Status{ + Source: testSource, + Events: []types.Event{{ + Severity: types.Warn, + Timestamp: time.Unix(1000, 1000), + Reason: "NvidiaGPUXid45, Ch00000010", + Message: "[...] NVRM: Xid (PCI:0000:00:00.0): 45, Ch 00000010", + }}, + }, + }, + { + name: "not enough matching groups 1", + rule: logtypes.Rule{ + Type: types.Temp, + Reason: "NvidiaGPUXid%s, Ch%s", + Pattern: "NVRM: Xid \\(PCI:[^)]+\\): (\\d+),.*", + }, + logs: []*logtypes.Log{ + { + Timestamp: time.Unix(1000, 1000), + Message: "[...] NVRM: Xid (PCI:0000:00:00.0): 45, Ch 00000010", + }, + }, + expected: nil, + }, + { + name: "not enough matching groups 2", + rule: logtypes.Rule{ + Type: types.Temp, + Reason: "NvidiaGPUXid%s", + Pattern: "NVRM: Xid \\(PCI:[^)]+\\): \\d+,.*", + }, + logs: []*logtypes.Log{ + { + Timestamp: time.Unix(1000, 1000), + Message: "[...] NVRM: Xid (PCI:0000:00:00.0): 45, Ch 00000010", + }, + }, + expected: nil, + }, + { + name: "indexed matching groups", + rule: logtypes.Rule{ + Type: types.Temp, + Reason: "NvidiaGPUXid%[1]s, Ch%[2]s", + Pattern: "NVRM: Xid \\(PCI:[^)]+\\): (\\d+), Ch (\\d+).*", + }, + logs: []*logtypes.Log{ + { + Timestamp: time.Unix(1000, 1000), + Message: "[...] NVRM: Xid (PCI:0000:00:00.0): 45, Ch 00000010", + }, + }, + expected: &types.Status{ + Source: testSource, + Events: []types.Event{{ + Severity: types.Warn, + Timestamp: time.Unix(1000, 1000), + Reason: "NvidiaGPUXid45, Ch00000010", + Message: "[...] NVRM: Xid (PCI:0000:00:00.0): 45, Ch 00000010", + }}, + }, + }, + } { + l := &logMonitor{ + config: MonitorConfig{ + Source: testSource, + }, + } + (&l.config).ApplyDefaultConfiguration() + got := l.generateStatus(test.logs, test.rule) + + if !reflect.DeepEqual(test.expected, got) { + t.Errorf("case %d %s: expected status %+v, got %+v", c+1, test.name, test.expected, got) + } + } +} + func TestInitializeProblemMetricsOrDie(t *testing.T) { testCases := []struct { name string From 50016aaf3c7574ac749d09f62576ac5881024261 Mon Sep 17 00:00:00 2001 From: Hui Hu Date: Wed, 20 Aug 2025 16:49:02 +0800 Subject: [PATCH 2/2] fix: Allow capturing groups in rule.Pattern with non-templated rule.Reason --- pkg/systemlogmonitor/log_monitor.go | 30 +++++++++++++----------- pkg/systemlogmonitor/log_monitor_test.go | 23 ++++++++++++++++++ 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/pkg/systemlogmonitor/log_monitor.go b/pkg/systemlogmonitor/log_monitor.go index f79f5e2d0..56f76bdd3 100644 --- a/pkg/systemlogmonitor/log_monitor.go +++ b/pkg/systemlogmonitor/log_monitor.go @@ -176,22 +176,24 @@ func (l *logMonitor) generateStatus(logs []*systemlogtypes.Log, rule systemlogty var events []types.Event var changedConditions []*types.Condition - // Support configuring rule.Reason as a Sprintf format string and formatting it with the matched capturing groups in rule.Pattern. - re := regexp.MustCompile(rule.Pattern) - matches := re.FindStringSubmatch(message) reason := rule.Reason - formatArgs := make([]interface{}, 0) - if len(matches) > 1 { - // Use the matched capturing groups as the arguments for Sprintf. - for _, value := range matches[1:] { - formatArgs = append(formatArgs, value) + // Support configuring rule.Reason as a Sprintf format string and formatting it with the matched capturing groups in rule.Pattern. + if strings.Contains(reason, "%") { + re := regexp.MustCompile(rule.Pattern) + matches := re.FindStringSubmatch(message) + formatArgs := make([]interface{}, 0) + if len(matches) > 1 { + // Use the matched capturing groups as the arguments for Sprintf. + for _, value := range matches[1:] { + formatArgs = append(formatArgs, value) + } + } + reason = fmt.Sprintf(rule.Reason, formatArgs...) + // If fmt.Sprintf fails, it will add "%!" for each failed template in the result string. + if strings.Contains(reason, "%!") { + klog.Errorf("Got wrong string %q for reason %q with pattern %q", reason, rule.Reason, rule.Pattern) + return nil } - } - reason = fmt.Sprintf(rule.Reason, formatArgs...) - // If fmt.Sprintf fails, it will add "%!" for each failed template in the result string. - if strings.Contains(reason, "%!") { - klog.Errorf("Got wrong string %q for reason %q with pattern %q", reason, rule.Reason, rule.Pattern) - return nil } if rule.Type == types.Temp { diff --git a/pkg/systemlogmonitor/log_monitor_test.go b/pkg/systemlogmonitor/log_monitor_test.go index 2a4762a7d..807b60c55 100644 --- a/pkg/systemlogmonitor/log_monitor_test.go +++ b/pkg/systemlogmonitor/log_monitor_test.go @@ -537,6 +537,29 @@ func TestGenerateStatusForEvents(t *testing.T) { }}, }, }, + { + name: "using matching groups in Pattern but not in Reason", + rule: logtypes.Rule{ + Type: types.Temp, + Reason: "OOMKilling", + Pattern: "Killed process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*", + }, + logs: []*logtypes.Log{ + { + Timestamp: time.Unix(1000, 1000), + Message: "kernel: Killed process 13357 (mysqld), UID 27, total-vm:4977676kB, anon-rss:3256736kB, file-rss:0kB, shmem-rss:0kB", + }, + }, + expected: &types.Status{ + Source: testSource, + Events: []types.Event{{ + Severity: types.Warn, + Timestamp: time.Unix(1000, 1000), + Reason: "OOMKilling", + Message: "kernel: Killed process 13357 (mysqld), UID 27, total-vm:4977676kB, anon-rss:3256736kB, file-rss:0kB, shmem-rss:0kB", + }}, + }, + }, } { l := &logMonitor{ config: MonitorConfig{