Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions pkg/log/log.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,15 @@
package log

import (
"bytes"
"fmt"
"io"
stdlog "log"
"os"
"regexp"
"runtime"
"sort"
"strconv"
"sync/atomic"
"time"

Expand All @@ -62,6 +65,14 @@ const (

// Debug indicates that output should not normally be emitted.
Debug

// MaxStuckGoroutinesToLog is the maximum number of stuck goroutines to log in
// the "stuck goroutines" footer.
MaxStuckGoroutinesToLog = 10

// MaxPerGoroutineStackBytesToLog is the maximum number of bytes to log per
// goroutine stack in the "stuck goroutines" footer.
MaxPerGoroutineStackBytesToLog = 10000
)

func (l Level) String() string {
Expand Down Expand Up @@ -359,6 +370,98 @@ func TracebackAll(format string, v ...any) {
Warningf(format+":\n%s", v...)
}

// TracebackAllWithStuckSuffix logs the given message and dumps a stacktrace of all goroutines,
// and then adds a footer with the stacks of goroutines deemed to be stuck. These include the
// given stuckTasks, as well as any other goroutines languishing in an obviously stuck state.
//
// The number of goroutines logged is limited to MaxStuckGoroutinesToLog.
func TracebackAllWithStuckSuffix(stuckTasks map[int64]struct{}, format string, v ...any) {
stacks := Stacks(true)
v = append(v, stacks)
header := "***** BEGIN STUCK GOROUTINES (there may be more, read the preceding full dump) *****"
v = append(v, stuckGoroutineStacks(stacks, stuckTasks))
footer := "***** END STUCK GOROUTINES *****\n"
Warningf(format+":\n%s\n"+header+"%s\n"+footer, v...)
}

func goroutinesInState(stacks []byte, state string) map[int64]struct{} {
ids := make(map[int64]struct{})
regex := regexp.MustCompile(fmt.Sprintf(
`(?m)^\s*goroutine\s(\d+)\s\[%s(, \d+ minutes)?\]:$`, state))

matches := regex.FindAllSubmatch(stacks, -1)
for _, match := range matches {
if len(match) <= 1 {
continue
}
if id, err := strconv.ParseInt(string(match[1]), 10, 64); err == nil {
ids[id] = struct{}{}
}
}
return ids
}

func stuckGoroutineIDs(stacks []byte) map[int64]struct{} {
stuckStates := []string{"semacquire", "sync.Mutex.Lock"}
ids := make(map[int64]struct{})

for _, state := range stuckStates {
for id := range goroutinesInState(stacks, state) {
ids[id] = struct{}{}
}
}
return ids
}

func singleGoroutineStack(stacks []byte, goroutineID int64) []byte {
headerAndStack := fmt.Sprintf(`(?sm)(^\s*goroutine\s%d\s\[.*?\]:.*?)`, goroutineID)
footer := `(\n\n\s*goroutine\s\d+\s\[|\z)` // Another stack or end of the trace.
regex := regexp.MustCompile(headerAndStack + footer)

match := regex.FindSubmatch(stacks)
if match != nil && len(match) > 1 {
if len(match[1]) > MaxPerGoroutineStackBytesToLog {
return []byte(fmt.Sprintf("goroutine %d stack is too large, skipped.\n", goroutineID))
}
return match[1]
}
return nil
}

func stuckGoroutineStacks(stacks []byte, stuckTasks map[int64]struct{}) []byte {
stuckIDs := stuckGoroutineIDs(stacks)
if stuckTasks != nil {
// Stuck task goroutines are deemed stuck independent of the state of their goroutines.
for taskID := range stuckTasks {
stuckIDs[taskID] = struct{}{}
}
}

var sortedIDs []int64
for id := range stuckIDs {
sortedIDs = append(sortedIDs, id)
}
sort.Slice(sortedIDs, func(i, j int) bool {
return sortedIDs[i] < sortedIDs[j]
})

var buf bytes.Buffer
for i, id := range sortedIDs {
stack := singleGoroutineStack(stacks, id)
if stack != nil {
if i > 0 {
buf.WriteString("\n")
}
buf.Write(stack)
}
if i > MaxStuckGoroutinesToLog {
buf.WriteString("\n...<too many stuck goroutines, truncated>")
break
}
}
return buf.Bytes()
}

// IsLogging returns whether the global logger is logging.
func IsLogging(level Level) bool {
return Log().IsLogging(level)
Expand Down
65 changes: 65 additions & 0 deletions pkg/log/log_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,71 @@ func TestCaller(t *testing.T) {
}
}

func TestStuckGoroutineIDs(t *testing.T) {
stacksString := `
goroutine 0001 [sync.WaitGroup.Wait, 15 minutes]:
goroutine 0002 [select, 1 minutes]:
goroutine 0003 [select]:
goroutine 0004 [chan receive, 11 minutes]:
goroutine 6661 [semacquire, 3 minutes]:
goroutine 6662 [sync.Mutex.Lock, 3 minutes]:
goroutine 6663 [sync.Mutex.Lock]:
`
gotStuckIds := stuckGoroutineIDs([]byte(stacksString))
if len(gotStuckIds) != 3 {
t.Errorf("expected 3 stuck goroutines, got %d", len(gotStuckIds))
}
for _, gid := range []int64{6661, 6662, 6663} {
if _, ok := gotStuckIds[gid]; !ok {
t.Errorf("expected goroutine %d to be stuck, but it was not", gid)
}
}
}

func TestStuckGoroutineStacks(t *testing.T) {
innocent0 := `goroutine 124 [select, 1 minutes]:
gvisor.dev/gvisor/pkg/sentry/ktime.(*SampledTimer).runGoroutine(0xc0008466c0)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sentry/ktime/sampled_timer.go:235 +0xd4
created by gvisor.dev/gvisor/pkg/sentry/ktime.(*SampledTimer).init in goroutine 123
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sentry/ktime/sampled_timer.go:102 +0x239`

innocent1 := `goroutine 35144 [chan receive]:
gvisor.dev/gvisor/pkg/tcpip/link/veth.NewPair.func1()
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/tcpip/link/veth/veth.go:108 +0xfd
created by gvisor.dev/gvisor/pkg/tcpip/link/veth.NewPair in goroutine 35063
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/tcpip/link/veth/veth.go:107 +0x2f3`

stuckNetstackGoroutine := `goroutine 25916 [semacquire, 3 minutes]:
sync.runtime_Semacquire(0x41de33?)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/go/src/runtime/sema.go:71 +0x25
gvisor.dev/gvisor/pkg/sync.(*CrossGoroutineRWMutex).Lock(0xc0006a3b08?)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sync/rwmutex_unsafe.go:154 +0x67
gvisor.dev/gvisor/pkg/sync.(*RWMutex).Lock(...)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sync/rwmutex_unsafe.go:292
created by gvisor.dev/gvisor/pkg/tcpip/link/veth.NewPair in goroutine 25922
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/tcpip/link/veth/veth.go:107 +0x2f3`

stuckTaskGoroutine := `goroutine 26128 [deliberately-not-semacquire, 3 minutes]:
sync.runtime_Semacquire(0x47e205?)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/go/src/runtime/sema.go:71 +0x25
gvisor.dev/gvisor/pkg/sync.(*CrossGoroutineRWMutex).RLock(...)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sync/rwmutex_unsafe.go:78
gvisor.dev/gvisor/pkg/sync.(*RWMutex).RLock(...)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sync/rwmutex_unsafe.go:259
gvisor.dev/gvisor/pkg/sentry/kernel.(*Task).doSyscall(0xc000fae160?)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sentry/kernel/task_run.go:97 +0x3e8`

stuckTasks := make(map[int64]struct{})
stuckTasks[26128] = struct{}{} // 26128 is stuckTaskGoroutine.
allStacks := []string{innocent0, stuckNetstackGoroutine, innocent1, stuckTaskGoroutine}
wantStacks := "\n" + strings.Join([]string{stuckNetstackGoroutine, stuckTaskGoroutine}, "\n\n")

gotStuckStacks := string(stuckGoroutineStacks([]byte(strings.Join(allStacks, "\n\n")), stuckTasks))
if gotStuckStacks != wantStacks {
t.Errorf("expected %s, got %s", wantStacks, gotStuckStacks)
}
}

func BenchmarkGoogleLogging(b *testing.B) {
tw := &testWriter{
limit: 1, // Only record one message.
Expand Down
16 changes: 11 additions & 5 deletions pkg/sentry/watchdog/watchdog.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ func (w *Watchdog) waitForStart() {

var buf bytes.Buffer
buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
w.doAction(w.StartupTimeoutAction, false, &buf)
w.doAction(w.StartupTimeoutAction, false /* forceStack */, nil /* stuckTasks */, &buf)
}

// loop is the main watchdog routine. It only returns when 'Stop()' is called.
Expand Down Expand Up @@ -320,26 +320,28 @@ func (w *Watchdog) runTurn() {
func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound bool, now ktime.Time) {
var buf bytes.Buffer
buf.WriteString(fmt.Sprintf("Sentry detected %d stuck task(s):\n", len(offenders)))
stuckTasks := make(map[int64]struct{})
for t, o := range offenders {
tid := w.k.TaskSet().Root.IDOfTask(t)
buf.WriteString(fmt.Sprintf("\tTask tid: %v (goroutine %d), entered RunSys state %v ago.\n", tid, t.GoroutineID(), now.Sub(o.lastUpdateTime)))
stuckTasks[t.GoroutineID()] = struct{}{}
}
buf.WriteString("Search for 'goroutine <id>' in the stack dump to find the offending goroutine(s)")

// Force stack dump only if a new task is detected.
w.doAction(w.TaskTimeoutAction, newTaskFound, &buf)
w.doAction(w.TaskTimeoutAction, newTaskFound, stuckTasks, &buf)
}

func (w *Watchdog) reportStuckWatchdog() {
var buf bytes.Buffer
buf.WriteString("Watchdog goroutine is stuck")
w.doAction(w.TaskTimeoutAction, false, &buf)
w.doAction(w.TaskTimeoutAction, false /* forceStack */, nil /* stuckTasks */, &buf)
}

// doAction will take the given action. If the action is LogWarning, the stack
// is not always dumped to the log to prevent log flooding. "forceStack"
// guarantees that the stack will be dumped regardless.
func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
func (w *Watchdog) doAction(action Action, forceStack bool, stuckTasks map[int64]struct{}, msg *bytes.Buffer) {
switch action {
case LogWarning:
// Dump stack only if forced or sometime has passed since the last time a
Expand All @@ -355,7 +357,11 @@ func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
case Panic:
// Panic will skip over running tasks, which is likely the culprit here. So manually
// dump all stacks before panic'ing.
log.TracebackAll(msg.String())
if stuckTasks != nil && len(stuckTasks) > 0 {
log.TracebackAllWithStuckSuffix(stuckTasks, msg.String())
} else {
log.TracebackAll(msg.String())
}

// Attempt to flush metrics, timeout and move on in case metrics are stuck as well.
metricsEmitted := make(chan struct{}, 1)
Expand Down