From a805a43d8f1c3d9c4f9491123967142f32edcb8c Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Wed, 5 Mar 2025 16:21:48 -0700 Subject: [PATCH 01/23] GODRIVER-3173 Complete pending reads on conn checkout --- event/monitoring.go | 33 +- internal/driverutil/context.go | 49 ++ internal/integration/client_test.go | 58 +- internal/integration/csot_test.go | 153 +++--- internal/integration/mtest/mongotest.go | 25 +- internal/logger/component.go | 3 + .../pending-reads.json | 519 ++++++++++++++++++ .../pending-reads.yml | 312 +++++++++++ x/mongo/driver/operation.go | 8 + x/mongo/driver/topology/connection.go | 45 +- x/mongo/driver/topology/pool.go | 235 +++++--- x/mongo/driver/topology/pool_test.go | 221 ++++---- 12 files changed, 1388 insertions(+), 273 deletions(-) create mode 100644 internal/driverutil/context.go create mode 100644 testdata/client-side-operations-timeout/pending-reads.json create mode 100644 testdata/client-side-operations-timeout/pending-reads.yml diff --git a/event/monitoring.go b/event/monitoring.go index 2ca98969d7..4965a55528 100644 --- a/event/monitoring.go +++ b/event/monitoring.go @@ -75,17 +75,20 @@ const ( // strings for pool command monitoring types const ( - ConnectionPoolCreated = "ConnectionPoolCreated" - ConnectionPoolReady = "ConnectionPoolReady" - ConnectionPoolCleared = "ConnectionPoolCleared" - ConnectionPoolClosed = "ConnectionPoolClosed" - ConnectionCreated = "ConnectionCreated" - ConnectionReady = "ConnectionReady" - ConnectionClosed = "ConnectionClosed" - ConnectionCheckOutStarted = "ConnectionCheckOutStarted" - ConnectionCheckOutFailed = "ConnectionCheckOutFailed" - ConnectionCheckedOut = "ConnectionCheckedOut" - ConnectionCheckedIn = "ConnectionCheckedIn" + ConnectionPoolCreated = "ConnectionPoolCreated" + ConnectionPoolReady = "ConnectionPoolReady" + ConnectionPoolCleared = "ConnectionPoolCleared" + ConnectionPoolClosed = "ConnectionPoolClosed" + ConnectionCreated = "ConnectionCreated" + ConnectionReady = "ConnectionReady" + ConnectionClosed = "ConnectionClosed" + ConnectionCheckOutStarted = "ConnectionCheckOutStarted" + ConnectionCheckOutFailed = "ConnectionCheckOutFailed" + ConnectionCheckedOut = "ConnectionCheckedOut" + ConnectionCheckedIn = "ConnectionCheckedIn" + ConnectionPendingReadStarted = "ConnectionPendingReadStarted" + ConnectionPendingReadSucceeded = "ConnectionPendingReadSucceeded" + ConnectionPendingReadFailed = "ConnectionPendingReadFailed" ) // MonitorPoolOptions contains pool options as formatted in pool events @@ -105,9 +108,11 @@ type PoolEvent struct { Reason string `json:"reason"` // ServiceID is only set if the Type is PoolCleared and the server is deployed behind a load balancer. This field // can be used to distinguish between individual servers in a load balanced deployment. - ServiceID *bson.ObjectID `json:"serviceId"` - Interruption bool `json:"interruptInUseConnections"` - Error error `json:"error"` + ServiceID *bson.ObjectID `json:"serviceId"` + Interruption bool `json:"interruptInUseConnections"` + Error error `json:"error"` + RequestID int32 `json:"requestId"` + RemainingTime time.Duration `json:"remainingTime"` } // PoolMonitor is a function that allows the user to gain access to events occurring in the pool diff --git a/internal/driverutil/context.go b/internal/driverutil/context.go new file mode 100644 index 0000000000..5b8cd54b3a --- /dev/null +++ b/internal/driverutil/context.go @@ -0,0 +1,49 @@ +// Copyright (C) MongoDB, Inc. 2025-present. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. You may obtain +// a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +package driverutil + +import "context" + +// ContextKey is a custom type used for the keys in context values to avoid +// collisions. +type ContextKey string + +const ( + // ContextKeyHasMaxTimeMS represents a boolean value that indicates if + // maxTimeMS will be set on the wire message for an operation. + ContextKeyHasMaxTimeMS ContextKey = "hasMaxTimeMS" + + // ContextKeyRequestID is the requestID for a given operation. This is used to + // propagate the requestID for a pending read during connection check out. + ContextKeyRequestID ContextKey = "requestID" +) + +// WithValueHasMaxTimeMS returns a copy of the parent context with an added +// value indicating whether an operation will append maxTimeMS to the wire +// message. +func WithValueHasMaxTimeMS(parentCtx context.Context, val bool) context.Context { + return context.WithValue(parentCtx, ContextKeyHasMaxTimeMS, val) +} + +// WithRequestID returns a copy of the parent context with an added request ID +// value. +func WithRequestID(parentCtx context.Context, requestID int32) context.Context { + return context.WithValue(parentCtx, ContextKeyRequestID, requestID) +} + +// HasMaxTimeMS checks if the context is for an operation that will append +// maxTimeMS to the wire message. +func HasMaxTimeMS(ctx context.Context) bool { + return ctx.Value(ContextKeyHasMaxTimeMS) != nil +} + +// GetRequestID retrieves the request ID from the context if it exists. +func GetRequestID(ctx context.Context) (int32, bool) { + val, ok := ctx.Value(ContextKeyRequestID).(int32) + + return val, ok +} diff --git a/internal/integration/client_test.go b/internal/integration/client_test.go index 6f18d9f146..7db256c070 100644 --- a/internal/integration/client_test.go +++ b/internal/integration/client_test.go @@ -13,6 +13,7 @@ import ( "os" "reflect" "strings" + "sync" "testing" "time" @@ -675,9 +676,9 @@ func TestClient(t *testing.T) { }, } + _, err := mt.Coll.InsertOne(context.Background(), bson.D{}) for _, tc := range testCases { mt.Run(tc.desc, func(mt *mtest.T) { - _, err := mt.Coll.InsertOne(context.Background(), bson.D{}) require.NoError(mt, err) mt.SetFailPoint(failpoint.FailPoint{ @@ -692,30 +693,47 @@ func TestClient(t *testing.T) { mt.ClearEvents() + wg := sync.WaitGroup{} + wg.Add(50) + for i := 0; i < 50; i++ { - // Run 50 operations, each with a timeout of 50ms. Expect + // Run 50 concurrent operations, each with a timeout of 50ms. Expect // them to all return a timeout error because the failpoint - // blocks find operations for 500ms. Run 50 to increase the + // blocks find operations for 50ms. Run 50 to increase the // probability that an operation will time out in a way that // can cause a retry. - ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) - err = tc.operation(ctx, mt.Coll) - cancel() - assert.ErrorIs(mt, err, context.DeadlineExceeded) - assert.True(mt, mongo.IsTimeout(err), "expected mongo.IsTimeout(err) to be true") - - // Assert that each operation reported exactly one command - // started events, which means the operation did not retry - // after the context timeout. 
- evts := mt.GetAllStartedEvents() - require.Len(mt, - mt.GetAllStartedEvents(), - 1, - "expected exactly 1 command started event per operation, but got %d after %d iterations", - len(evts), - i) - mt.ClearEvents() + go func() { + ctx, cancel := context.WithTimeout(context.Background(), 150*time.Millisecond) + err := tc.operation(ctx, mt.Coll) + cancel() + assert.ErrorIs(mt, err, context.DeadlineExceeded) + assert.True(mt, mongo.IsTimeout(err), "expected mongo.IsTimeout(err) to be true") + + wg.Done() + }() } + + wg.Wait() + + // Since an operation requires checking out a connection and because we + // attempt a pending read for socket timeouts and since the test forces + // 50 concurrent socket timeouts, then it's possible that an + // operation checks out a connection that has a pending read. In this + // case the operation will time out when checking out a connection, and + // a started event will not be propagated. So instead of + // checking that we got exactly 50 started events, we should instead + // ensure that the number of started events is equal to the number of + // unique connections used to process the operations. + pendingReadConns := mt.NumberConnectionsPendingReadStarted() + evts := mt.GetAllStartedEvents() + + require.Equal(mt, + len(evts)+pendingReadConns, + 50, + "expected exactly 1 command started event per operation (50), but got %d", + len(evts)+pendingReadConns) + mt.ClearEvents() + mt.ClearFailPoints() }) } }) diff --git a/internal/integration/csot_test.go b/internal/integration/csot_test.go index 6808efb2a4..877f5e0341 100644 --- a/internal/integration/csot_test.go +++ b/internal/integration/csot_test.go @@ -38,12 +38,13 @@ func TestCSOT_maxTimeMS(t *testing.T) { mt := mtest.New(t, mtest.NewOptions().CreateClient(false)) testCases := []struct { - desc string - commandName string - setup func(coll *mongo.Collection) error - operation func(ctx context.Context, coll *mongo.Collection) error - sendsMaxTimeMS bool - topologies []mtest.TopologyKind + desc string + commandName string + setup func(coll *mongo.Collection) error + operation func(ctx context.Context, coll *mongo.Collection) error + sendsMaxTimeMS bool + topologies []mtest.TopologyKind + preventsConnClosureWithTimeoutMS bool }{ { desc: "FindOne", @@ -55,7 +56,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { operation: func(ctx context.Context, coll *mongo.Collection) error { return coll.FindOne(ctx, bson.D{}).Err() }, - sendsMaxTimeMS: true, + sendsMaxTimeMS: true, + preventsConnClosureWithTimeoutMS: true, }, { desc: "Find", @@ -68,7 +70,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { _, err := coll.Find(ctx, bson.D{}) return err }, - sendsMaxTimeMS: false, + sendsMaxTimeMS: false, + preventsConnClosureWithTimeoutMS: false, }, { desc: "FindOneAndDelete", @@ -80,7 +83,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { operation: func(ctx context.Context, coll *mongo.Collection) error { return coll.FindOneAndDelete(ctx, bson.D{}).Err() }, - sendsMaxTimeMS: true, + sendsMaxTimeMS: true, + preventsConnClosureWithTimeoutMS: true, }, { desc: "FindOneAndUpdate", @@ -92,7 +96,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { operation: func(ctx context.Context, coll *mongo.Collection) error { return coll.FindOneAndUpdate(ctx, bson.D{}, bson.M{"$set": bson.M{"key": "value"}}).Err() }, - sendsMaxTimeMS: true, + sendsMaxTimeMS: true, + preventsConnClosureWithTimeoutMS: true, }, { desc: "FindOneAndReplace", @@ -104,7 +109,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { operation: func(ctx context.Context, coll *mongo.Collection) error { 
return coll.FindOneAndReplace(ctx, bson.D{}, bson.D{}).Err() }, - sendsMaxTimeMS: true, + sendsMaxTimeMS: true, + preventsConnClosureWithTimeoutMS: true, }, { desc: "InsertOne", @@ -113,7 +119,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { _, err := coll.InsertOne(ctx, bson.D{}) return err }, - sendsMaxTimeMS: true, + sendsMaxTimeMS: true, + preventsConnClosureWithTimeoutMS: true, }, { desc: "InsertMany", @@ -122,7 +129,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { _, err := coll.InsertMany(ctx, []interface{}{bson.D{}}) return err }, - sendsMaxTimeMS: true, + sendsMaxTimeMS: true, + preventsConnClosureWithTimeoutMS: true, }, { desc: "UpdateOne", @@ -131,7 +139,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { _, err := coll.UpdateOne(ctx, bson.D{}, bson.M{"$set": bson.M{"key": "value"}}) return err }, - sendsMaxTimeMS: true, + sendsMaxTimeMS: true, + preventsConnClosureWithTimeoutMS: true, }, { desc: "UpdateMany", @@ -140,7 +149,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { _, err := coll.UpdateMany(ctx, bson.D{}, bson.M{"$set": bson.M{"key": "value"}}) return err }, - sendsMaxTimeMS: true, + sendsMaxTimeMS: true, + preventsConnClosureWithTimeoutMS: true, }, { desc: "ReplaceOne", @@ -149,7 +159,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { _, err := coll.ReplaceOne(ctx, bson.D{}, bson.D{}) return err }, - sendsMaxTimeMS: true, + sendsMaxTimeMS: true, + preventsConnClosureWithTimeoutMS: true, }, { desc: "DeleteOne", @@ -158,7 +169,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { _, err := coll.DeleteOne(ctx, bson.D{}) return err }, - sendsMaxTimeMS: true, + sendsMaxTimeMS: true, + preventsConnClosureWithTimeoutMS: true, }, { desc: "DeleteMany", @@ -168,6 +180,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { return err }, sendsMaxTimeMS: true, + + preventsConnClosureWithTimeoutMS: true, }, { desc: "Distinct", @@ -175,7 +189,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { operation: func(ctx context.Context, coll *mongo.Collection) error { return coll.Distinct(ctx, "name", bson.D{}).Err() }, - sendsMaxTimeMS: true, + sendsMaxTimeMS: true, + preventsConnClosureWithTimeoutMS: true, }, { desc: "Aggregate", @@ -184,7 +199,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { _, err := coll.Aggregate(ctx, mongo.Pipeline{}) return err }, - sendsMaxTimeMS: false, + sendsMaxTimeMS: false, + preventsConnClosureWithTimeoutMS: false, }, { desc: "Watch", @@ -196,7 +212,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { } return err }, - sendsMaxTimeMS: true, + sendsMaxTimeMS: true, + preventsConnClosureWithTimeoutMS: false, // Change Streams aren't supported on standalone topologies. topologies: []mtest.TopologyKind{ mtest.ReplicaSet, @@ -218,7 +235,8 @@ func TestCSOT_maxTimeMS(t *testing.T) { var res []bson.D return cursor.All(ctx, &res) }, - sendsMaxTimeMS: false, + sendsMaxTimeMS: false, + preventsConnClosureWithTimeoutMS: false, }, } @@ -348,56 +366,57 @@ func TestCSOT_maxTimeMS(t *testing.T) { assertMaxTimeMSNotSet(mt, evt.Command) } }) + if tc.preventsConnClosureWithTimeoutMS { + opts := mtest.NewOptions(). + // Blocking failpoints don't work on pre-4.2 and sharded + // clusters. + Topologies(mtest.Single, mtest.ReplicaSet). + MinServerVersion("4.2") + mt.RunOpts("prevents connection closure", opts, func(mt *mtest.T) { + if tc.setup != nil { + err := tc.setup(mt.Coll) + require.NoError(mt, err) + } - opts := mtest.NewOptions(). - // Blocking failpoints don't work on pre-4.2 and sharded - // clusters. - Topologies(mtest.Single, mtest.ReplicaSet). 
- MinServerVersion("4.2") - mt.RunOpts("prevents connection closure", opts, func(mt *mtest.T) { - if tc.setup != nil { - err := tc.setup(mt.Coll) - require.NoError(mt, err) - } - - mt.SetFailPoint(failpoint.FailPoint{ - ConfigureFailPoint: "failCommand", - Mode: failpoint.ModeAlwaysOn, - Data: failpoint.Data{ - FailCommands: []string{tc.commandName}, - BlockConnection: true, - // Note that some operations (currently Find and - // Aggregate) do not send maxTimeMS by default, meaning - // that the server will only respond after BlockTimeMS - // is elapsed. If the amount of time that the driver - // waits for responses after a timeout is significantly - // lower than BlockTimeMS, this test will start failing - // for those operations. - BlockTimeMS: 500, - }, - }) - - tpm := eventtest.NewTestPoolMonitor() - mt.ResetClient(options.Client(). - SetPoolMonitor(tpm.PoolMonitor)) - - // Run 5 operations that time out, then assert that no - // connections were closed. - for i := 0; i < 5; i++ { - ctx, cancel := context.WithTimeout(context.Background(), 15*time.Millisecond) - err := tc.operation(ctx, mt.Coll) - cancel() - - if !mongo.IsTimeout(err) { - t.Logf("Operation %d returned a non-timeout error: %v", i, err) + mt.SetFailPoint(failpoint.FailPoint{ + ConfigureFailPoint: "failCommand", + Mode: failpoint.ModeAlwaysOn, + Data: failpoint.Data{ + FailCommands: []string{tc.commandName}, + BlockConnection: true, + // Note that some operations (currently Find and + // Aggregate) do not send maxTimeMS by default, meaning + // that the server will only respond after BlockTimeMS + // is elapsed. If the amount of time that the driver + // waits for responses after a timeout is significantly + // lower than BlockTimeMS, this test will start failing + // for those operations. + BlockTimeMS: 500, + }, + }) + + tpm := eventtest.NewTestPoolMonitor() + mt.ResetClient(options.Client(). + SetPoolMonitor(tpm.PoolMonitor)) + + // Run 5 operations that time out, then assert that no + // connections were closed. + for i := 0; i < 5; i++ { + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Millisecond) + err := tc.operation(ctx, mt.Coll) + cancel() + + if !mongo.IsTimeout(err) { + t.Logf("Operation %d returned a non-timeout error: %v", i, err) + } } - } - closedEvents := tpm.Events(func(pe *event.PoolEvent) bool { - return pe.Type == event.ConnectionClosed + closedEvents := tpm.Events(func(pe *event.PoolEvent) bool { + return pe.Type == event.ConnectionClosed + }) + assert.Len(mt, closedEvents, 0, "expected no connection closed event") }) - assert.Len(mt, closedEvents, 0, "expected no connection closed event") - }) + } }) } diff --git a/internal/integration/mtest/mongotest.go b/internal/integration/mtest/mongotest.go index 3967bf7f82..901d71a3ff 100644 --- a/internal/integration/mtest/mongotest.go +++ b/internal/integration/mtest/mongotest.go @@ -55,7 +55,10 @@ type T struct { // It must be accessed using the atomic package and should be at the beginning of the struct. // - atomic bug: https://pkg.go.dev/sync/atomic#pkg-note-BUG // - suggested layout: https://go101.org/article/memory-layout.html - connsCheckedOut int64 + connsCheckedOut int64 + connPendingReadStarted int64 + connPendingReadSucceeded int64 + connPendingReadFailed int64 *testing.T @@ -348,6 +351,20 @@ func (t *T) NumberConnectionsCheckedOut() int { return int(atomic.LoadInt64(&t.connsCheckedOut)) } +// NumberConnectionsPendingReadStarted returns the number of connections that have +// started a pending read. 
+func (t *T) NumberConnectionsPendingReadStarted() int { + return int(atomic.LoadInt64(&t.connPendingReadStarted)) +} + +func (t *T) NumberConnectionsPendingReadSucceeded() int { + return int(atomic.LoadInt64(&t.connPendingReadSucceeded)) +} + +func (t *T) NumberConnectionsPendingReadFailed() int { + return int(atomic.LoadInt64(&t.connPendingReadFailed)) +} + // ClearEvents clears the existing command monitoring events. func (t *T) ClearEvents() { t.started = t.started[:0] @@ -640,6 +657,12 @@ func (t *T) createTestClient() { atomic.AddInt64(&t.connsCheckedOut, 1) case event.ConnectionCheckedIn: atomic.AddInt64(&t.connsCheckedOut, -1) + case event.ConnectionPendingReadStarted: + atomic.AddInt64(&t.connPendingReadStarted, 1) + case event.ConnectionPendingReadSucceeded: + atomic.AddInt64(&t.connPendingReadSucceeded, 1) + case event.ConnectionCheckOutFailed: + atomic.AddInt64(&t.connPendingReadFailed, 1) } }, }) diff --git a/internal/logger/component.go b/internal/logger/component.go index a601707cbf..5abc3f5f79 100644 --- a/internal/logger/component.go +++ b/internal/logger/component.go @@ -28,6 +28,9 @@ const ( ConnectionCheckoutFailed = "Connection checkout failed" ConnectionCheckedOut = "Connection checked out" ConnectionCheckedIn = "Connection checked in" + ConnectionPendingReadStarted = "Pending read started" + ConnectionPendingReadSucceeded = "Pending read succeeded" + ConnectionPendingReadFailed = "Pending read failed" ServerSelectionFailed = "Server selection failed" ServerSelectionStarted = "Server selection started" ServerSelectionSucceeded = "Server selection succeeded" diff --git a/testdata/client-side-operations-timeout/pending-reads.json b/testdata/client-side-operations-timeout/pending-reads.json new file mode 100644 index 0000000000..43b2745c7c --- /dev/null +++ b/testdata/client-side-operations-timeout/pending-reads.json @@ -0,0 +1,519 @@ +{ + "description": "Operation timeouts do not cause connection churn", + "schemaVersion": "1.9", + "runOnRequirements": [ + { + "minServerVersion": "4.4", + "topologies": [ + "standalone", + "replicaset" + ] + } + ], + "createEntities": [ + { + "client": { + "id": "failPointClient", + "useMultipleMongoses": false + } + }, + { + "client": { + "id": "client", + "uriOptions": { + "maxPoolSize": 1 + }, + "useMultipleMongoses": false, + "observeEvents": [ + "commandFailedEvent", + "commandSucceededEvent", + "connectionCheckedOutEvent", + "connectionCheckedInEvent", + "connectionClosedEvent" + ] + } + }, + { + "database": { + "id": "test", + "client": "client", + "databaseName": "test" + } + }, + { + "collection": { + "id": "coll", + "database": "test", + "collectionName": "coll" + } + } + ], + "initialData": [ + { + "collectionName": "coll", + "databaseName": "test", + "documents": [] + } + ], + "tests": [ + { + "description": "Write operation with successful pending read", + "operations": [ + { + "name": "failPoint", + "object": "testRunner", + "arguments": { + "client": "failPointClient", + "failPoint": { + "configureFailPoint": "failCommand", + "mode": { + "times": 1 + }, + "data": { + "failCommands": [ + "insert" + ], + "blockConnection": true, + "blockTimeMS": 750 + } + } + } + }, + { + "name": "insertOne", + "object": "coll", + "arguments": { + "timeoutMS": 50, + "document": { + "_id": 3, + "x": 1 + } + }, + "expectError": { + "isTimeoutError": true + } + }, + { + "name": "findOne", + "object": "coll", + "arguments": { + "filter": { + "_id": 1 + } + } + } + ], + "expectEvents": [ + { + "client": "client", + "events": [ + { + 
"commandFailedEvent": { + "commandName": "insert" + } + }, + { + "commandSucceededEvent": { + "commandName": "find" + } + } + ] + }, + { + "client": "client", + "eventType": "cmap", + "events": [ + { + "connectionCheckedOutEvent": {} + }, + { + "connectionCheckedInEvent": {} + }, + { + "connectionCheckedOutEvent": {} + }, + { + "connectionCheckedInEvent": {} + } + ] + } + ] + }, + { + "description": "Concurrent write operation with successful pending read", + "operations": [ + { + "name": "failPoint", + "object": "testRunner", + "arguments": { + "client": "failPointClient", + "failPoint": { + "configureFailPoint": "failCommand", + "mode": { + "times": 1 + }, + "data": { + "failCommands": [ + "insert" + ], + "blockConnection": true, + "blockTimeMS": 750 + } + } + } + }, + { + "name": "createEntities", + "object": "testRunner", + "arguments": { + "entities": [ + { + "thread": { + "id": "thread0" + } + }, + { + "thread": { + "id": "thread1" + } + } + ] + } + }, + { + "name": "runOnThread", + "object": "testRunner", + "arguments": { + "thread": "thread0", + "operation": { + "name": "insertOne", + "object": "coll", + "arguments": { + "timeoutMS": 500, + "document": { + "_id": 2 + } + } + }, + "expectError": { + "isTimeoutError": true + } + } + }, + { + "name": "waitForEvent", + "object": "testRunner", + "arguments": { + "client": "client", + "event": { + "connectionCheckedOutEvent": {} + }, + "count": 1 + } + }, + { + "name": "runOnThread", + "object": "testRunner", + "arguments": { + "thread": "thread1", + "operation": { + "name": "insertOne", + "object": "coll", + "arguments": { + "document": { + "_id": 3 + } + } + } + } + }, + { + "name": "waitForThread", + "object": "testRunner", + "arguments": { + "thread": "thread1" + } + } + ], + "expectEvents": [ + { + "client": "client", + "events": [ + { + "commandFailedEvent": { + "commandName": "insert" + } + }, + { + "commandSucceededEvent": { + "commandName": "insert" + } + } + ] + }, + { + "client": "client", + "eventType": "cmap", + "events": [ + { + "connectionCheckedOutEvent": {} + }, + { + "connectionCheckedInEvent": {} + }, + { + "connectionCheckedOutEvent": {} + }, + { + "connectionCheckedInEvent": {} + } + ] + } + ] + }, + { + "description": "Write operation with unsuccessful pending read", + "operations": [ + { + "name": "failPoint", + "object": "testRunner", + "arguments": { + "client": "failPointClient", + "failPoint": { + "configureFailPoint": "failCommand", + "mode": { + "times": 1 + }, + "data": { + "failCommands": [ + "insert" + ], + "blockConnection": true, + "blockTimeMS": 2100 + } + } + } + }, + { + "name": "insertOne", + "object": "coll", + "arguments": { + "timeoutMS": 50, + "document": { + "_id": 3, + "x": 1 + } + }, + "expectError": { + "isTimeoutError": true + } + }, + { + "name": "insertOne", + "object": "coll", + "arguments": { + "timeoutMS": 2000, + "document": { + "_id": 3, + "x": 1 + } + }, + "expectError": { + "isTimeoutError": true + } + } + ], + "expectEvents": [ + { + "client": "client", + "events": [ + { + "commandFailedEvent": { + "commandName": "insert" + } + } + ] + }, + { + "client": "client", + "eventType": "cmap", + "events": [ + { + "connectionCheckedOutEvent": {} + }, + { + "connectionCheckedInEvent": {} + }, + { + "connectionClosedEvent": { + "reason": "error" + } + } + ] + } + ] + }, + { + "description": "Read operation with successful pending read", + "operations": [ + { + "name": "failPoint", + "object": "testRunner", + "arguments": { + "client": "failPointClient", + "failPoint": { + 
"configureFailPoint": "failCommand", + "mode": { + "times": 1 + }, + "data": { + "failCommands": [ + "find" + ], + "blockConnection": true, + "blockTimeMS": 750 + } + } + } + }, + { + "name": "findOne", + "object": "coll", + "arguments": { + "timeoutMS": 50, + "filter": { + "_id": 1 + } + }, + "expectError": { + "isTimeoutError": true + } + }, + { + "name": "findOne", + "object": "coll", + "arguments": { + "filter": { + "_id": 1 + } + } + } + ], + "expectEvents": [ + { + "client": "client", + "events": [ + { + "commandFailedEvent": { + "commandName": "find" + } + }, + { + "commandSucceededEvent": { + "commandName": "find" + } + } + ] + }, + { + "client": "client", + "eventType": "cmap", + "events": [ + { + "connectionCheckedOutEvent": {} + }, + { + "connectionCheckedInEvent": {} + }, + { + "connectionCheckedOutEvent": {} + }, + { + "connectionCheckedInEvent": {} + } + ] + } + ] + }, + { + "description": "Read operation with unsuccessful pending read", + "operations": [ + { + "name": "failPoint", + "object": "testRunner", + "arguments": { + "client": "failPointClient", + "failPoint": { + "configureFailPoint": "failCommand", + "mode": { + "times": 1 + }, + "data": { + "failCommands": [ + "find" + ], + "blockConnection": true, + "blockTimeMS": 2100 + } + } + } + }, + { + "name": "findOne", + "object": "coll", + "arguments": { + "timeoutMS": 50, + "filter": { + "_id": 1 + } + }, + "expectError": { + "isTimeoutError": true + } + }, + { + "name": "findOne", + "object": "coll", + "arguments": { + "timeoutMS": 2000, + "filter": { + "_id": 1 + } + }, + "expectError": { + "isTimeoutError": true + } + } + ], + "expectEvents": [ + { + "client": "client", + "events": [ + { + "commandFailedEvent": { + "commandName": "find" + } + } + ] + }, + { + "client": "client", + "eventType": "cmap", + "events": [ + { + "connectionCheckedOutEvent": {} + }, + { + "connectionCheckedInEvent": {} + }, + { + "connectionClosedEvent": { + "reason": "error" + } + } + ] + } + ] + } + ] +} diff --git a/testdata/client-side-operations-timeout/pending-reads.yml b/testdata/client-side-operations-timeout/pending-reads.yml new file mode 100644 index 0000000000..c07728009e --- /dev/null +++ b/testdata/client-side-operations-timeout/pending-reads.yml @@ -0,0 +1,312 @@ +description: "Operation timeouts do not cause connection churn" + +schemaVersion: "1.9" + +runOnRequirements: + - minServerVersion: "4.4" + # TODO(SERVER-96344): When using failpoints, mongos returns MaxTimeMSExpired + # after maxTimeMS, whereas mongod returns it after + # max(blockTimeMS, maxTimeMS). Until this ticket is resolved, these tests + # will not pass on sharded clusters. 
+ topologies: ["standalone", "replicaset"] + +createEntities: + - client: + id: &failPointClient failPointClient + useMultipleMongoses: false + - client: + id: &client client + uriOptions: + maxPoolSize: 1 + useMultipleMongoses: false + observeEvents: + - commandFailedEvent + - commandSucceededEvent + - connectionCheckedOutEvent + - connectionCheckedInEvent + - connectionClosedEvent + - database: + id: &database test + client: *client + databaseName: *database + - collection: + id: &collection coll + database: *database + collectionName: *collection + +initialData: + - collectionName: *collection + databaseName: *database + documents: [] + +tests: + - description: "Write operation with successful pending read" + operations: + # Create a failpoint to block the first operation + - name: failPoint + object: testRunner + arguments: + client: *failPointClient + failPoint: + configureFailPoint: failCommand + mode: { times: 1 } + data: + failCommands: ["insert"] + blockConnection: true + blockTimeMS: 750 + + # Execute operation with timeout less than block time + - name: insertOne + object: *collection + arguments: + timeoutMS: 50 + document: { _id: 3, x: 1 } + expectError: + isTimeoutError: true + + # Execute a subsequent operation to complete the read + - name: findOne + object: *collection + arguments: + filter: { _id: 1 } + + expectEvents: + - client: *client + events: + - commandFailedEvent: + commandName: insert + - commandSucceededEvent: + commandName: find + - client: *client + eventType: cmap + events: + - connectionCheckedOutEvent: {} # insert + - connectionCheckedInEvent: {} # insert fails + - connectionCheckedOutEvent: {} # find + - connectionCheckedInEvent: {} # find succeeds + + - description: "Concurrent write operation with successful pending read" + operations: + # Create a failpoint to block the first operation + - name: failPoint + object: testRunner + arguments: + client: *failPointClient + failPoint: + configureFailPoint: failCommand + mode: { times: 1 } + data: + failCommands: ["insert"] + blockConnection: true + blockTimeMS: 750 + + # Start threads. + - name: createEntities + object: testRunner + arguments: + entities: + - thread: + id: &thread0 thread0 + - thread: + id: &thread1 thread1 + + # Run an insert in two threads. We expect the first to time out and the + # second to finish the pending read from the first and complete + # successfully. + - name: runOnThread + object: testRunner + arguments: + thread: *thread0 + operation: + name: insertOne + object: *collection + arguments: + timeoutMS: 500 + document: + _id: 2 + expectError: + isTimeoutError: true + + # Ensure the first thread checks out a connection before executing the + # operation in the second thread. This maintains concurrent behavior but + # presents the worst case scenario. + - name: waitForEvent + object: testRunner + arguments: + client: *client + event: + connectionCheckedOutEvent: {} + count: 1 + + - name: runOnThread + object: testRunner + arguments: + thread: *thread1 + operation: + name: insertOne + object: *collection + arguments: + document: + _id: 3 + + # Stop threads. 
+ - name: waitForThread + object: testRunner + arguments: + thread: *thread1 + + expectEvents: + - client: *client + events: + - commandFailedEvent: + commandName: insert + - commandSucceededEvent: + commandName: insert + - client: *client + eventType: cmap + events: + - connectionCheckedOutEvent: {} # insert + - connectionCheckedInEvent: {} # insert fails + - connectionCheckedOutEvent: {} # find + - connectionCheckedInEvent: {} # find succeeds + + - description: "Write operation with unsuccessful pending read" + operations: + # Create a failpoint to block the first operation + - name: failPoint + object: testRunner + arguments: + client: *failPointClient + failPoint: + configureFailPoint: failCommand + mode: { times: 1 } + data: + failCommands: ["insert"] + blockConnection: true + blockTimeMS: 2100 + + # Execute operation with timeout less than block time + - name: insertOne + object: *collection + arguments: + timeoutMS: 50 + document: { _id: 3, x: 1 } + expectError: + isTimeoutError: true + + # The pending read should fail + - name: insertOne + object: *collection + arguments: + timeoutMS: 2000 + document: { _id: 3, x: 1 } + expectError: + isTimeoutError: true + + expectEvents: + - client: *client + events: + - commandFailedEvent: + commandName: insert + # No second failed event since we timed out attempting to check out + # the connection for the second operation + - client: *client + eventType: cmap + events: + - connectionCheckedOutEvent: {} # first insert + - connectionCheckedInEvent: {} # first insert fails + - connectionClosedEvent: # second insert times out pending read in checkout, closes + reason: error + + - description: "Read operation with successful pending read" + operations: + # Create a failpoint to block the first operation + - name: failPoint + object: testRunner + arguments: + client: *failPointClient + failPoint: + configureFailPoint: failCommand + mode: { times: 1 } + data: + failCommands: ["find"] + blockConnection: true + blockTimeMS: 750 + + # Execute operation with timeout less than block time + - name: findOne + object: *collection + arguments: + timeoutMS: 50 + filter: { _id: 1 } + expectError: + isTimeoutError: true + + # Execute a subsequent operation to complete the read + - name: findOne + object: *collection + arguments: + filter: { _id: 1 } + + expectEvents: + - client: *client + events: + - commandFailedEvent: + commandName: find + - commandSucceededEvent: + commandName: find + - client: *client + eventType: cmap + events: + - connectionCheckedOutEvent: {} # first find + - connectionCheckedInEvent: {} # first find fails + - connectionCheckedOutEvent: {} # second find + - connectionCheckedInEvent: {} # second find succeeds + + - description: "Read operation with unsuccessful pending read" + operations: + # Create a failpoint to block the first operation + - name: failPoint + object: testRunner + arguments: + client: *failPointClient + failPoint: + configureFailPoint: failCommand + mode: { times: 1 } + data: + failCommands: ["find"] + blockConnection: true + blockTimeMS: 2100 + + # Execute operation with timeout less than block time + - name: findOne + object: *collection + arguments: + timeoutMS: 50 + filter: { _id: 1 } + expectError: + isTimeoutError: true + + # The pending read should fail + - name: findOne + object: *collection + arguments: + timeoutMS: 2000 + filter: { _id: 1 } + expectError: + isTimeoutError: true + + expectEvents: + - client: *client + events: + - commandFailedEvent: + commandName: find + # No second failed event since we timed out 
attempting to check out + # the connection for the second operation + - client: *client + eventType: cmap + events: + - connectionCheckedOutEvent: {} # first find + - connectionCheckedInEvent: {} # first find fails + - connectionClosedEvent: # second find times out pending read in checkout, closes + reason: error diff --git a/x/mongo/driver/operation.go b/x/mongo/driver/operation.go index 2597a5de66..cc9f631a99 100644 --- a/x/mongo/driver/operation.go +++ b/x/mongo/driver/operation.go @@ -785,6 +785,14 @@ func (op Operation) Execute(ctx context.Context) error { if moreToCome { roundTrip = op.moreToComeRoundTrip } + + // Set context values to handle a pending read in case of a socket + // timeout. + if maxTimeMS != 0 { + ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) + ctx = driverutil.WithRequestID(ctx, startedInfo.requestID) + } + res, err = roundTrip(ctx, conn, *wm) if ep, ok := srvr.(ErrorProcessor); ok { diff --git a/x/mongo/driver/topology/connection.go b/x/mongo/driver/topology/connection.go index 24ad6a3a51..85d2aecf9c 100644 --- a/x/mongo/driver/topology/connection.go +++ b/x/mongo/driver/topology/connection.go @@ -47,6 +47,12 @@ var ( func nextConnectionID() uint64 { return atomic.AddUint64(&globalConnectionID, 1) } +type pendingReadState struct { + remainingBytes int32 + requestID int32 + remainingTime *time.Duration +} + type connection struct { // state must be accessed using the atomic package and should be at the beginning of the struct. // - atomic bug: https://pkg.go.dev/sync/atomic#pkg-note-BUG @@ -82,9 +88,11 @@ type connection struct { // accessTokens in the OIDC authenticator cache. oidcTokenGenID uint64 - // awaitRemainingBytes indicates the size of server response that was not completely - // read before returning the connection to the pool. - awaitRemainingBytes *int32 + // pendingReadState contains information required to attempt a pending read + // in the event of a socket timeout for an operation that has appended + // maxTimeMS to the wire message. + pendingReadState *pendingReadState + pendingReadMu sync.Mutex } // newConnection handles the creation of a connection. It does not connect the connection. @@ -407,11 +415,14 @@ func (c *connection) readWireMessage(ctx context.Context) ([]byte, error) { dst, errMsg, err := c.read(ctx) if err != nil { - if c.awaitRemainingBytes == nil { - // If the connection was not marked as awaiting response, close the - // connection because we don't know what the connection state is. + c.pendingReadMu.Lock() + if c.pendingReadState == nil { + // If there is no pending read on the connection, use the pre-CSOT + // behavior and close the connection because we don't know if there are + // other bytes left to read. c.close() } + c.pendingReadMu.Unlock() message := errMsg if errors.Is(err, io.EOF) { message = "socket was unexpectedly closed" @@ -476,8 +487,15 @@ func (c *connection) read(ctx context.Context) (bytesRead []byte, errMsg string, // reading messages from an exhaust cursor. 
n, err := io.ReadFull(c.nc, sizeBuf[:])
 	if err != nil {
-		if l := int32(n); l == 0 && isCSOTTimeout(err) {
-			c.awaitRemainingBytes = &l
+		if l := int32(n); l == 0 && isCSOTTimeout(err) && driverutil.HasMaxTimeMS(ctx) {
+			requestID, _ := driverutil.GetRequestID(ctx)
+
+			c.pendingReadMu.Lock()
+			c.pendingReadState = &pendingReadState{
+				remainingBytes: l,
+				requestID:      requestID,
+			}
+			c.pendingReadMu.Unlock()
 		}
 		return nil, "incomplete read of message header", err
 	}
@@ -492,8 +510,15 @@ func (c *connection) read(ctx context.Context) (bytesRead []byte, errMsg string,
 	n, err = io.ReadFull(c.nc, dst[4:])
 	if err != nil {
 		remainingBytes := size - 4 - int32(n)
-		if remainingBytes > 0 && isCSOTTimeout(err) {
-			c.awaitRemainingBytes = &remainingBytes
+		if remainingBytes > 0 && isCSOTTimeout(err) && driverutil.HasMaxTimeMS(ctx) {
+			requestID, _ := driverutil.GetRequestID(ctx)
+
+			c.pendingReadMu.Lock()
+			c.pendingReadState = &pendingReadState{
+				remainingBytes: remainingBytes,
+				requestID:      requestID,
+			}
+			c.pendingReadMu.Unlock()
 		}
 		return dst, "incomplete read of full message", err
 	}
diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go
index d6568e844f..7a596cc77f 100644
--- a/x/mongo/driver/topology/pool.go
+++ b/x/mongo/driver/topology/pool.go
@@ -8,6 +8,7 @@ package topology
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"io"
 	"net"
@@ -18,6 +19,7 @@ import (
 	"go.mongodb.org/mongo-driver/v2/bson"
 	"go.mongodb.org/mongo-driver/v2/event"
 	"go.mongodb.org/mongo-driver/v2/internal/logger"
+	"go.mongodb.org/mongo-driver/v2/internal/ptrutil"
 	"go.mongodb.org/mongo-driver/v2/mongo/address"
 	"go.mongodb.org/mongo-driver/v2/x/mongo/driver"
 )
@@ -576,6 +578,10 @@ func (p *pool) checkOut(ctx context.Context) (conn *connection, err error) {
 		return nil, w.err
 	}
 
+	if err := awaitPendingRead(ctx, p, w.conn); err != nil {
+		return nil, err
+	}
+
 	duration = time.Since(start)
 	if mustLogPoolMessage(p) {
 		keysAndValues := logger.KeyValues{
@@ -632,6 +638,10 @@ func (p *pool) checkOut(ctx context.Context) (conn *connection, err error) {
 		return nil, w.err
 	}
 
+	if err := awaitPendingRead(ctx, p, w.conn); err != nil {
+		return nil, err
+	}
+
 	duration := time.Since(start)
 	if mustLogPoolMessage(p) {
 		keysAndValues := logger.KeyValues{
@@ -771,82 +781,190 @@ func (p *pool) removeConnection(conn *connection, reason reason, err error) erro
 	return nil
 }
 
-var (
-	// BGReadTimeout is the maximum amount of the to wait when trying to read
-	// the server reply on a connection after an operation timed out. The
-	// default is 400ms.
-	//
-	// Deprecated: BGReadTimeout is intended for internal use only and may be
-	// removed or modified at any time.
-	BGReadTimeout = 400 * time.Millisecond
+// PendingReadTimeout is the maximum amount of time to wait when trying to read
+// the server reply on a connection after an operation timed out. The
+// default is 2 seconds.
+//
+// Deprecated: PendingReadTimeout is intended for internal use only and may be
+// removed or modified at any time.
+var PendingReadTimeout = 2000 * time.Millisecond
+
+// awaitPendingRead sets a new read deadline on the provided connection and
+// tries to read any bytes returned by the server. If there are any errors, the
+// connection will be checked back into the pool to be retried.
+func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error {
+	conn.pendingReadMu.Lock()
+	defer conn.pendingReadMu.Unlock()
+
+	// If there are no bytes pending read, do nothing.
+ if conn.pendingReadState == nil { + return nil + } - // BGReadCallback is a callback for monitoring the behavior of the - // background-read-on-timeout connection preserving mechanism. - // - // Deprecated: BGReadCallback is intended for internal use only and may be - // removed or modified at any time. - BGReadCallback func(addr string, start, read time.Time, errs []error, connClosed bool) -) + prs := conn.pendingReadState + if prs.remainingTime == nil { + prs.remainingTime = ptrutil.Ptr(PendingReadTimeout) + } -// bgRead sets a new read deadline on the provided connection and tries to read -// any bytes returned by the server. If successful, it checks the connection -// into the provided pool. If there are any errors, it closes the connection. -// -// It calls the package-global BGReadCallback function, if set, with the -// address, timings, and any errors that occurred. -func bgRead(pool *pool, conn *connection, size int32) { - var err error - start := time.Now() + if mustLogPoolMessage(pool) { + keysAndValues := logger.KeyValues{ + logger.KeyDriverConnectionID, conn.driverConnectionID, + logger.KeyRequestID, prs.requestID, + } + + logPoolMessage(pool, logger.ConnectionPendingReadStarted, keysAndValues...) + } + + if pool.monitor != nil { + event := &event.PoolEvent{ + Type: event.ConnectionPendingReadStarted, + ConnectionID: conn.driverConnectionID, + RequestID: prs.requestID, + } + + pool.monitor.Event(event) + } + + size := prs.remainingBytes + + checkIn := false + var someErr error defer func() { - read := time.Now() - errs := make([]error, 0) - connClosed := false - if err != nil { - errs = append(errs, err) - connClosed = true - err = conn.close() - if err != nil { - errs = append(errs, fmt.Errorf("error closing conn after reading: %w", err)) + if mustLogPoolMessage(pool) && someErr != nil { + keysAndValues := logger.KeyValues{ + logger.KeyDriverConnectionID, conn.driverConnectionID, + logger.KeyRequestID, prs.requestID, + logger.KeyReason, someErr.Error(), + logger.KeyRemainingTimeMS, *prs.remainingTime, } + + logPoolMessage(pool, logger.ConnectionPendingReadFailed, keysAndValues...) + } + + if pool.monitor != nil && someErr != nil { + event := &event.PoolEvent{ + Type: event.ConnectionPendingReadFailed, + Address: pool.address.String(), + ConnectionID: conn.driverConnectionID, + RequestID: prs.requestID, + RemainingTime: *prs.remainingTime, + Reason: someErr.Error(), + Error: someErr, + } + + pool.monitor.Event(event) + } + + // If we have exceeded the time limit, then close the connection. + if prs.remainingTime != nil && *prs.remainingTime < 0 { + if err := conn.close(); err != nil { + panic(err) + } + + return + } + + if !checkIn { + return } // No matter what happens, always check the connection back into the // pool, which will either make it available for other operations or // remove it from the pool if it was closed. - err = pool.checkInNoEvent(conn) - if err != nil { - errs = append(errs, fmt.Errorf("error checking in: %w", err)) - } + // + // TODO(GODRIVER-3385): Figure out how to handle this error. It's possible + // that a single connection can be checked out to handle multiple concurrent + // operations. This is likely a bug in the Go Driver. So it's possible that + // the connection is idle at the point of check-in. + _ = pool.checkInNoEvent(conn) + }() - if BGReadCallback != nil { - BGReadCallback(conn.addr.String(), start, read, errs, connClosed) + dl, contextDeadlineUsed := ctx.Deadline() + if !contextDeadlineUsed { + // If there is a remainingTime, use that. 
If not, use the static
+		// PendingReadTimeout. This is required since a user could provide a timeout
+		// for the first try that does not exceed the pending read timeout, fail,
+		// and then not use a timeout for a subsequent try.
+		if prs.remainingTime != nil {
+			dl = time.Now().Add(*prs.remainingTime)
+		} else {
+			dl = time.Now().Add(PendingReadTimeout)
 		}
-	}()
+	}
 
-	err = conn.nc.SetReadDeadline(time.Now().Add(BGReadTimeout))
+	err := conn.nc.SetReadDeadline(dl)
 	if err != nil {
-		err = fmt.Errorf("error setting a read deadline: %w", err)
-		return
+		checkIn = true
+
+		someErr = fmt.Errorf("error setting a read deadline: %w", err)
+
+		return someErr
 	}
 
-	if size == 0 {
+	st := time.Now()
+
+	if size == 0 { // Question: Would this always equal zero?
 		var sizeBuf [4]byte
-		_, err = io.ReadFull(conn.nc, sizeBuf[:])
-		if err != nil {
-			err = fmt.Errorf("error reading the message size: %w", err)
-			return
+		if _, err := io.ReadFull(conn.nc, sizeBuf[:]); err != nil {
+			prs.remainingTime = ptrutil.Ptr(*prs.remainingTime - time.Since(st))
+			checkIn = true
+
+			err = transformNetworkError(ctx, err, contextDeadlineUsed)
+			someErr = fmt.Errorf("error reading the message size: %w", err)
+
+			return someErr
 		}
 		size, err = conn.parseWmSizeBytes(sizeBuf)
 		if err != nil {
-			return
+			checkIn = true
+			someErr = transformNetworkError(ctx, err, contextDeadlineUsed)
+
+			return someErr
 		}
 		size -= 4
 	}
-	_, err = io.CopyN(io.Discard, conn.nc, int64(size))
+
+	n, err := io.CopyN(io.Discard, conn.nc, int64(size))
 	if err != nil {
-		err = fmt.Errorf("error discarding %d byte message: %w", size, err)
+		// If the read times out, record the bytes left to read before exiting.
+		nerr := net.Error(nil)
+		if l := int32(n); l == 0 && errors.As(err, &nerr) && nerr.Timeout() {
+			prs.remainingBytes = l + prs.remainingBytes
+			prs.remainingTime = ptrutil.Ptr(*prs.remainingTime - time.Since(st))
+		}
+
+		checkIn = true
+
+		err = transformNetworkError(ctx, err, contextDeadlineUsed)
+		someErr = fmt.Errorf("error discarding %d byte message: %w", size, err)
+
+		return someErr
 	}
+
+	if mustLogPoolMessage(pool) {
+		keysAndValues := logger.KeyValues{
+			logger.KeyDriverConnectionID, conn.driverConnectionID,
+			logger.KeyRequestID, prs.requestID,
+		}
+
+		logPoolMessage(pool, logger.ConnectionPendingReadSucceeded, keysAndValues...)
+	}
+
+	if pool.monitor != nil {
+		event := &event.PoolEvent{
+			Type:         event.ConnectionPendingReadSucceeded,
+			Address:      pool.address.String(),
+			ConnectionID: conn.driverConnectionID,
+			Duration:     time.Since(st),
+		}
+
+		pool.monitor.Event(event)
+	}
+
+	conn.pendingReadState = nil
+
+	return nil
 }
 
 // checkIn returns an idle connection to the pool. If the connection is perished or the pool is
@@ -888,21 +1006,6 @@ func (p *pool) checkInNoEvent(conn *connection) error {
 		return ErrWrongPool
 	}
 
-	// If the connection has an awaiting server response, try to read the
-	// response in another goroutine before checking it back into the pool.
-	//
-	// Do this here because we want to publish checkIn events when the operation
-	// is done with the connection, not when it's ready to be used again. That
-	// means that connections in "awaiting response" state are checked in but
-	// not usable, which is not covered by the current pool events. We may need
-	// to add pool event information in the future to communicate that.
- if conn.awaitRemainingBytes != nil { - size := *conn.awaitRemainingBytes - conn.awaitRemainingBytes = nil - go bgRead(p, conn, size) - return nil - } - // Bump the connection idle start time here because we're about to make the // connection "available". The idle start time is used to determine how long // a connection has been idle and when it has reached its max idle time and diff --git a/x/mongo/driver/topology/pool_test.go b/x/mongo/driver/topology/pool_test.go index 3d270de2e0..45640667f8 100644 --- a/x/mongo/driver/topology/pool_test.go +++ b/x/mongo/driver/topology/pool_test.go @@ -18,6 +18,7 @@ import ( "go.mongodb.org/mongo-driver/v2/event" "go.mongodb.org/mongo-driver/v2/internal/assert" "go.mongodb.org/mongo-driver/v2/internal/csot" + "go.mongodb.org/mongo-driver/v2/internal/driverutil" "go.mongodb.org/mongo-driver/v2/internal/eventtest" "go.mongodb.org/mongo-driver/v2/internal/require" "go.mongodb.org/mongo-driver/v2/mongo/address" @@ -1233,24 +1234,10 @@ func TestPool_maintain(t *testing.T) { }) } -func TestBackgroundRead(t *testing.T) { +func TestAwaitPendingRead(t *testing.T) { t.Parallel() - newBGReadCallback := func(errsCh chan []error) func(string, time.Time, time.Time, []error, bool) { - return func(_ string, _, _ time.Time, errs []error, _ bool) { - errsCh <- errs - close(errsCh) - } - } - t.Run("incomplete read of message header", func(t *testing.T) { - errsCh := make(chan []error) - var originalCallback func(string, time.Time, time.Time, []error, bool) - originalCallback, BGReadCallback = BGReadCallback, newBGReadCallback(errsCh) - t.Cleanup(func() { - BGReadCallback = originalCallback - }) - timeout := 10 * time.Millisecond cleanup := make(chan struct{}) @@ -1274,24 +1261,21 @@ func TestBackgroundRead(t *testing.T) { conn, err := p.checkOut(context.Background()) require.NoError(t, err) + ctx, cancel := csot.WithTimeout(context.Background(), &timeout) defer cancel() + + ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) + ctx = driverutil.WithRequestID(ctx, -1) + _, err = conn.readWireMessage(ctx) regex := regexp.MustCompile( `^connection\(.*\[-\d+\]\) incomplete read of message header: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, ) assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) - assert.Nil(t, conn.awaitRemainingBytes, "conn.awaitRemainingBytes should be nil") - close(errsCh) // this line causes a double close if BGReadCallback is ever called. 
+ assert.Nil(t, conn.pendingReadState, "conn.awaitRemainingBytes should be nil") }) t.Run("timeout reading message header, successful background read", func(t *testing.T) { - errsCh := make(chan []error) - var originalCallback func(string, time.Time, time.Time, []error, bool) - originalCallback, BGReadCallback = BGReadCallback, newBGReadCallback(errsCh) - t.Cleanup(func() { - BGReadCallback = originalCallback - }) - timeout := 10 * time.Millisecond addr := bootstrapConnections(t, 1, func(nc net.Conn) { @@ -1305,8 +1289,20 @@ func TestBackgroundRead(t *testing.T) { require.NoError(t, err) }) + var pendingReadError error + monitor := &event.PoolMonitor{ + Event: func(pe *event.PoolEvent) { + if pe.Type == event.ConnectionPendingReadFailed { + pendingReadError = pe.Error + } + }, + } + p := newPool( - poolConfig{Address: address.Address(addr.String())}, + poolConfig{ + Address: address.Address(addr.String()), + PoolMonitor: monitor, + }, ) defer p.close(context.Background()) err := p.ready() @@ -1314,8 +1310,13 @@ func TestBackgroundRead(t *testing.T) { conn, err := p.checkOut(context.Background()) require.NoError(t, err) + ctx, cancel := csot.WithTimeout(context.Background(), &timeout) defer cancel() + + ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) + ctx = driverutil.WithRequestID(ctx, -1) + _, err = conn.readWireMessage(ctx) regex := regexp.MustCompile( `^connection\(.*\[-\d+\]\) incomplete read of message header: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, @@ -1323,22 +1324,13 @@ func TestBackgroundRead(t *testing.T) { assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) err = p.checkIn(conn) require.NoError(t, err) - var bgErrs []error - select { - case bgErrs = <-errsCh: - case <-time.After(3 * time.Second): - assert.Fail(t, "did not receive expected error after waiting for 3 seconds") - } - require.Len(t, bgErrs, 0, "expected no error from bgRead()") + + _, err = p.checkOut(context.Background()) + require.NoError(t, err) + + require.NoError(t, pendingReadError) }) t.Run("timeout reading message header, incomplete head during background read", func(t *testing.T) { - errsCh := make(chan []error) - var originalCallback func(string, time.Time, time.Time, []error, bool) - originalCallback, BGReadCallback = BGReadCallback, newBGReadCallback(errsCh) - t.Cleanup(func() { - BGReadCallback = originalCallback - }) - timeout := 10 * time.Millisecond addr := bootstrapConnections(t, 1, func(nc net.Conn) { @@ -1352,8 +1344,20 @@ func TestBackgroundRead(t *testing.T) { require.NoError(t, err) }) + var pendingReadError error + monitor := &event.PoolMonitor{ + Event: func(pe *event.PoolEvent) { + if pe.Type == event.ConnectionPendingReadFailed { + pendingReadError = pe.Error + } + }, + } + p := newPool( - poolConfig{Address: address.Address(addr.String())}, + poolConfig{ + Address: address.Address(addr.String()), + PoolMonitor: monitor, + }, ) defer p.close(context.Background()) err := p.ready() @@ -1361,8 +1365,13 @@ func TestBackgroundRead(t *testing.T) { conn, err := p.checkOut(context.Background()) require.NoError(t, err) + ctx, cancel := csot.WithTimeout(context.Background(), &timeout) defer cancel() + + ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) + ctx = driverutil.WithRequestID(ctx, -1) + _, err = conn.readWireMessage(ctx) regex := regexp.MustCompile( `^connection\(.*\[-\d+\]\) incomplete read of message header: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, @@ -1370,23 
+1379,13 @@ func TestBackgroundRead(t *testing.T) { assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) err = p.checkIn(conn) require.NoError(t, err) - var bgErrs []error - select { - case bgErrs = <-errsCh: - case <-time.After(3 * time.Second): - assert.Fail(t, "did not receive expected error after waiting for 3 seconds") - } - require.Len(t, bgErrs, 1, "expected 1 error from bgRead()") - assert.EqualError(t, bgErrs[0], "error reading the message size: unexpected EOF") + + _, err = p.checkOut(context.Background()) + require.Error(t, err) + + assert.EqualError(t, pendingReadError, "error reading the message size: unexpected EOF") }) t.Run("timeout reading message header, background read timeout", func(t *testing.T) { - errsCh := make(chan []error) - var originalCallback func(string, time.Time, time.Time, []error, bool) - originalCallback, BGReadCallback = BGReadCallback, newBGReadCallback(errsCh) - t.Cleanup(func() { - BGReadCallback = originalCallback - }) - timeout := 10 * time.Millisecond cleanup := make(chan struct{}) @@ -1404,17 +1403,35 @@ func TestBackgroundRead(t *testing.T) { require.NoError(t, err) }) + var pendingReadError error + monitor := &event.PoolMonitor{ + Event: func(pe *event.PoolEvent) { + if pe.Type == event.ConnectionPendingReadFailed { + pendingReadError = pe.Error + } + }, + } + p := newPool( - poolConfig{Address: address.Address(addr.String())}, + poolConfig{ + Address: address.Address(addr.String()), + PoolMonitor: monitor, + }, ) + defer p.close(context.Background()) err := p.ready() require.NoError(t, err) conn, err := p.checkOut(context.Background()) require.NoError(t, err) + ctx, cancel := csot.WithTimeout(context.Background(), &timeout) defer cancel() + + ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) + ctx = driverutil.WithRequestID(ctx, -1) + _, err = conn.readWireMessage(ctx) regex := regexp.MustCompile( `^connection\(.*\[-\d+\]\) incomplete read of message header: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, @@ -1422,26 +1439,16 @@ func TestBackgroundRead(t *testing.T) { assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) err = p.checkIn(conn) require.NoError(t, err) - var bgErrs []error - select { - case bgErrs = <-errsCh: - case <-time.After(3 * time.Second): - assert.Fail(t, "did not receive expected error after waiting for 3 seconds") - } - require.Len(t, bgErrs, 1, "expected 1 error from bgRead()") + + _, err = p.checkOut(context.Background()) + require.Error(t, err) + wantErr := regexp.MustCompile( `^error discarding 6 byte message: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, ) - assert.True(t, wantErr.MatchString(bgErrs[0].Error()), "error %q does not match pattern %q", bgErrs[0], wantErr) + assert.True(t, wantErr.MatchString(pendingReadError.Error()), "error %q does not match pattern %q", pendingReadError, wantErr) }) t.Run("timeout reading full message, successful background read", func(t *testing.T) { - errsCh := make(chan []error) - var originalCallback func(string, time.Time, time.Time, []error, bool) - originalCallback, BGReadCallback = BGReadCallback, newBGReadCallback(errsCh) - t.Cleanup(func() { - BGReadCallback = originalCallback - }) - timeout := 10 * time.Millisecond addr := bootstrapConnections(t, 1, func(nc net.Conn) { @@ -1458,17 +1465,35 @@ func TestBackgroundRead(t *testing.T) { require.NoError(t, err) }) + var pendingReadError error + monitor := &event.PoolMonitor{ + Event: func(pe 
*event.PoolEvent) { + if pe.Type == event.ConnectionPendingReadFailed { + pendingReadError = pe.Error + } + }, + } + p := newPool( - poolConfig{Address: address.Address(addr.String())}, + poolConfig{ + Address: address.Address(addr.String()), + PoolMonitor: monitor, + }, ) + defer p.close(context.Background()) err := p.ready() require.NoError(t, err) conn, err := p.checkOut(context.Background()) require.NoError(t, err) + ctx, cancel := csot.WithTimeout(context.Background(), &timeout) defer cancel() + + ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) + ctx = driverutil.WithRequestID(ctx, -1) + _, err = conn.readWireMessage(ctx) regex := regexp.MustCompile( `^connection\(.*\[-\d+\]\) incomplete read of full message: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, @@ -1476,22 +1501,13 @@ func TestBackgroundRead(t *testing.T) { assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) err = p.checkIn(conn) require.NoError(t, err) - var bgErrs []error - select { - case bgErrs = <-errsCh: - case <-time.After(3 * time.Second): - assert.Fail(t, "did not receive expected error after waiting for 3 seconds") - } - require.Len(t, bgErrs, 0, "expected no error from bgRead()") + + _, err = p.checkOut(context.Background()) + require.NoError(t, err) + + require.NoError(t, pendingReadError) }) t.Run("timeout reading full message, background read EOF", func(t *testing.T) { - errsCh := make(chan []error) - var originalCallback func(string, time.Time, time.Time, []error, bool) - originalCallback, BGReadCallback = BGReadCallback, newBGReadCallback(errsCh) - t.Cleanup(func() { - BGReadCallback = originalCallback - }) - timeout := 10 * time.Millisecond addr := bootstrapConnections(t, 1, func(nc net.Conn) { @@ -1508,17 +1524,35 @@ func TestBackgroundRead(t *testing.T) { require.NoError(t, err) }) + var pendingReadError error + monitor := &event.PoolMonitor{ + Event: func(pe *event.PoolEvent) { + if pe.Type == event.ConnectionPendingReadFailed { + pendingReadError = pe.Error + } + }, + } + p := newPool( - poolConfig{Address: address.Address(addr.String())}, + poolConfig{ + Address: address.Address(addr.String()), + PoolMonitor: monitor, + }, ) + defer p.close(context.Background()) err := p.ready() require.NoError(t, err) conn, err := p.checkOut(context.Background()) require.NoError(t, err) + ctx, cancel := csot.WithTimeout(context.Background(), &timeout) defer cancel() + + ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) + ctx = driverutil.WithRequestID(ctx, -1) + _, err = conn.readWireMessage(ctx) regex := regexp.MustCompile( `^connection\(.*\[-\d+\]\) incomplete read of full message: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, @@ -1526,14 +1560,11 @@ func TestBackgroundRead(t *testing.T) { assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) err = p.checkIn(conn) require.NoError(t, err) - var bgErrs []error - select { - case bgErrs = <-errsCh: - case <-time.After(3 * time.Second): - assert.Fail(t, "did not receive expected error after waiting for 3 seconds") - } - require.Len(t, bgErrs, 1, "expected 1 error from bgRead()") - assert.EqualError(t, bgErrs[0], "error discarding 3 byte message: EOF") + + _, err = p.checkOut(context.Background()) + require.Error(t, err) + + assert.EqualError(t, pendingReadError, "error discarding 3 byte message: EOF") }) } From 70cfd39c34a11d323e5a708c26084b480d1c87a5 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Thu, 6 Mar 
2025 19:18:30 -0700 Subject: [PATCH 02/23] GODRIVER-3173 Update unified spect tests --- internal/integration/unified/event.go | 15 ++++++++ .../integration/unified/event_verification.go | 17 ++++++++- .../pending-reads.json | 35 ++++++++++++++++++- .../pending-reads.yml | 21 ++++++++--- x/mongo/driver/topology/pool.go | 1 + 5 files changed, 83 insertions(+), 6 deletions(-) diff --git a/internal/integration/unified/event.go b/internal/integration/unified/event.go index 9e6d6d5a35..7792ab949a 100644 --- a/internal/integration/unified/event.go +++ b/internal/integration/unified/event.go @@ -30,6 +30,9 @@ const ( connectionCheckOutFailedEvent monitoringEventType = "ConnectionCheckOutFailedEvent" connectionCheckedOutEvent monitoringEventType = "ConnectionCheckedOutEvent" connectionCheckedInEvent monitoringEventType = "ConnectionCheckedInEvent" + connectionPendingReadStarted monitoringEventType = "ConnectionPendingReadStarted" + connectionPendingReadSucceeded monitoringEventType = "ConnectionPendingReadSucceeded" + connectionPendingReadFailed monitoringEventType = "ConnectionPendingReadFailed" serverDescriptionChangedEvent monitoringEventType = "ServerDescriptionChangedEvent" serverHeartbeatFailedEvent monitoringEventType = "ServerHeartbeatFailedEvent" serverHeartbeatStartedEvent monitoringEventType = "ServerHeartbeatStartedEvent" @@ -67,6 +70,12 @@ func monitoringEventTypeFromString(eventStr string) (monitoringEventType, bool) return connectionCheckedOutEvent, true case "connectioncheckedinevent": return connectionCheckedInEvent, true + case "connectionpendingreadstarted": + return connectionPendingReadStarted, true + case "connectionpendingreadsucceeded": + return connectionPendingReadSucceeded, true + case "connectionpendingreadfailed": + return connectionPendingReadFailed, true case "serverdescriptionchangedevent": return serverDescriptionChangedEvent, true case "serverheartbeatfailedevent": @@ -106,6 +115,12 @@ func monitoringEventTypeFromPoolEvent(evt *event.PoolEvent) monitoringEventType return connectionCheckedOutEvent case event.ConnectionCheckedIn: return connectionCheckedInEvent + case event.ConnectionPendingReadStarted: + return connectionPendingReadStarted + case event.ConnectionPendingReadSucceeded: + return connectionPendingReadSucceeded + case event.ConnectionPendingReadFailed: + return connectionPendingReadFailed default: return "" } diff --git a/internal/integration/unified/event_verification.go b/internal/integration/unified/event_verification.go index ebdb0b19c3..c488fe0465 100644 --- a/internal/integration/unified/event_verification.go +++ b/internal/integration/unified/event_verification.go @@ -56,7 +56,10 @@ type cmapEvent struct { Reason *string `bson:"reason"` } `bson:"connectionCheckOutFailedEvent"` - ConnectionCheckedInEvent *struct{} `bson:"connectionCheckedInEvent"` + ConnectionCheckedInEvent *struct{} `bson:"connectionCheckedInEvent"` + ConnectionPendingReadStarted *struct{} `bson:"connectionPendingReadStarted"` + ConnectionPendingreadSucceeded *struct{} `bson:"connectionPendingReadSucceeded"` + ConnectionPendingReadFailed *struct{} `bson:"connectionPendingReadFailed"` PoolClearedEvent *struct { HasServiceID *bool `bson:"hasServiceId"` @@ -350,6 +353,18 @@ func verifyCMAPEvents(client *clientEntity, expectedEvents *expectedEvents) erro if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionCheckedIn); err != nil { return newEventVerificationError(idx, client, "failed to get next pool event: %v", err.Error()) } + case evt.ConnectionPendingReadStarted != nil: + 
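		// Like the checked-out/checked-in cases handled above, each pending-read
		// case simply consumes the next pool event of the matching type; an
		// out-of-order or missing event surfaces as a verification error.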
if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionPendingReadStarted); err != nil { + return newEventVerificationError(idx, client, "failed to get next pool event: %v", err.Error()) + } + case evt.ConnectionPendingreadSucceeded != nil: + if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionPendingReadSucceeded); err != nil { + return newEventVerificationError(idx, client, "failed to get next pool event: %v", err.Error()) + } + case evt.ConnectionPendingReadFailed != nil: + if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionPendingReadFailed); err != nil { + return newEventVerificationError(idx, client, "failed to get next pool event: %v", err.Error()) + } case evt.PoolClearedEvent != nil: var actual *event.PoolEvent if actual, pooled, err = getNextPoolEvent(pooled, event.ConnectionPoolCleared); err != nil { diff --git a/testdata/client-side-operations-timeout/pending-reads.json b/testdata/client-side-operations-timeout/pending-reads.json index 43b2745c7c..3fd2e8b2ef 100644 --- a/testdata/client-side-operations-timeout/pending-reads.json +++ b/testdata/client-side-operations-timeout/pending-reads.json @@ -29,7 +29,10 @@ "commandSucceededEvent", "connectionCheckedOutEvent", "connectionCheckedInEvent", - "connectionClosedEvent" + "connectionClosedEvent", + "connectionPendingReadSucceeded", + "connectionPendingReadStarted", + "connectionPendingReadFailed" ] } }, @@ -129,6 +132,12 @@ { "connectionCheckedInEvent": {} }, + { + "connectionPendingReadStarted": {} + }, + { + "connectionPendingReadSucceeded": {} + }, { "connectionCheckedOutEvent": {} }, @@ -261,6 +270,12 @@ { "connectionCheckedInEvent": {} }, + { + "connectionPendingReadStarted": {} + }, + { + "connectionPendingReadSucceeded": {} + }, { "connectionCheckedOutEvent": {} }, @@ -344,6 +359,12 @@ { "connectionCheckedInEvent": {} }, + { + "connectionPendingReadStarted": {} + }, + { + "connectionPendingReadFailed": {} + }, { "connectionClosedEvent": { "reason": "error" @@ -425,6 +446,12 @@ { "connectionCheckedInEvent": {} }, + { + "connectionPendingReadStarted": {} + }, + { + "connectionPendingReadSucceeded": {} + }, { "connectionCheckedOutEvent": {} }, @@ -506,6 +533,12 @@ { "connectionCheckedInEvent": {} }, + { + "connectionPendingReadStarted": {} + }, + { + "connectionPendingReadFailed": {} + }, { "connectionClosedEvent": { "reason": "error" diff --git a/testdata/client-side-operations-timeout/pending-reads.yml b/testdata/client-side-operations-timeout/pending-reads.yml index c07728009e..fe5344292e 100644 --- a/testdata/client-side-operations-timeout/pending-reads.yml +++ b/testdata/client-side-operations-timeout/pending-reads.yml @@ -25,6 +25,9 @@ createEntities: - connectionCheckedOutEvent - connectionCheckedInEvent - connectionClosedEvent + - connectionPendingReadSucceeded + - connectionPendingReadStarted + - connectionPendingReadFailed - database: id: &database test client: *client @@ -80,10 +83,12 @@ tests: - client: *client eventType: cmap events: - - connectionCheckedOutEvent: {} # insert - - connectionCheckedInEvent: {} # insert fails - - connectionCheckedOutEvent: {} # find - - connectionCheckedInEvent: {} # find succeeds + - connectionCheckedOutEvent: {} + - connectionCheckedInEvent: {} # insert fails + - connectionPendingReadStarted: {} + - connectionPendingReadSucceeded: {} # find op drains conn + - connectionCheckedOutEvent: {} + - connectionCheckedInEvent: {} # find succeeds - description: "Concurrent write operation with successful pending read" operations: @@ -167,6 +172,8 @@ tests: events: - 
connectionCheckedOutEvent: {} # insert - connectionCheckedInEvent: {} # insert fails + - connectionPendingReadStarted: {} + - connectionPendingReadSucceeded: {} - connectionCheckedOutEvent: {} # find - connectionCheckedInEvent: {} # find succeeds @@ -215,6 +222,8 @@ tests: events: - connectionCheckedOutEvent: {} # first insert - connectionCheckedInEvent: {} # first insert fails + - connectionPendingReadStarted: {} + - connectionPendingReadFailed: {} - connectionClosedEvent: # second insert times out pending read in checkout, closes reason: error @@ -260,6 +269,8 @@ tests: events: - connectionCheckedOutEvent: {} # first find - connectionCheckedInEvent: {} # first find fails + - connectionPendingReadStarted: {} + - connectionPendingReadSucceeded: {} - connectionCheckedOutEvent: {} # second find - connectionCheckedInEvent: {} # second find succeeds @@ -308,5 +319,7 @@ tests: events: - connectionCheckedOutEvent: {} # first find - connectionCheckedInEvent: {} # first find fails + - connectionPendingReadStarted: {} + - connectionPendingReadFailed: {} - connectionClosedEvent: # second find times out pending read in checkout, closes reason: error diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index 7a596cc77f..228773cebc 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -818,6 +818,7 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { if pool.monitor != nil { event := &event.PoolEvent{ Type: event.ConnectionPendingReadStarted, + Address: pool.address.String(), ConnectionID: conn.driverConnectionID, RequestID: prs.requestID, } From f060a090e49c4300f869877ad80a9a02c8c07191 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Wed, 19 Mar 2025 13:19:43 -0600 Subject: [PATCH 03/23] GODRIVER-3173 Refresh pending read every 4KB --- .../pending-reads.json | 8 +-- .../pending-reads.yml | 8 +-- x/mongo/driver/topology/pool.go | 60 +++++++++++++++---- 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/testdata/client-side-operations-timeout/pending-reads.json b/testdata/client-side-operations-timeout/pending-reads.json index 3fd2e8b2ef..ef7c20dedb 100644 --- a/testdata/client-side-operations-timeout/pending-reads.json +++ b/testdata/client-side-operations-timeout/pending-reads.json @@ -304,7 +304,7 @@ "insert" ], "blockConnection": true, - "blockTimeMS": 2100 + "blockTimeMS": 500 } } } @@ -327,7 +327,7 @@ "name": "insertOne", "object": "coll", "arguments": { - "timeoutMS": 2000, + "timeoutMS": 400, "document": { "_id": 3, "x": 1 @@ -480,7 +480,7 @@ "find" ], "blockConnection": true, - "blockTimeMS": 2100 + "blockTimeMS": 500 } } } @@ -502,7 +502,7 @@ "name": "findOne", "object": "coll", "arguments": { - "timeoutMS": 2000, + "timeoutMS": 400, "filter": { "_id": 1 } diff --git a/testdata/client-side-operations-timeout/pending-reads.yml b/testdata/client-side-operations-timeout/pending-reads.yml index fe5344292e..c76f6bf1e2 100644 --- a/testdata/client-side-operations-timeout/pending-reads.yml +++ b/testdata/client-side-operations-timeout/pending-reads.yml @@ -190,7 +190,7 @@ tests: data: failCommands: ["insert"] blockConnection: true - blockTimeMS: 2100 + blockTimeMS: 500 # Execute operation with timeout less than block time - name: insertOne @@ -205,7 +205,7 @@ tests: - name: insertOne object: *collection arguments: - timeoutMS: 2000 + timeoutMS: 400 document: { _id: 3, x: 1 } expectError: isTimeoutError: true @@ -287,7 +287,7 @@ tests: data: failCommands: ["find"] blockConnection: true - 
blockTimeMS: 2100 + blockTimeMS: 500 # Execute operation with timeout less than block time - name: findOne @@ -302,7 +302,7 @@ tests: - name: findOne object: *collection arguments: - timeoutMS: 2000 + timeoutMS: 400 filter: { _id: 1 } expectError: isTimeoutError: true diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index 228773cebc..735f7fcede 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -783,11 +783,12 @@ func (p *pool) removeConnection(conn *connection, reason reason, err error) erro // PendingReadTimeout is the maximum amount of the to wait when trying to read // the server reply on a connection after an operation timed out. The -// default is 1 second. +// default is 400 milliseconds. This value is refreshed for every 4KB read from +// the TCP stream. // // Deprecated: PendingReadTimeout is intended for internal use only and may be // removed or modified at any time. -var PendingReadTimeout = 2000 * time.Millisecond +var PendingReadTimeout = 400 * time.Millisecond // awaitPendingRead sets a new read deadline on the provided connection and // tries to read any bytes returned by the server. If there are any errors, the @@ -926,21 +927,54 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { size -= 4 } - n, err := io.CopyN(io.Discard, conn.nc, int64(size)) - if err != nil { - // If the read times out, record the bytes left to read before exiting. - nerr := net.Error(nil) - if l := int32(n); l == 0 && errors.As(err, &nerr) && nerr.Timeout() { - prs.remainingBytes = l + prs.remainingBytes - prs.remainingTime = ptrutil.Ptr(*prs.remainingTime - time.Since(st)) + const bufSize = 4096 + buf := make([]byte, bufSize) + + var totalRead int64 + + // Iterate every 4KB of the TCP stream, refreshing the remainingTimeout for + // each successful read to avoid closing while streaming large (upto 16MiB) + // response messages. + for totalRead < int64(size) { + newDeadline := time.Now().Add(*prs.remainingTime) + if err := conn.nc.SetReadDeadline(newDeadline); err != nil { + checkIn = true + someErr = fmt.Errorf("error renewing read deadline: %w", err) + + return someErr } - checkIn = true + remaining := int64(size) - totalRead - err = transformNetworkError(ctx, err, contextDeadlineUsed) - someErr = fmt.Errorf("error discarding %d byte message: %w", size, err) + readSize := bufSize + if int64(readSize) > remaining { + readSize = int(remaining) + } - return someErr + n, err := conn.nc.Read(buf[:readSize]) + if n > 0 { + totalRead += int64(n) + + // Refresh the remaining time if we get are receiving data. + prs.remainingTime = ptrutil.Ptr(PendingReadTimeout) + } + + if err != nil { + // If the read times out, record the bytes left to read before exiting. + // Reduce the remainingTime. 
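			// A zero-byte read combined with a net.Error timeout means the deadline
			// expired before any more of the reply arrived; the remaining byte count
			// and time are updated so a later checkout can resume the drain. Any
			// other error (for example EOF) is treated as fatal for this pending read.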
+ nerr := net.Error(nil) + if l := int32(n); l == 0 && errors.As(err, &nerr) && nerr.Timeout() { + prs.remainingBytes = l + prs.remainingBytes + prs.remainingTime = ptrutil.Ptr(*prs.remainingTime - time.Since(st)) + } + + checkIn = true + + err = transformNetworkError(ctx, err, contextDeadlineUsed) + someErr = fmt.Errorf("error discarding %d byte message: %w", size, err) + + return someErr + } } if mustLogPoolMessage(pool) { From 61a0fc668602d6d7f90f8c18b8767c6af19b0194 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Wed, 16 Apr 2025 10:29:22 -0600 Subject: [PATCH 04/23] GODRIVER-3173 Update APR --- bson/default_value_encoders.go | 2 + internal/integration/csot_prose_test.go | 4 + .../unified/unified_spec_runner.go | 2 + x/mongo/driver/topology/connection.go | 11 +- x/mongo/driver/topology/pool.go | 298 +++++++++++------- x/mongo/driver/topology/pool_test.go | 55 ++++ 6 files changed, 259 insertions(+), 113 deletions(-) diff --git a/bson/default_value_encoders.go b/bson/default_value_encoders.go index bd5a20f2f9..83c409375d 100644 --- a/bson/default_value_encoders.go +++ b/bson/default_value_encoders.go @@ -9,6 +9,7 @@ package bson import ( "encoding/json" "errors" + "fmt" "math" "net/url" "reflect" @@ -165,6 +166,7 @@ func decimal128EncodeValue(_ EncodeContext, vw ValueWriter, val reflect.Value) e if !val.IsValid() || val.Type() != tDecimal { return ValueEncoderError{Name: "Decimal128EncodeValue", Types: []reflect.Type{tDecimal}, Received: val} } + fmt.Println(val.Interface().(Decimal128)) return vw.WriteDecimal128(val.Interface().(Decimal128)) } diff --git a/internal/integration/csot_prose_test.go b/internal/integration/csot_prose_test.go index ce7219b042..dd71f35ee2 100644 --- a/internal/integration/csot_prose_test.go +++ b/internal/integration/csot_prose_test.go @@ -176,6 +176,7 @@ func TestCSOTProse(t *testing.T) { time.Millisecond, "expected ping to fail within 150ms") }) + }) mt.RunOpts("11. multi-batch bulkWrites", mtest.NewOptions().MinServerVersion("8.0"). 
@@ -232,6 +233,9 @@ func TestCSOTProse(t *testing.T) { assert.ErrorIs(mt, err, context.DeadlineExceeded, "expected a timeout error, got: %v", err) assert.Equal(mt, 2, cnt, "expected bulkWrite calls: %d, got: %d", 2, cnt) }) + + //mt.Run("pending read", func(mt *mtest.T) { + //}) } func TestCSOTProse_GridFS(t *testing.T) { diff --git a/internal/integration/unified/unified_spec_runner.go b/internal/integration/unified/unified_spec_runner.go index a81c61407c..cb8a98ae22 100644 --- a/internal/integration/unified/unified_spec_runner.go +++ b/internal/integration/unified/unified_spec_runner.go @@ -122,6 +122,8 @@ func runTestFile(t *testing.T, filepath string, expectValidFail bool, opts ...*O defer func() { // catch panics from looking up elements and fail if it's unexpected if r := recover(); r != nil { + + panic(r) if !expectValidFail { mt.Fatal(r) } diff --git a/x/mongo/driver/topology/connection.go b/x/mongo/driver/topology/connection.go index 85d2aecf9c..e22918beae 100644 --- a/x/mongo/driver/topology/connection.go +++ b/x/mongo/driver/topology/connection.go @@ -51,6 +51,7 @@ type pendingReadState struct { remainingBytes int32 requestID int32 remainingTime *time.Duration + start time.Time } type connection struct { @@ -490,12 +491,13 @@ func (c *connection) read(ctx context.Context) (bytesRead []byte, errMsg string, if l := int32(n); l == 0 && isCSOTTimeout(err) && driverutil.HasMaxTimeMS(ctx) { requestID, _ := driverutil.GetRequestID(ctx) - c.pendingReadMu.Lock() + //c.pendingReadMu.Lock() c.pendingReadState = &pendingReadState{ remainingBytes: l, requestID: requestID, + start: time.Now(), } - c.pendingReadMu.Unlock() + //c.pendingReadMu.Unlock() } return nil, "incomplete read of message header", err } @@ -513,12 +515,13 @@ func (c *connection) read(ctx context.Context) (bytesRead []byte, errMsg string, if remainingBytes > 0 && isCSOTTimeout(err) && driverutil.HasMaxTimeMS(ctx) { requestID, _ := driverutil.GetRequestID(ctx) - c.pendingReadMu.Lock() + //c.pendingReadMu.Lock() c.pendingReadState = &pendingReadState{ remainingBytes: remainingBytes, requestID: requestID, + start: time.Now(), } - c.pendingReadMu.Unlock() + //c.pendingReadMu.Unlock() } return dst, "incomplete read of full message", err } diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index 735f7fcede..4ca494392e 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -7,6 +7,7 @@ package topology import ( + "bufio" "context" "errors" "fmt" @@ -19,7 +20,6 @@ import ( "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/event" "go.mongodb.org/mongo-driver/v2/internal/logger" - "go.mongodb.org/mongo-driver/v2/internal/ptrutil" "go.mongodb.org/mongo-driver/v2/mongo/address" "go.mongodb.org/mongo-driver/v2/x/mongo/driver" ) @@ -790,23 +790,15 @@ func (p *pool) removeConnection(conn *connection, reason reason, err error) erro // removed or modified at any time. var PendingReadTimeout = 400 * time.Millisecond -// awaitPendingRead sets a new read deadline on the provided connection and -// tries to read any bytes returned by the server. If there are any errors, the -// connection will be checked back into the pool to be retried. -func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { - conn.pendingReadMu.Lock() - defer conn.pendingReadMu.Unlock() - - // If there are no bytes pending read, do nothing. 
- if conn.pendingReadState == nil { - return nil - } - +// publishPendingReadStarted will log a message to the pool logger and +// publish an event to the pool monitor if they are set. +func publishPendingReadStarted(pool *pool, conn *connection) { prs := conn.pendingReadState - if prs.remainingTime == nil { - prs.remainingTime = ptrutil.Ptr(PendingReadTimeout) + if prs == nil { + return } + // log a message to the pool logger if it is set. if mustLogPoolMessage(pool) { keysAndValues := logger.KeyValues{ logger.KeyDriverConnectionID, conn.driverConnectionID, @@ -816,6 +808,7 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { logPoolMessage(pool, logger.ConnectionPendingReadStarted, keysAndValues...) } + // publish an event to the pool monitor if it is set. if pool.monitor != nil { event := &event.PoolEvent{ Type: event.ConnectionPendingReadStarted, @@ -826,103 +819,140 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { pool.monitor.Event(event) } +} - size := prs.remainingBytes +func publishPendingReadFailed(pool *pool, conn *connection, err error) { + prs := conn.pendingReadState + if prs == nil { + return + } - checkIn := false - var someErr error + if mustLogPoolMessage(pool) { + keysAndValues := logger.KeyValues{ + logger.KeyDriverConnectionID, conn.driverConnectionID, + logger.KeyRequestID, prs.requestID, + logger.KeyReason, err.Error(), + logger.KeyRemainingTimeMS, *prs.remainingTime, + } - defer func() { - if mustLogPoolMessage(pool) && someErr != nil { - keysAndValues := logger.KeyValues{ - logger.KeyDriverConnectionID, conn.driverConnectionID, - logger.KeyRequestID, prs.requestID, - logger.KeyReason, someErr.Error(), - logger.KeyRemainingTimeMS, *prs.remainingTime, - } + logPoolMessage(pool, logger.ConnectionPendingReadFailed, keysAndValues...) + } - logPoolMessage(pool, logger.ConnectionPendingReadFailed, keysAndValues...) + if pool.monitor != nil { + event := &event.PoolEvent{ + Type: event.ConnectionPendingReadFailed, + Address: pool.address.String(), + ConnectionID: conn.driverConnectionID, + RequestID: prs.requestID, + //RemainingTime: remainingTime, + Reason: err.Error(), + Error: err, } - if pool.monitor != nil && someErr != nil { - event := &event.PoolEvent{ - Type: event.ConnectionPendingReadFailed, - Address: pool.address.String(), - ConnectionID: conn.driverConnectionID, - RequestID: prs.requestID, - RemainingTime: *prs.remainingTime, - Reason: someErr.Error(), - Error: someErr, - } + pool.monitor.Event(event) + } +} + +func publishPendingReadSucceeded(pool *pool, conn *connection) { + prs := conn.pendingReadState + if prs == nil { + return + } - pool.monitor.Event(event) + if mustLogPoolMessage(pool) { + keysAndValues := logger.KeyValues{ + logger.KeyDriverConnectionID, conn.driverConnectionID, + logger.KeyRequestID, prs.requestID, } - // If we have exceeded the time limit, then close the connection. - if prs.remainingTime != nil && *prs.remainingTime < 0 { - if err := conn.close(); err != nil { - panic(err) - } + logPoolMessage(pool, logger.ConnectionPendingReadSucceeded, keysAndValues...) 
+ } - return + if pool.monitor != nil { + event := &event.PoolEvent{ + Type: event.ConnectionPendingReadSucceeded, + Address: pool.address.String(), + ConnectionID: conn.driverConnectionID, + //Duration: time.Since(st), } - if !checkIn { - return + pool.monitor.Event(event) + } +} + +// newPendingReadContext creates a new context with a deadline that is the +// minimum of the parent context's deadline and the remaining time. +func newPendingReadContext(parent context.Context, remainingTime time.Duration) (context.Context, context.CancelFunc) { + parentDeadline, hasDeadline := parent.Deadline() + + calculatedDeadline := time.Now().Add(remainingTime) + + if hasDeadline { + // Chose the earliest of the two deadlines. + var minDeadline time.Time + if calculatedDeadline.Before(parentDeadline) { + minDeadline = calculatedDeadline + } else { + minDeadline = parentDeadline } - // No matter what happens, always check the connection back into the - // pool, which will either make it available for other operations or - // remove it from the pool if it was closed. - // - // TODO(GODRIVER-3385): Figure out how to handle this error. It's possible - // that a single connection can be checked out to handle multiple concurrent - // operations. This is likely a bug in the Go Driver. So it's possible that - // the connection is idle at the point of check-in. - _ = pool.checkInNoEvent(conn) - }() + return context.WithDeadline(parent, minDeadline) + } + + // If no deadline was set on the parent context, use the remaining time. + return context.WithTimeout(parent, remainingTime) +} + +// peekConnectionAlive checks if the connection is alive by peeking at the +// buffered reader. If the connection is closed, it will return false. +func peekConnectionAlive(conn *connection) (int, error) { + // Set a very short deadline to avoid blocking. + conn.nc.SetReadDeadline(time.Now().Add(1 * time.Nanosecond)) + + // Wrap the connection in a buffered reader to use peek. + reader := bufio.NewReader(conn.nc) + + // Try to peek at one byte. + bytes, err := reader.Peek(1) + return len(bytes), err +} + +func attemptPendingRead(ctx context.Context, conn *connection, remainingTime time.Duration) (int, error) { + pendingreadState := conn.pendingReadState + if pendingreadState == nil { + return 0, fmt.Errorf("no pending read state") + } dl, contextDeadlineUsed := ctx.Deadline() - if !contextDeadlineUsed { - // If there is a remainingTime, use that. If not, use the static - // PendingReadTimeout. This is required since a user could provide a timeout - // for the first try that does not exceed the pending read timeout, fail, - // and then not use a timeout for a subsequent try. - if prs.remainingTime != nil { - dl = time.Now().Add(*prs.remainingTime) - } else { - dl = time.Now().Add(PendingReadTimeout) + calculatedDeadline := time.Now().Add(remainingTime) + + if contextDeadlineUsed { + if calculatedDeadline.Before(dl) { + dl = calculatedDeadline } + } else { + dl = calculatedDeadline } err := conn.nc.SetReadDeadline(dl) if err != nil { - checkIn = true - - someErr = fmt.Errorf("error setting a read deadline: %w", err) - - return someErr + return 0, fmt.Errorf("error setting a read deadline: %w", err) } - st := time.Now() + size := pendingreadState.remainingBytes + //st := time.Now() if size == 0 { // Question: Would this alawys equal to zero? 
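		// Answering the question above: remainingBytes is zero only when the
		// original read timed out before any of the reply header arrived, so the
		// 4-byte little-endian message length has not been read yet. It is read
		// here first, and the 4 header bytes are subtracted from size below
		// because they are not part of the remaining payload.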
var sizeBuf [4]byte - if _, err := io.ReadFull(conn.nc, sizeBuf[:]); err != nil { - prs.remainingTime = ptrutil.Ptr(*prs.remainingTime - time.Since(st)) - checkIn = true - + if bytesRead, err := io.ReadFull(conn.nc, sizeBuf[:]); err != nil { err = transformNetworkError(ctx, err, contextDeadlineUsed) - someErr = fmt.Errorf("error reading the message size: %w", err) - return someErr + return bytesRead, fmt.Errorf("error reading the message size: %w", err) } + size, err = conn.parseWmSizeBytes(sizeBuf) if err != nil { - checkIn = true - someErr = transformNetworkError(ctx, err, contextDeadlineUsed) - - return someErr + return int(size), transformNetworkError(ctx, err, contextDeadlineUsed) } size -= 4 } @@ -936,12 +966,9 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { // each successful read to avoid closing while streaming large (upto 16MiB) // response messages. for totalRead < int64(size) { - newDeadline := time.Now().Add(*prs.remainingTime) + newDeadline := time.Now().Add(time.Until(dl)) if err := conn.nc.SetReadDeadline(newDeadline); err != nil { - checkIn = true - someErr = fmt.Errorf("error renewing read deadline: %w", err) - - return someErr + return int(totalRead), fmt.Errorf("error renewing read deadline: %w", err) } remaining := int64(size) - totalRead @@ -954,9 +981,6 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { n, err := conn.nc.Read(buf[:readSize]) if n > 0 { totalRead += int64(n) - - // Refresh the remaining time if we get are receiving data. - prs.remainingTime = ptrutil.Ptr(PendingReadTimeout) } if err != nil { @@ -964,40 +988,96 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { // Reduce the remainingTime. nerr := net.Error(nil) if l := int32(n); l == 0 && errors.As(err, &nerr) && nerr.Timeout() { - prs.remainingBytes = l + prs.remainingBytes - prs.remainingTime = ptrutil.Ptr(*prs.remainingTime - time.Since(st)) + pendingreadState.remainingBytes = l + pendingreadState.remainingBytes + //prs.remainingTime = ptrutil.Ptr(*prs.remainingTime - time.Since(st)) } - checkIn = true - err = transformNetworkError(ctx, err, contextDeadlineUsed) - someErr = fmt.Errorf("error discarding %d byte message: %w", size, err) - - return someErr + return n, fmt.Errorf("error discarding %d byte message: %w", size, err) } + + pendingreadState.start = time.Now() } - if mustLogPoolMessage(pool) { - keysAndValues := logger.KeyValues{ - logger.KeyDriverConnectionID, conn.driverConnectionID, - logger.KeyRequestID, prs.requestID, + return int(totalRead), nil +} + +// awaitPendingRead sets a new read deadline on the provided connection and +// tries to read any bytes returned by the server. If there are any errors, the +// connection will be checked back into the pool to be retried. +func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { + conn.pendingReadMu.Lock() + defer conn.pendingReadMu.Unlock() + + // If there are no bytes pending read, do nothing. + if conn.pendingReadState == nil { + return nil + } + + publishPendingReadStarted(pool, conn) + + var ( + pendingReadState = conn.pendingReadState + remainingTime = pendingReadState.start.Add(PendingReadTimeout).Sub(time.Now()) + err error + bytesRead int + contextDeadlineUsed bool + ) + + if remainingTime <= 0 { + // If there is no remaining time, we can just peek at the connection to check + // aliveness. In such cases, we don't want to close the connection. 
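		// The peek acts as a cheap liveness probe on a connection whose drain
		// budget is exhausted: it reports whether any reply bytes are already
		// buffered without committing to a full, potentially blocking read.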
+ bytesRead, err = peekConnectionAlive(conn) + } else { + //pendingReadContext, cancel := newPendingReadContext(ctx, remainingTime) + //defer cancel() + + contextDeadlineUsed = true + bytesRead, err = attemptPendingRead(ctx, conn, remainingTime) + } + + endTime := time.Now() + + if err != nil { + // No matter what happens, always check the connection back into the + // pool, which will either make it available for other operations or + // remove it from the pool if it was closed. + // + // TODO(GODRIVER-3385): Figure out how to handle this error. It's possible + // that a single connection can be checked out to handle multiple concurrent + // operations. This is likely a bug in the Go Driver. So it's possible that + // the connection is idle at the point of check-in. + defer func() { + publishPendingReadFailed(pool, conn, err) + + _ = pool.checkInNoEvent(conn) + }() + + if netErr, ok := err.(net.Error); ok && !netErr.Timeout() { + if err := conn.close(); err != nil { + return err + } + return transformNetworkError(ctx, err, contextDeadlineUsed) } + } - logPoolMessage(pool, logger.ConnectionPendingReadSucceeded, keysAndValues...) + // If the read was successful, then we should refresh the remaining timeout. + if bytesRead > 0 { + pendingReadState.start = endTime } - if pool.monitor != nil { - event := &event.PoolEvent{ - Type: event.ConnectionPendingReadSucceeded, - Address: pool.address.String(), - ConnectionID: conn.driverConnectionID, - Duration: time.Since(st), + // If the remaining time has been exceeded, then close the connection. + if endTime.Sub(pendingReadState.start) > PendingReadTimeout { + if err := conn.close(); err != nil { + return err } + } - pool.monitor.Event(event) + if err != nil { + return err } - conn.pendingReadState = nil + publishPendingReadSucceeded(pool, conn) return nil } diff --git a/x/mongo/driver/topology/pool_test.go b/x/mongo/driver/topology/pool_test.go index 45640667f8..6f0a925d3c 100644 --- a/x/mongo/driver/topology/pool_test.go +++ b/x/mongo/driver/topology/pool_test.go @@ -1566,6 +1566,61 @@ func TestAwaitPendingRead(t *testing.T) { assert.EqualError(t, pendingReadError, "error discarding 3 byte message: EOF") }) + //t.Run("no remaining time with no response", func(t *testing.T) { + // timeout := 10 * time.Millisecond + + // addr := bootstrapConnections(t, 1, func(nc net.Conn) { + // defer func() { + // _ = nc.Close() + // }() + + // // Write nothing so that the 10 microseconds "non-blocking" + // }) + + // var pendingReadError error + // monitor := &event.PoolMonitor{ + // Event: func(pe *event.PoolEvent) { + // if pe.Type == event.ConnectionPendingReadFailed { + // pendingReadError = pe.Error + // } + // }, + // } + + // p := newPool( + // poolConfig{ + // Address: address.Address(addr.String()), + // PoolMonitor: monitor, + // }, + // ) + // defer p.close(context.Background()) + // err := p.ready() + // require.NoError(t, err) + + // conn, err := p.checkOut(context.Background()) + // require.NoError(t, err) + + // ctx, cancel := csot.WithTimeout(context.Background(), &timeout) + // defer cancel() + + // ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) + // ctx = driverutil.WithRequestID(ctx, -1) + + // _, err = conn.readWireMessage(ctx) + // regex := regexp.MustCompile( + // `^connection\(.*\[-\d+\]\) incomplete read of message header: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, + // ) + // assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) + // err = p.checkIn(conn) + 
// require.NoError(t, err) + + // // Wait 400ms to ensure there is no time left on the pending read. + // time.Sleep(400 * time.Millisecond) + + // _, err = p.checkOut(context.Background()) + // require.NoError(t, err) + + // require.NoError(t, pendingReadError) + //}) } func assertConnectionsClosed(t *testing.T, dialer *dialer, count int) { From 9cc397a54e9a158f216de37477b92121d5ccd196 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Thu, 17 Apr 2025 16:38:31 -0600 Subject: [PATCH 05/23] GODRIVER-3173 Unset pending read state when succeeded --- x/mongo/driver/topology/pool.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index 4ca494392e..a9b241196d 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -1079,6 +1079,8 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { publishPendingReadSucceeded(pool, conn) + conn.pendingReadState = nil + return nil } From 061575922d60c1548640995ac93211c4cc3add8e Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Fri, 18 Apr 2025 19:49:05 -0600 Subject: [PATCH 06/23] GODRIVER-3173 Clean up code --- internal/integration/mtest/mongotest.go | 5 +++++ x/mongo/driver/topology/connection.go | 4 ---- x/mongo/driver/topology/pool.go | 23 ++++++++++------------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/internal/integration/mtest/mongotest.go b/internal/integration/mtest/mongotest.go index 901d71a3ff..ce16899972 100644 --- a/internal/integration/mtest/mongotest.go +++ b/internal/integration/mtest/mongotest.go @@ -564,6 +564,11 @@ func (t *T) TrackFailPoint(fpName string) { // ClearFailPoints disables all previously set failpoints for this test. func (t *T) ClearFailPoints() { + // Run some arbitrary command to ensure that any connection that would + // otherwise blocking during a pending read is closed. This could happen if + // the mode times > 1 and the blocking time is > default pending read timeout. 
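	// The result is intentionally ignored: the ping only needs to force a
	// connection checkout, which completes (or gives up on) any pending read
	// left over from a blocked failpoint before the failpoints are disabled.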
+ _ = t.Client.Ping(context.Background(), nil) + db := t.Client.Database("admin") for _, fp := range t.failPointNames { cmd := failpoint.FailPoint{ diff --git a/x/mongo/driver/topology/connection.go b/x/mongo/driver/topology/connection.go index e22918beae..884f73187f 100644 --- a/x/mongo/driver/topology/connection.go +++ b/x/mongo/driver/topology/connection.go @@ -491,13 +491,11 @@ func (c *connection) read(ctx context.Context) (bytesRead []byte, errMsg string, if l := int32(n); l == 0 && isCSOTTimeout(err) && driverutil.HasMaxTimeMS(ctx) { requestID, _ := driverutil.GetRequestID(ctx) - //c.pendingReadMu.Lock() c.pendingReadState = &pendingReadState{ remainingBytes: l, requestID: requestID, start: time.Now(), } - //c.pendingReadMu.Unlock() } return nil, "incomplete read of message header", err } @@ -515,13 +513,11 @@ func (c *connection) read(ctx context.Context) (bytesRead []byte, errMsg string, if remainingBytes > 0 && isCSOTTimeout(err) && driverutil.HasMaxTimeMS(ctx) { requestID, _ := driverutil.GetRequestID(ctx) - //c.pendingReadMu.Lock() c.pendingReadState = &pendingReadState{ remainingBytes: remainingBytes, requestID: requestID, start: time.Now(), } - //c.pendingReadMu.Unlock() } return dst, "incomplete read of full message", err } diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index a9b241196d..967abd4a63 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -907,7 +907,9 @@ func newPendingReadContext(parent context.Context, remainingTime time.Duration) // buffered reader. If the connection is closed, it will return false. func peekConnectionAlive(conn *connection) (int, error) { // Set a very short deadline to avoid blocking. - conn.nc.SetReadDeadline(time.Now().Add(1 * time.Nanosecond)) + if err := conn.nc.SetReadDeadline(time.Now().Add(1 * time.Nanosecond)); err != nil { + return 0, err + } // Wrap the connection in a buffered reader to use peek. reader := bufio.NewReader(conn.nc) @@ -927,6 +929,8 @@ func attemptPendingRead(ctx context.Context, conn *connection, remainingTime tim calculatedDeadline := time.Now().Add(remainingTime) if contextDeadlineUsed { + // Use the minimum of the user-provided deadline and the calculated + // deadline. if calculatedDeadline.Before(dl) { dl = calculatedDeadline } @@ -941,7 +945,6 @@ func attemptPendingRead(ctx context.Context, conn *connection, remainingTime tim size := pendingreadState.remainingBytes - //st := time.Now() if size == 0 { // Question: Would this alawys equal to zero? 
var sizeBuf [4]byte if bytesRead, err := io.ReadFull(conn.nc, sizeBuf[:]); err != nil { @@ -989,7 +992,6 @@ func attemptPendingRead(ctx context.Context, conn *connection, remainingTime tim nerr := net.Error(nil) if l := int32(n); l == 0 && errors.As(err, &nerr) && nerr.Timeout() { pendingreadState.remainingBytes = l + pendingreadState.remainingBytes - //prs.remainingTime = ptrutil.Ptr(*prs.remainingTime - time.Since(st)) } err = transformNetworkError(ctx, err, contextDeadlineUsed) @@ -1017,11 +1019,10 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { publishPendingReadStarted(pool, conn) var ( - pendingReadState = conn.pendingReadState - remainingTime = pendingReadState.start.Add(PendingReadTimeout).Sub(time.Now()) - err error - bytesRead int - contextDeadlineUsed bool + pendingReadState = conn.pendingReadState + remainingTime = pendingReadState.start.Add(PendingReadTimeout).Sub(time.Now()) + err error + bytesRead int ) if remainingTime <= 0 { @@ -1029,10 +1030,6 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { // aliveness. In such cases, we don't want to close the connection. bytesRead, err = peekConnectionAlive(conn) } else { - //pendingReadContext, cancel := newPendingReadContext(ctx, remainingTime) - //defer cancel() - - contextDeadlineUsed = true bytesRead, err = attemptPendingRead(ctx, conn, remainingTime) } @@ -1057,7 +1054,7 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { if err := conn.close(); err != nil { return err } - return transformNetworkError(ctx, err, contextDeadlineUsed) + return err } } From 5afbc961f2f1159957af9ddd8c820e705ea8dce2 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Tue, 22 Apr 2025 09:25:40 -0600 Subject: [PATCH 07/23] GODRIVER-3173 Add prose tests --- internal/integration/csot_prose_test.go | 3 - .../unified/unified_spec_runner.go | 2 - x/mongo/driver/topology/pool.go | 28 +-- x/mongo/driver/topology/pool_test.go | 206 +++++++++++++----- 4 files changed, 154 insertions(+), 85 deletions(-) diff --git a/internal/integration/csot_prose_test.go b/internal/integration/csot_prose_test.go index dd71f35ee2..f2854559ce 100644 --- a/internal/integration/csot_prose_test.go +++ b/internal/integration/csot_prose_test.go @@ -233,9 +233,6 @@ func TestCSOTProse(t *testing.T) { assert.ErrorIs(mt, err, context.DeadlineExceeded, "expected a timeout error, got: %v", err) assert.Equal(mt, 2, cnt, "expected bulkWrite calls: %d, got: %d", 2, cnt) }) - - //mt.Run("pending read", func(mt *mtest.T) { - //}) } func TestCSOTProse_GridFS(t *testing.T) { diff --git a/internal/integration/unified/unified_spec_runner.go b/internal/integration/unified/unified_spec_runner.go index cb8a98ae22..a81c61407c 100644 --- a/internal/integration/unified/unified_spec_runner.go +++ b/internal/integration/unified/unified_spec_runner.go @@ -122,8 +122,6 @@ func runTestFile(t *testing.T, filepath string, expectValidFail bool, opts ...*O defer func() { // catch panics from looking up elements and fail if it's unexpected if r := recover(); r != nil { - - panic(r) if !expectValidFail { mt.Fatal(r) } diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index 967abd4a63..e4b7026d23 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -788,7 +788,7 @@ func (p *pool) removeConnection(conn *connection, reason reason, err error) erro // // Deprecated: PendingReadTimeout is intended for internal use only and may be // removed or modified at any 
time. -var PendingReadTimeout = 400 * time.Millisecond +var PendingReadTimeout = 3000 * time.Millisecond // publishPendingReadStarted will log a message to the pool logger and // publish an event to the pool monitor if they are set. @@ -880,34 +880,11 @@ func publishPendingReadSucceeded(pool *pool, conn *connection) { } } -// newPendingReadContext creates a new context with a deadline that is the -// minimum of the parent context's deadline and the remaining time. -func newPendingReadContext(parent context.Context, remainingTime time.Duration) (context.Context, context.CancelFunc) { - parentDeadline, hasDeadline := parent.Deadline() - - calculatedDeadline := time.Now().Add(remainingTime) - - if hasDeadline { - // Chose the earliest of the two deadlines. - var minDeadline time.Time - if calculatedDeadline.Before(parentDeadline) { - minDeadline = calculatedDeadline - } else { - minDeadline = parentDeadline - } - - return context.WithDeadline(parent, minDeadline) - } - - // If no deadline was set on the parent context, use the remaining time. - return context.WithTimeout(parent, remainingTime) -} - // peekConnectionAlive checks if the connection is alive by peeking at the // buffered reader. If the connection is closed, it will return false. func peekConnectionAlive(conn *connection) (int, error) { // Set a very short deadline to avoid blocking. - if err := conn.nc.SetReadDeadline(time.Now().Add(1 * time.Nanosecond)); err != nil { + if err := conn.nc.SetReadDeadline(time.Now().Add(1 * time.Millisecond)); err != nil { return 0, err } @@ -1051,6 +1028,7 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { }() if netErr, ok := err.(net.Error); ok && !netErr.Timeout() { + fmt.Println(1) if err := conn.close(); err != nil { return err } diff --git a/x/mongo/driver/topology/pool_test.go b/x/mongo/driver/topology/pool_test.go index 6f0a925d3c..fc85f68f01 100644 --- a/x/mongo/driver/topology/pool_test.go +++ b/x/mongo/driver/topology/pool_test.go @@ -9,6 +9,7 @@ package topology import ( "context" "errors" + "io" "net" "regexp" "sync" @@ -1566,61 +1567,156 @@ func TestAwaitPendingRead(t *testing.T) { assert.EqualError(t, pendingReadError, "error discarding 3 byte message: EOF") }) - //t.Run("no remaining time with no response", func(t *testing.T) { - // timeout := 10 * time.Millisecond - - // addr := bootstrapConnections(t, 1, func(nc net.Conn) { - // defer func() { - // _ = nc.Close() - // }() - - // // Write nothing so that the 10 microseconds "non-blocking" - // }) - - // var pendingReadError error - // monitor := &event.PoolMonitor{ - // Event: func(pe *event.PoolEvent) { - // if pe.Type == event.ConnectionPendingReadFailed { - // pendingReadError = pe.Error - // } - // }, - // } - - // p := newPool( - // poolConfig{ - // Address: address.Address(addr.String()), - // PoolMonitor: monitor, - // }, - // ) - // defer p.close(context.Background()) - // err := p.ready() - // require.NoError(t, err) - - // conn, err := p.checkOut(context.Background()) - // require.NoError(t, err) - - // ctx, cancel := csot.WithTimeout(context.Background(), &timeout) - // defer cancel() - - // ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) - // ctx = driverutil.WithRequestID(ctx, -1) - - // _, err = conn.readWireMessage(ctx) - // regex := regexp.MustCompile( - // `^connection\(.*\[-\d+\]\) incomplete read of message header: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, - // ) - // assert.True(t, regex.MatchString(err.Error()), "error %q does not match 
pattern %q", err, regex) - // err = p.checkIn(conn) - // require.NoError(t, err) - - // // Wait 400ms to ensure there is no time left on the pending read. - // time.Sleep(400 * time.Millisecond) - - // _, err = p.checkOut(context.Background()) - // require.NoError(t, err) - - // require.NoError(t, pendingReadError) - //}) + + // Need to test the case where we attempt a non-blocking read to determine if + // we should refresh the remaining time. In the case of the Go Driver, we do + // this by attempt to "pee" at 1 byte with a deadline of 1ns. + t.Run("connection attempts peek but fails", func(t *testing.T) { + timeout := 10 * time.Millisecond + + // Mock a TCP listener that will write a byte sequence > 5 (to avoid errors + // due to size) to the TCP socket. Have the listener sleep for 2x the + // timeout provided to the connection AFTER writing the byte sequence. This + // wiill cause the connection to timeout while reading from the socket. + addr := bootstrapConnections(t, 1, func(nc net.Conn) { + defer func() { + _ = nc.Close() + }() + + _, err := nc.Write([]byte{12, 0, 0, 0, 0, 0, 0, 0, 1}) + require.NoError(t, err) + time.Sleep(timeout * 2) + + // Write nothing so that the 1 millisecond "non-blocking" peek fails. + }) + + var pendingReadError error + monitor := &event.PoolMonitor{ + Event: func(pe *event.PoolEvent) { + if pe.Type == event.ConnectionPendingReadFailed { + pendingReadError = pe.Error + } + }, + } + + p := newPool( + poolConfig{ + Address: address.Address(addr.String()), + PoolMonitor: monitor, + }, + ) + defer p.close(context.Background()) + err := p.ready() + require.NoError(t, err) + + // Check out a connection and read from the socket, causing a timeout and + // pinning the connection to a pending read state. + conn, err := p.checkOut(context.Background()) + require.NoError(t, err) + + ctx, cancel := csot.WithTimeout(context.Background(), &timeout) + defer cancel() + + ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) + ctx = driverutil.WithRequestID(ctx, -1) + + _, err = conn.readWireMessage(ctx) + regex := regexp.MustCompile( + `^connection\(.*\[-\d+\]\) incomplete read of full message: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, + ) + assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) + + // Check in the connection with a pending read state. The next time this + // connection is checked out, it should attempt to read the pending + // response. + err = p.checkIn(conn) + require.NoError(t, err) + + // Wait 3s to make sure there is no remaining time on the pending read + // state. + time.Sleep(3 * time.Second) + + // Check out the connection again. The remaining time should be exhausted + // requiring us to "peek" at the connection to determine if we should + _, err = p.checkOut(context.Background()) + assert.ErrorIs(t, err, io.EOF) + assert.ErrorIs(t, pendingReadError, io.EOF) + }) + + t.Run("connection attempts peek and succeeds", func(t *testing.T) { + timeout := 10 * time.Millisecond + + // Mock a TCP listener that will write a byte sequence > 5 (to avoid errors + // due to size) to the TCP socket. Have the listener sleep for 2x the + // timeout provided to the connection AFTER writing the byte sequence. This + // wiill cause the connection to timeout while reading from the socket. 
+ addr := bootstrapConnections(t, 1, func(nc net.Conn) { + defer func() { + _ = nc.Close() + }() + + _, err := nc.Write([]byte{12, 0, 0, 0, 0, 0, 0, 0, 1}) + require.NoError(t, err) + time.Sleep(timeout * 2) + + // Write data that can be peeked at. + _, err = nc.Write([]byte{12, 0, 0, 0, 0, 0, 0, 0, 1}) + require.NoError(t, err) + + }) + + var pendingReadError error + monitor := &event.PoolMonitor{ + Event: func(pe *event.PoolEvent) { + if pe.Type == event.ConnectionPendingReadFailed { + pendingReadError = pe.Error + } + }, + } + + p := newPool( + poolConfig{ + Address: address.Address(addr.String()), + PoolMonitor: monitor, + }, + ) + defer p.close(context.Background()) + err := p.ready() + require.NoError(t, err) + + // Check out a connection and read from the socket, causing a timeout and + // pinning the connection to a pending read state. + conn, err := p.checkOut(context.Background()) + require.NoError(t, err) + + ctx, cancel := csot.WithTimeout(context.Background(), &timeout) + defer cancel() + + ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) + ctx = driverutil.WithRequestID(ctx, -1) + + _, err = conn.readWireMessage(ctx) + regex := regexp.MustCompile( + `^connection\(.*\[-\d+\]\) incomplete read of full message: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, + ) + assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) + + // Check in the connection with a pending read state. The next time this + // connection is checked out, it should attempt to read the pending + // response. + err = p.checkIn(conn) + require.NoError(t, err) + + // Wait 3s to make sure there is no remaining time on the pending read + // state. + time.Sleep(3 * time.Second) + + // Check out the connection again. 
The remaining time should be exhausted + // requiring us to "peek" at the connection to determine if we should + _, err = p.checkOut(context.Background()) + require.NoError(t, err) + require.NoError(t, pendingReadError) + }) } func assertConnectionsClosed(t *testing.T, dialer *dialer, count int) { From c9ef687dabe471ed0b0947def80c4834a9483f99 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Thu, 24 Apr 2025 16:19:27 -0600 Subject: [PATCH 08/23] GODRIVER-3173 Organize prose tests --- event/monitoring.go | 28 +-- internal/integration/mtest/mongotest.go | 4 +- internal/integration/unified/event.go | 6 +- .../integration/unified/event_verification.go | 6 +- testdata/specifications | 2 +- x/mongo/driver/topology/cmap_prose_test.go | 201 ++++++++++++++++++ x/mongo/driver/topology/pool.go | 14 +- x/mongo/driver/topology/pool_test.go | 161 +------------- 8 files changed, 237 insertions(+), 185 deletions(-) diff --git a/event/monitoring.go b/event/monitoring.go index 4965a55528..6f6db625ac 100644 --- a/event/monitoring.go +++ b/event/monitoring.go @@ -75,20 +75,20 @@ const ( // strings for pool command monitoring types const ( - ConnectionPoolCreated = "ConnectionPoolCreated" - ConnectionPoolReady = "ConnectionPoolReady" - ConnectionPoolCleared = "ConnectionPoolCleared" - ConnectionPoolClosed = "ConnectionPoolClosed" - ConnectionCreated = "ConnectionCreated" - ConnectionReady = "ConnectionReady" - ConnectionClosed = "ConnectionClosed" - ConnectionCheckOutStarted = "ConnectionCheckOutStarted" - ConnectionCheckOutFailed = "ConnectionCheckOutFailed" - ConnectionCheckedOut = "ConnectionCheckedOut" - ConnectionCheckedIn = "ConnectionCheckedIn" - ConnectionPendingReadStarted = "ConnectionPendingReadStarted" - ConnectionPendingReadSucceeded = "ConnectionPendingReadSucceeded" - ConnectionPendingReadFailed = "ConnectionPendingReadFailed" + ConnectionPoolCreated = "ConnectionPoolCreated" + ConnectionPoolReady = "ConnectionPoolReady" + ConnectionPoolCleared = "ConnectionPoolCleared" + ConnectionPoolClosed = "ConnectionPoolClosed" + ConnectionCreated = "ConnectionCreated" + ConnectionReady = "ConnectionReady" + ConnectionClosed = "ConnectionClosed" + ConnectionCheckOutStarted = "ConnectionCheckOutStarted" + ConnectionCheckOutFailed = "ConnectionCheckOutFailed" + ConnectionCheckedOut = "ConnectionCheckedOut" + ConnectionCheckedIn = "ConnectionCheckedIn" + ConnectionPendingResponseStarted = "ConnectionPendingResponseStarted" + ConnectionPendingResponseSucceeded = "ConnectionPendingResponseSucceeded" + ConnectionPendingResponseFailed = "ConnectionPendingResponseFailed" ) // MonitorPoolOptions contains pool options as formatted in pool events diff --git a/internal/integration/mtest/mongotest.go b/internal/integration/mtest/mongotest.go index ce16899972..1430ab6e0c 100644 --- a/internal/integration/mtest/mongotest.go +++ b/internal/integration/mtest/mongotest.go @@ -662,9 +662,9 @@ func (t *T) createTestClient() { atomic.AddInt64(&t.connsCheckedOut, 1) case event.ConnectionCheckedIn: atomic.AddInt64(&t.connsCheckedOut, -1) - case event.ConnectionPendingReadStarted: + case event.ConnectionPendingResponseStarted: atomic.AddInt64(&t.connPendingReadStarted, 1) - case event.ConnectionPendingReadSucceeded: + case event.ConnectionPendingResponseSucceeded: atomic.AddInt64(&t.connPendingReadSucceeded, 1) case event.ConnectionCheckOutFailed: atomic.AddInt64(&t.connPendingReadFailed, 1) diff --git a/internal/integration/unified/event.go b/internal/integration/unified/event.go index 644a0e9e36..fade6e40d7 100644 --- 
a/internal/integration/unified/event.go +++ b/internal/integration/unified/event.go @@ -121,11 +121,11 @@ func monitoringEventTypeFromPoolEvent(evt *event.PoolEvent) monitoringEventType return connectionCheckedOutEvent case event.ConnectionCheckedIn: return connectionCheckedInEvent - case event.ConnectionPendingReadStarted: + case event.ConnectionPendingResponseStarted: return connectionPendingReadStarted - case event.ConnectionPendingReadSucceeded: + case event.ConnectionPendingResponseSucceeded: return connectionPendingReadSucceeded - case event.ConnectionPendingReadFailed: + case event.ConnectionPendingResponseFailed: return connectionPendingReadFailed default: return "" diff --git a/internal/integration/unified/event_verification.go b/internal/integration/unified/event_verification.go index 5b5f6903c4..58ca7b77ac 100644 --- a/internal/integration/unified/event_verification.go +++ b/internal/integration/unified/event_verification.go @@ -363,15 +363,15 @@ func verifyCMAPEvents(client *clientEntity, expectedEvents *expectedEvents) erro return newEventVerificationError(idx, client, "failed to get next pool event: %v", err.Error()) } case evt.ConnectionPendingReadStarted != nil: - if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionPendingReadStarted); err != nil { + if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionPendingResponseStarted); err != nil { return newEventVerificationError(idx, client, "failed to get next pool event: %v", err.Error()) } case evt.ConnectionPendingreadSucceeded != nil: - if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionPendingReadSucceeded); err != nil { + if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionPendingResponseSucceeded); err != nil { return newEventVerificationError(idx, client, "failed to get next pool event: %v", err.Error()) } case evt.ConnectionPendingReadFailed != nil: - if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionPendingReadFailed); err != nil { + if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionPendingResponseFailed); err != nil { return newEventVerificationError(idx, client, "failed to get next pool event: %v", err.Error()) } case evt.PoolClearedEvent != nil: diff --git a/testdata/specifications b/testdata/specifications index 43d2c7bacd..6118debee4 160000 --- a/testdata/specifications +++ b/testdata/specifications @@ -1 +1 @@ -Subproject commit 43d2c7bacd62249de8d2173bf8ee39e6fd7a686e +Subproject commit 6118debee41cfd1bca197b315bd1f10ad95f66ae diff --git a/x/mongo/driver/topology/cmap_prose_test.go b/x/mongo/driver/topology/cmap_prose_test.go index 0524b99e9c..63a1f80a3a 100644 --- a/x/mongo/driver/topology/cmap_prose_test.go +++ b/x/mongo/driver/topology/cmap_prose_test.go @@ -9,13 +9,19 @@ package topology import ( "context" "errors" + "io" "net" + "regexp" + "sync" "testing" "time" "go.mongodb.org/mongo-driver/v2/event" "go.mongodb.org/mongo-driver/v2/internal/assert" + "go.mongodb.org/mongo-driver/v2/internal/csot" + "go.mongodb.org/mongo-driver/v2/internal/driverutil" "go.mongodb.org/mongo-driver/v2/internal/require" + "go.mongodb.org/mongo-driver/v2/mongo/address" "go.mongodb.org/mongo-driver/v2/x/mongo/driver/operation" ) @@ -263,6 +269,201 @@ func TestCMAPProse(t *testing.T) { }) }) }) + + // Need to test the case where we attempt a non-blocking read to determine if + // we should refresh the remaining time. In the case of the Go Driver, we do + // this by attempt to "pee" at 1 byte with a deadline of 1ns. 
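	// Clarification for the comment above: the probe peeks at a single byte
	// and, as of the peekConnectionAlive change earlier in this series, uses a
	// 1 millisecond read deadline rather than 1ns. Both subtests below exercise
	// that path and assert on the ConnectionPendingResponse* pool events.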
+ t.Run("connection attempts peek but fails", func(t *testing.T) { + const requestID = int32(-1) + timeout := 10 * time.Millisecond + + // Mock a TCP listener that will write a byte sequence > 5 (to avoid errors + // due to size) to the TCP socket. Have the listener sleep for 2x the + // timeout provided to the connection AFTER writing the byte sequence. This + // wiill cause the connection to timeout while reading from the socket. + addr := bootstrapConnections(t, 1, func(nc net.Conn) { + defer func() { + _ = nc.Close() + }() + + _, err := nc.Write([]byte{12, 0, 0, 0, 0, 0, 0, 0, 1}) + require.NoError(t, err) + time.Sleep(timeout * 2) + + // Write nothing so that the 1 millisecond "non-blocking" peek fails. + }) + + poolEventsByType := make(map[string][]event.PoolEvent) + poolEventsByTypeMu := &sync.Mutex{} + + monitor := &event.PoolMonitor{ + Event: func(pe *event.PoolEvent) { + poolEventsByTypeMu.Lock() + poolEventsByType[pe.Type] = append(poolEventsByType[pe.Type], *pe) + poolEventsByTypeMu.Unlock() + }, + } + + p := newPool( + poolConfig{ + Address: address.Address(addr.String()), + PoolMonitor: monitor, + }, + ) + defer p.close(context.Background()) + err := p.ready() + require.NoError(t, err) + + // Check out a connection and read from the socket, causing a timeout and + // pinning the connection to a pending read state. + conn, err := p.checkOut(context.Background()) + require.NoError(t, err) + + ctx, cancel := csot.WithTimeout(context.Background(), &timeout) + defer cancel() + + ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) + ctx = driverutil.WithRequestID(ctx, requestID) + + _, err = conn.readWireMessage(ctx) + regex := regexp.MustCompile( + `^connection\(.*\[-\d+\]\) incomplete read of full message: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, + ) + assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) + + // Check in the connection with a pending read state. The next time this + // connection is checked out, it should attempt to read the pending + // response. + err = p.checkIn(conn) + require.NoError(t, err) + + // Wait 3s to make sure there is no remaining time on the pending read + // state. + time.Sleep(3 * time.Second) + + // Check out the connection again. The remaining time should be exhausted + // requiring us to "peek" at the connection to determine if we should + _, err = p.checkOut(context.Background()) + assert.ErrorIs(t, err, io.EOF) + + // There should be 1 ConnectionPendingResponseStarted event. + started := poolEventsByType[event.ConnectionPendingResponseStarted] + require.Len(t, started, 1) + + assert.Equal(t, addr.String(), started[0].Address) + assert.Equal(t, conn.driverConnectionID, started[0].ConnectionID) + assert.Equal(t, requestID, started[0].RequestID) + + // There should be 1 ConnectionPendingResponseFailed event. + failed := poolEventsByType[event.ConnectionPendingResponseFailed] + require.Len(t, failed, 1) + + assert.Equal(t, addr.String(), failed[0].Address) + assert.Equal(t, conn.driverConnectionID, failed[0].ConnectionID) + assert.Equal(t, requestID, failed[0].RequestID) + assert.Equal(t, io.EOF.Error(), failed[0].Reason) + assert.ErrorIs(t, failed[0].Error, io.EOF) + assert.Equal(t, time.Duration(0), failed[0].RemainingTime) + + // There should be 0 ConnectionPendingResponseSucceeded event. 
+ require.Len(t, poolEventsByType[event.ConnectionPendingResponseSucceeded], 0) + }) + + t.Run("connection attempts peek and succeeds", func(t *testing.T) { + const requestID = int32(-1) + timeout := 10 * time.Millisecond + + // Mock a TCP listener that will write a byte sequence > 5 (to avoid errors + // due to size) to the TCP socket. Have the listener sleep for 2x the + // timeout provided to the connection AFTER writing the byte sequence. This + // wiill cause the connection to timeout while reading from the socket. + addr := bootstrapConnections(t, 1, func(nc net.Conn) { + defer func() { + _ = nc.Close() + }() + + _, err := nc.Write([]byte{12, 0, 0, 0, 0, 0, 0, 0, 1}) + require.NoError(t, err) + time.Sleep(timeout * 2) + + // Write data that can be peeked at. + _, err = nc.Write([]byte{12, 0, 0, 0, 0, 0, 0, 0, 1}) + require.NoError(t, err) + + }) + + poolEventsByType := make(map[string][]event.PoolEvent) + poolEventsByTypeMu := &sync.Mutex{} + + monitor := &event.PoolMonitor{ + Event: func(pe *event.PoolEvent) { + poolEventsByTypeMu.Lock() + poolEventsByType[pe.Type] = append(poolEventsByType[pe.Type], *pe) + poolEventsByTypeMu.Unlock() + }, + } + + p := newPool( + poolConfig{ + Address: address.Address(addr.String()), + PoolMonitor: monitor, + }, + ) + defer p.close(context.Background()) + err := p.ready() + require.NoError(t, err) + + // Check out a connection and read from the socket, causing a timeout and + // pinning the connection to a pending read state. + conn, err := p.checkOut(context.Background()) + require.NoError(t, err) + + ctx, cancel := csot.WithTimeout(context.Background(), &timeout) + defer cancel() + + ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) + ctx = driverutil.WithRequestID(ctx, requestID) + + _, err = conn.readWireMessage(ctx) + regex := regexp.MustCompile( + `^connection\(.*\[-\d+\]\) incomplete read of full message: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, + ) + assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) + + // Check in the connection with a pending read state. The next time this + // connection is checked out, it should attempt to read the pending + // response. + err = p.checkIn(conn) + require.NoError(t, err) + + // Wait 3s to make sure there is no remaining time on the pending read + // state. + time.Sleep(3 * time.Second) + + // Check out the connection again. The remaining time should be exhausted + // requiring us to "peek" at the connection to determine if we should + _, err = p.checkOut(context.Background()) + require.NoError(t, err) + + // There should be 1 ConnectionPendingResponseStarted event. + started := poolEventsByType[event.ConnectionPendingResponseStarted] + require.Len(t, started, 1) + + assert.Equal(t, addr.String(), started[0].Address) + assert.Equal(t, conn.driverConnectionID, started[0].ConnectionID) + assert.Equal(t, requestID, started[0].RequestID) + + // There should be 0 ConnectionPendingResponseFailed event. + require.Len(t, poolEventsByType[event.ConnectionPendingResponseFailed], 0) + + // There should be 1 ConnectionPendingResponseSucceeded event. 
+ succeeded := poolEventsByType[event.ConnectionPendingResponseSucceeded] + require.Len(t, succeeded, 1) + + assert.Equal(t, addr.String(), succeeded[0].Address) + assert.Equal(t, conn.driverConnectionID, succeeded[0].ConnectionID) + assert.Greater(t, int(succeeded[0].Duration), 0) + }) } func createTestPool(t *testing.T, cfg poolConfig, opts ...ConnectionOption) *pool { diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index e4b7026d23..55d8971e31 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -811,7 +811,7 @@ func publishPendingReadStarted(pool *pool, conn *connection) { // publish an event to the pool monitor if it is set. if pool.monitor != nil { event := &event.PoolEvent{ - Type: event.ConnectionPendingReadStarted, + Type: event.ConnectionPendingResponseStarted, Address: pool.address.String(), ConnectionID: conn.driverConnectionID, RequestID: prs.requestID, @@ -840,7 +840,7 @@ func publishPendingReadFailed(pool *pool, conn *connection, err error) { if pool.monitor != nil { event := &event.PoolEvent{ - Type: event.ConnectionPendingReadFailed, + Type: event.ConnectionPendingResponseFailed, Address: pool.address.String(), ConnectionID: conn.driverConnectionID, RequestID: prs.requestID, @@ -853,7 +853,7 @@ func publishPendingReadFailed(pool *pool, conn *connection, err error) { } } -func publishPendingReadSucceeded(pool *pool, conn *connection) { +func publishPendingReadSucceeded(pool *pool, conn *connection, dur time.Duration) { prs := conn.pendingReadState if prs == nil { return @@ -870,10 +870,10 @@ func publishPendingReadSucceeded(pool *pool, conn *connection) { if pool.monitor != nil { event := &event.PoolEvent{ - Type: event.ConnectionPendingReadSucceeded, + Type: event.ConnectionPendingResponseSucceeded, Address: pool.address.String(), ConnectionID: conn.driverConnectionID, - //Duration: time.Since(st), + Duration: dur, } pool.monitor.Event(event) @@ -1002,6 +1002,7 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { bytesRead int ) + st := time.Now() if remainingTime <= 0 { // If there is no remaining time, we can just peek at the connection to check // aliveness. In such cases, we don't want to close the connection. 
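When the pending-response budget has already run out, the code above only probes whether the socket is still alive instead of attempting another full read. A minimal, self-contained sketch of that kind of bounded one-byte probe is shown below; it is not the driver's peekConnectionAlive implementation, and the helper name and the 1ms deadline are illustrative assumptions.

    import (
        "errors"
        "net"
        "os"
        "time"
    )

    // probeAlive performs a 1-byte read with a very short deadline. A deadline
    // timeout means "no data yet, but the socket may still be usable"; any
    // other error (including io.EOF) means the peer closed or broke the
    // connection. Note that a byte read here is consumed, so a real pool must
    // count it against the message it is draining.
    func probeAlive(nc net.Conn) (gotData bool, err error) {
        if err := nc.SetReadDeadline(time.Now().Add(time.Millisecond)); err != nil {
            return false, err
        }
        var buf [1]byte
        _, err = nc.Read(buf[:])
        switch {
        case err == nil:
            return true, nil
        case errors.Is(err, os.ErrDeadlineExceeded):
            return false, nil // timed out with nothing to read: treat as alive
        default:
            return false, err // e.g. io.EOF: the connection is dead
        }
    }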
@@ -1011,6 +1012,7 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { } endTime := time.Now() + endDuration := time.Since(st) if err != nil { // No matter what happens, always check the connection back into the @@ -1052,7 +1054,7 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { return err } - publishPendingReadSucceeded(pool, conn) + publishPendingReadSucceeded(pool, conn, endDuration) conn.pendingReadState = nil diff --git a/x/mongo/driver/topology/pool_test.go b/x/mongo/driver/topology/pool_test.go index fc85f68f01..c05e8133f0 100644 --- a/x/mongo/driver/topology/pool_test.go +++ b/x/mongo/driver/topology/pool_test.go @@ -9,7 +9,6 @@ package topology import ( "context" "errors" - "io" "net" "regexp" "sync" @@ -1293,7 +1292,7 @@ func TestAwaitPendingRead(t *testing.T) { var pendingReadError error monitor := &event.PoolMonitor{ Event: func(pe *event.PoolEvent) { - if pe.Type == event.ConnectionPendingReadFailed { + if pe.Type == event.ConnectionPendingResponseFailed { pendingReadError = pe.Error } }, @@ -1348,7 +1347,7 @@ func TestAwaitPendingRead(t *testing.T) { var pendingReadError error monitor := &event.PoolMonitor{ Event: func(pe *event.PoolEvent) { - if pe.Type == event.ConnectionPendingReadFailed { + if pe.Type == event.ConnectionPendingResponseFailed { pendingReadError = pe.Error } }, @@ -1407,7 +1406,7 @@ func TestAwaitPendingRead(t *testing.T) { var pendingReadError error monitor := &event.PoolMonitor{ Event: func(pe *event.PoolEvent) { - if pe.Type == event.ConnectionPendingReadFailed { + if pe.Type == event.ConnectionPendingResponseFailed { pendingReadError = pe.Error } }, @@ -1469,7 +1468,7 @@ func TestAwaitPendingRead(t *testing.T) { var pendingReadError error monitor := &event.PoolMonitor{ Event: func(pe *event.PoolEvent) { - if pe.Type == event.ConnectionPendingReadFailed { + if pe.Type == event.ConnectionPendingResponseFailed { pendingReadError = pe.Error } }, @@ -1528,7 +1527,7 @@ func TestAwaitPendingRead(t *testing.T) { var pendingReadError error monitor := &event.PoolMonitor{ Event: func(pe *event.PoolEvent) { - if pe.Type == event.ConnectionPendingReadFailed { + if pe.Type == event.ConnectionPendingResponseFailed { pendingReadError = pe.Error } }, @@ -1567,156 +1566,6 @@ func TestAwaitPendingRead(t *testing.T) { assert.EqualError(t, pendingReadError, "error discarding 3 byte message: EOF") }) - - // Need to test the case where we attempt a non-blocking read to determine if - // we should refresh the remaining time. In the case of the Go Driver, we do - // this by attempt to "pee" at 1 byte with a deadline of 1ns. - t.Run("connection attempts peek but fails", func(t *testing.T) { - timeout := 10 * time.Millisecond - - // Mock a TCP listener that will write a byte sequence > 5 (to avoid errors - // due to size) to the TCP socket. Have the listener sleep for 2x the - // timeout provided to the connection AFTER writing the byte sequence. This - // wiill cause the connection to timeout while reading from the socket. - addr := bootstrapConnections(t, 1, func(nc net.Conn) { - defer func() { - _ = nc.Close() - }() - - _, err := nc.Write([]byte{12, 0, 0, 0, 0, 0, 0, 0, 1}) - require.NoError(t, err) - time.Sleep(timeout * 2) - - // Write nothing so that the 1 millisecond "non-blocking" peek fails. 
- }) - - var pendingReadError error - monitor := &event.PoolMonitor{ - Event: func(pe *event.PoolEvent) { - if pe.Type == event.ConnectionPendingReadFailed { - pendingReadError = pe.Error - } - }, - } - - p := newPool( - poolConfig{ - Address: address.Address(addr.String()), - PoolMonitor: monitor, - }, - ) - defer p.close(context.Background()) - err := p.ready() - require.NoError(t, err) - - // Check out a connection and read from the socket, causing a timeout and - // pinning the connection to a pending read state. - conn, err := p.checkOut(context.Background()) - require.NoError(t, err) - - ctx, cancel := csot.WithTimeout(context.Background(), &timeout) - defer cancel() - - ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) - ctx = driverutil.WithRequestID(ctx, -1) - - _, err = conn.readWireMessage(ctx) - regex := regexp.MustCompile( - `^connection\(.*\[-\d+\]\) incomplete read of full message: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, - ) - assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) - - // Check in the connection with a pending read state. The next time this - // connection is checked out, it should attempt to read the pending - // response. - err = p.checkIn(conn) - require.NoError(t, err) - - // Wait 3s to make sure there is no remaining time on the pending read - // state. - time.Sleep(3 * time.Second) - - // Check out the connection again. The remaining time should be exhausted - // requiring us to "peek" at the connection to determine if we should - _, err = p.checkOut(context.Background()) - assert.ErrorIs(t, err, io.EOF) - assert.ErrorIs(t, pendingReadError, io.EOF) - }) - - t.Run("connection attempts peek and succeeds", func(t *testing.T) { - timeout := 10 * time.Millisecond - - // Mock a TCP listener that will write a byte sequence > 5 (to avoid errors - // due to size) to the TCP socket. Have the listener sleep for 2x the - // timeout provided to the connection AFTER writing the byte sequence. This - // wiill cause the connection to timeout while reading from the socket. - addr := bootstrapConnections(t, 1, func(nc net.Conn) { - defer func() { - _ = nc.Close() - }() - - _, err := nc.Write([]byte{12, 0, 0, 0, 0, 0, 0, 0, 1}) - require.NoError(t, err) - time.Sleep(timeout * 2) - - // Write data that can be peeked at. - _, err = nc.Write([]byte{12, 0, 0, 0, 0, 0, 0, 0, 1}) - require.NoError(t, err) - - }) - - var pendingReadError error - monitor := &event.PoolMonitor{ - Event: func(pe *event.PoolEvent) { - if pe.Type == event.ConnectionPendingReadFailed { - pendingReadError = pe.Error - } - }, - } - - p := newPool( - poolConfig{ - Address: address.Address(addr.String()), - PoolMonitor: monitor, - }, - ) - defer p.close(context.Background()) - err := p.ready() - require.NoError(t, err) - - // Check out a connection and read from the socket, causing a timeout and - // pinning the connection to a pending read state. 
- conn, err := p.checkOut(context.Background()) - require.NoError(t, err) - - ctx, cancel := csot.WithTimeout(context.Background(), &timeout) - defer cancel() - - ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) - ctx = driverutil.WithRequestID(ctx, -1) - - _, err = conn.readWireMessage(ctx) - regex := regexp.MustCompile( - `^connection\(.*\[-\d+\]\) incomplete read of full message: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, - ) - assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) - - // Check in the connection with a pending read state. The next time this - // connection is checked out, it should attempt to read the pending - // response. - err = p.checkIn(conn) - require.NoError(t, err) - - // Wait 3s to make sure there is no remaining time on the pending read - // state. - time.Sleep(3 * time.Second) - - // Check out the connection again. The remaining time should be exhausted - // requiring us to "peek" at the connection to determine if we should - _, err = p.checkOut(context.Background()) - require.NoError(t, err) - require.NoError(t, pendingReadError) - }) } func assertConnectionsClosed(t *testing.T, dialer *dialer, count int) { From d15516d2f02508d78f0f7a49092ea458f030c4e8 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Thu, 24 Apr 2025 17:27:34 -0600 Subject: [PATCH 09/23] GODRIVER-3173 Fix bugs in events and logs --- internal/logger/component.go | 6 +++--- x/mongo/driver/topology/cmap_prose_test.go | 3 ++- x/mongo/driver/topology/connection.go | 1 - x/mongo/driver/topology/pool.go | 21 +++++++++++++-------- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/internal/logger/component.go b/internal/logger/component.go index 5abc3f5f79..85acf05142 100644 --- a/internal/logger/component.go +++ b/internal/logger/component.go @@ -28,9 +28,9 @@ const ( ConnectionCheckoutFailed = "Connection checkout failed" ConnectionCheckedOut = "Connection checked out" ConnectionCheckedIn = "Connection checked in" - ConnectionPendingReadStarted = "Pending read started" - ConnectionPendingReadSucceeded = "Pending read succeeded" - ConnectionPendingReadFailed = "Pending read failed" + ConnectionPendingReadStarted = "Pending response started" + ConnectionPendingReadSucceeded = "Pending response succeeded" + ConnectionPendingReadFailed = "Pending response failed" ServerSelectionFailed = "Server selection failed" ServerSelectionStarted = "Server selection started" ServerSelectionSucceeded = "Server selection succeeded" diff --git a/x/mongo/driver/topology/cmap_prose_test.go b/x/mongo/driver/topology/cmap_prose_test.go index 63a1f80a3a..1b33e263ec 100644 --- a/x/mongo/driver/topology/cmap_prose_test.go +++ b/x/mongo/driver/topology/cmap_prose_test.go @@ -361,7 +361,7 @@ func TestCMAPProse(t *testing.T) { assert.Equal(t, addr.String(), failed[0].Address) assert.Equal(t, conn.driverConnectionID, failed[0].ConnectionID) assert.Equal(t, requestID, failed[0].RequestID) - assert.Equal(t, io.EOF.Error(), failed[0].Reason) + assert.Equal(t, "error", failed[0].Reason) assert.ErrorIs(t, failed[0].Error, io.EOF) assert.Equal(t, time.Duration(0), failed[0].RemainingTime) @@ -462,6 +462,7 @@ func TestCMAPProse(t *testing.T) { assert.Equal(t, addr.String(), succeeded[0].Address) assert.Equal(t, conn.driverConnectionID, succeeded[0].ConnectionID) + assert.Equal(t, requestID, succeeded[0].RequestID) assert.Greater(t, int(succeeded[0].Duration), 0) }) } diff --git a/x/mongo/driver/topology/connection.go 
b/x/mongo/driver/topology/connection.go index 884f73187f..7edbd15ceb 100644 --- a/x/mongo/driver/topology/connection.go +++ b/x/mongo/driver/topology/connection.go @@ -50,7 +50,6 @@ func nextConnectionID() uint64 { return atomic.AddUint64(&globalConnectionID, 1) type pendingReadState struct { remainingBytes int32 requestID int32 - remainingTime *time.Duration start time.Time } diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index 55d8971e31..7bfd368818 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -827,29 +827,32 @@ func publishPendingReadFailed(pool *pool, conn *connection, err error) { return } + reason := event.ReasonError + if errors.Is(err, context.DeadlineExceeded) { + reason = event.ReasonTimedOut + } + if mustLogPoolMessage(pool) { keysAndValues := logger.KeyValues{ logger.KeyDriverConnectionID, conn.driverConnectionID, logger.KeyRequestID, prs.requestID, - logger.KeyReason, err.Error(), - logger.KeyRemainingTimeMS, *prs.remainingTime, + logger.KeyReason, reason, + logger.KeyError, err.Error(), } logPoolMessage(pool, logger.ConnectionPendingReadFailed, keysAndValues...) } if pool.monitor != nil { - event := &event.PoolEvent{ + e := &event.PoolEvent{ Type: event.ConnectionPendingResponseFailed, Address: pool.address.String(), ConnectionID: conn.driverConnectionID, RequestID: prs.requestID, - //RemainingTime: remainingTime, - Reason: err.Error(), - Error: err, + Reason: reason, + Error: err, } - - pool.monitor.Event(event) + pool.monitor.Event(e) } } @@ -863,6 +866,7 @@ func publishPendingReadSucceeded(pool *pool, conn *connection, dur time.Duration keysAndValues := logger.KeyValues{ logger.KeyDriverConnectionID, conn.driverConnectionID, logger.KeyRequestID, prs.requestID, + logger.KeyDurationMS, dur.Milliseconds(), } logPoolMessage(pool, logger.ConnectionPendingReadSucceeded, keysAndValues...) 
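The failed event now reports a CMAP-style reason alongside the raw error, and the rule is the one visible in the hunk above: a context deadline maps to the timed-out reason, anything else to the generic error reason. A tiny sketch of that classification as a standalone helper (the helper name is illustrative, not driver API):

    import (
        "context"
        "errors"

        "go.mongodb.org/mongo-driver/v2/event"
    )

    // classifyPendingResponseFailure mirrors the rule above: context deadline
    // errors are reported as timed out, everything else as a generic error.
    func classifyPendingResponseFailure(err error) string {
        if errors.Is(err, context.DeadlineExceeded) {
            return event.ReasonTimedOut
        }
        return event.ReasonError
    }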
@@ -873,6 +877,7 @@ func publishPendingReadSucceeded(pool *pool, conn *connection, dur time.Duration Type: event.ConnectionPendingResponseSucceeded, Address: pool.address.String(), ConnectionID: conn.driverConnectionID, + RequestID: prs.requestID, Duration: dur, } From 1752645a6ac0ec500c7f62d4fdb214176f994b4f Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Fri, 25 Apr 2025 15:32:53 -0600 Subject: [PATCH 10/23] GODRIVER-3173 Update test runner to use response over read --- internal/integration/unified/event.go | 66 +-- .../integration/unified/event_verification.go | 14 +- .../pending-reads.json | 552 ------------------ .../pending-reads.yml | 325 ----------- 4 files changed, 40 insertions(+), 917 deletions(-) delete mode 100644 testdata/client-side-operations-timeout/pending-reads.json delete mode 100644 testdata/client-side-operations-timeout/pending-reads.yml diff --git a/internal/integration/unified/event.go b/internal/integration/unified/event.go index fade6e40d7..9ee8fe7404 100644 --- a/internal/integration/unified/event.go +++ b/internal/integration/unified/event.go @@ -16,30 +16,30 @@ import ( type monitoringEventType string const ( - commandStartedEvent monitoringEventType = "CommandStartedEvent" - commandSucceededEvent monitoringEventType = "CommandSucceededEvent" - commandFailedEvent monitoringEventType = "CommandFailedEvent" - poolCreatedEvent monitoringEventType = "PoolCreatedEvent" - poolReadyEvent monitoringEventType = "PoolReadyEvent" - poolClearedEvent monitoringEventType = "PoolClearedEvent" - poolClosedEvent monitoringEventType = "PoolClosedEvent" - connectionCreatedEvent monitoringEventType = "ConnectionCreatedEvent" - connectionReadyEvent monitoringEventType = "ConnectionReadyEvent" - connectionClosedEvent monitoringEventType = "ConnectionClosedEvent" - connectionCheckOutStartedEvent monitoringEventType = "ConnectionCheckOutStartedEvent" - connectionCheckOutFailedEvent monitoringEventType = "ConnectionCheckOutFailedEvent" - connectionCheckedOutEvent monitoringEventType = "ConnectionCheckedOutEvent" - connectionCheckedInEvent monitoringEventType = "ConnectionCheckedInEvent" - connectionPendingReadStarted monitoringEventType = "ConnectionPendingReadStarted" - connectionPendingReadSucceeded monitoringEventType = "ConnectionPendingReadSucceeded" - connectionPendingReadFailed monitoringEventType = "ConnectionPendingReadFailed" - serverDescriptionChangedEvent monitoringEventType = "ServerDescriptionChangedEvent" - serverHeartbeatFailedEvent monitoringEventType = "ServerHeartbeatFailedEvent" - serverHeartbeatStartedEvent monitoringEventType = "ServerHeartbeatStartedEvent" - serverHeartbeatSucceededEvent monitoringEventType = "ServerHeartbeatSucceededEvent" - topologyDescriptionChangedEvent monitoringEventType = "TopologyDescriptionChangedEvent" - topologyOpeningEvent monitoringEventType = "TopologyOpeningEvent" - topologyClosedEvent monitoringEventType = "TopologyClosedEvent" + commandStartedEvent monitoringEventType = "CommandStartedEvent" + commandSucceededEvent monitoringEventType = "CommandSucceededEvent" + commandFailedEvent monitoringEventType = "CommandFailedEvent" + poolCreatedEvent monitoringEventType = "PoolCreatedEvent" + poolReadyEvent monitoringEventType = "PoolReadyEvent" + poolClearedEvent monitoringEventType = "PoolClearedEvent" + poolClosedEvent monitoringEventType = "PoolClosedEvent" + connectionCreatedEvent monitoringEventType = "ConnectionCreatedEvent" + connectionReadyEvent monitoringEventType = "ConnectionReadyEvent" + connectionClosedEvent 
monitoringEventType = "ConnectionClosedEvent" + connectionCheckOutStartedEvent monitoringEventType = "ConnectionCheckOutStartedEvent" + connectionCheckOutFailedEvent monitoringEventType = "ConnectionCheckOutFailedEvent" + connectionCheckedOutEvent monitoringEventType = "ConnectionCheckedOutEvent" + connectionCheckedInEvent monitoringEventType = "ConnectionCheckedInEvent" + connectionPendingResponseStarted monitoringEventType = "ConnectionPendingResponseStarted" + connectionPendingResponseSucceeded monitoringEventType = "ConnectionPendingResponseSucceeded" + connectionPendingResponseFailed monitoringEventType = "ConnectionPendingResponseFailed" + serverDescriptionChangedEvent monitoringEventType = "ServerDescriptionChangedEvent" + serverHeartbeatFailedEvent monitoringEventType = "ServerHeartbeatFailedEvent" + serverHeartbeatStartedEvent monitoringEventType = "ServerHeartbeatStartedEvent" + serverHeartbeatSucceededEvent monitoringEventType = "ServerHeartbeatSucceededEvent" + topologyDescriptionChangedEvent monitoringEventType = "TopologyDescriptionChangedEvent" + topologyOpeningEvent monitoringEventType = "TopologyOpeningEvent" + topologyClosedEvent monitoringEventType = "TopologyClosedEvent" ) func monitoringEventTypeFromString(eventStr string) (monitoringEventType, bool) { @@ -72,12 +72,12 @@ func monitoringEventTypeFromString(eventStr string) (monitoringEventType, bool) return connectionCheckedOutEvent, true case "connectioncheckedinevent": return connectionCheckedInEvent, true - case "connectionpendingreadstarted": - return connectionPendingReadStarted, true - case "connectionpendingreadsucceeded": - return connectionPendingReadSucceeded, true - case "connectionpendingreadfailed": - return connectionPendingReadFailed, true + case "connectionpendingresponsestarted": + return connectionPendingResponseStarted, true + case "connectionpendingresponsesucceeded": + return connectionPendingResponseSucceeded, true + case "connectionpendingresponsefailed": + return connectionPendingResponseFailed, true case "serverdescriptionchangedevent": return serverDescriptionChangedEvent, true case "serverheartbeatfailedevent": @@ -122,11 +122,11 @@ func monitoringEventTypeFromPoolEvent(evt *event.PoolEvent) monitoringEventType case event.ConnectionCheckedIn: return connectionCheckedInEvent case event.ConnectionPendingResponseStarted: - return connectionPendingReadStarted + return connectionPendingResponseStarted case event.ConnectionPendingResponseSucceeded: - return connectionPendingReadSucceeded + return connectionPendingResponseSucceeded case event.ConnectionPendingResponseFailed: - return connectionPendingReadFailed + return connectionPendingResponseFailed default: return "" } diff --git a/internal/integration/unified/event_verification.go b/internal/integration/unified/event_verification.go index 58ca7b77ac..eb3e8b49be 100644 --- a/internal/integration/unified/event_verification.go +++ b/internal/integration/unified/event_verification.go @@ -56,10 +56,10 @@ type cmapEvent struct { Reason *string `bson:"reason"` } `bson:"connectionCheckOutFailedEvent"` - ConnectionCheckedInEvent *struct{} `bson:"connectionCheckedInEvent"` - ConnectionPendingReadStarted *struct{} `bson:"connectionPendingReadStarted"` - ConnectionPendingreadSucceeded *struct{} `bson:"connectionPendingReadSucceeded"` - ConnectionPendingReadFailed *struct{} `bson:"connectionPendingReadFailed"` + ConnectionCheckedInEvent *struct{} `bson:"connectionCheckedInEvent"` + ConnectionPendingResponseStarted *struct{} 
`bson:"connectionPendingResponseStarted"` + ConnectionPendingResponseSucceeded *struct{} `bson:"connectionPendingResponseSucceeded"` + ConnectionPendingResponseFailed *struct{} `bson:"connectionPendingResponseFailed"` PoolClearedEvent *struct { HasServiceID *bool `bson:"hasServiceId"` @@ -362,15 +362,15 @@ func verifyCMAPEvents(client *clientEntity, expectedEvents *expectedEvents) erro if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionCheckedIn); err != nil { return newEventVerificationError(idx, client, "failed to get next pool event: %v", err.Error()) } - case evt.ConnectionPendingReadStarted != nil: + case evt.ConnectionPendingResponseStarted != nil: if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionPendingResponseStarted); err != nil { return newEventVerificationError(idx, client, "failed to get next pool event: %v", err.Error()) } - case evt.ConnectionPendingreadSucceeded != nil: + case evt.ConnectionPendingResponseSucceeded != nil: if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionPendingResponseSucceeded); err != nil { return newEventVerificationError(idx, client, "failed to get next pool event: %v", err.Error()) } - case evt.ConnectionPendingReadFailed != nil: + case evt.ConnectionPendingResponseFailed != nil: if _, pooled, err = getNextPoolEvent(pooled, event.ConnectionPendingResponseFailed); err != nil { return newEventVerificationError(idx, client, "failed to get next pool event: %v", err.Error()) } diff --git a/testdata/client-side-operations-timeout/pending-reads.json b/testdata/client-side-operations-timeout/pending-reads.json deleted file mode 100644 index ef7c20dedb..0000000000 --- a/testdata/client-side-operations-timeout/pending-reads.json +++ /dev/null @@ -1,552 +0,0 @@ -{ - "description": "Operation timeouts do not cause connection churn", - "schemaVersion": "1.9", - "runOnRequirements": [ - { - "minServerVersion": "4.4", - "topologies": [ - "standalone", - "replicaset" - ] - } - ], - "createEntities": [ - { - "client": { - "id": "failPointClient", - "useMultipleMongoses": false - } - }, - { - "client": { - "id": "client", - "uriOptions": { - "maxPoolSize": 1 - }, - "useMultipleMongoses": false, - "observeEvents": [ - "commandFailedEvent", - "commandSucceededEvent", - "connectionCheckedOutEvent", - "connectionCheckedInEvent", - "connectionClosedEvent", - "connectionPendingReadSucceeded", - "connectionPendingReadStarted", - "connectionPendingReadFailed" - ] - } - }, - { - "database": { - "id": "test", - "client": "client", - "databaseName": "test" - } - }, - { - "collection": { - "id": "coll", - "database": "test", - "collectionName": "coll" - } - } - ], - "initialData": [ - { - "collectionName": "coll", - "databaseName": "test", - "documents": [] - } - ], - "tests": [ - { - "description": "Write operation with successful pending read", - "operations": [ - { - "name": "failPoint", - "object": "testRunner", - "arguments": { - "client": "failPointClient", - "failPoint": { - "configureFailPoint": "failCommand", - "mode": { - "times": 1 - }, - "data": { - "failCommands": [ - "insert" - ], - "blockConnection": true, - "blockTimeMS": 750 - } - } - } - }, - { - "name": "insertOne", - "object": "coll", - "arguments": { - "timeoutMS": 50, - "document": { - "_id": 3, - "x": 1 - } - }, - "expectError": { - "isTimeoutError": true - } - }, - { - "name": "findOne", - "object": "coll", - "arguments": { - "filter": { - "_id": 1 - } - } - } - ], - "expectEvents": [ - { - "client": "client", - "events": [ - { - "commandFailedEvent": { - "commandName": 
"insert" - } - }, - { - "commandSucceededEvent": { - "commandName": "find" - } - } - ] - }, - { - "client": "client", - "eventType": "cmap", - "events": [ - { - "connectionCheckedOutEvent": {} - }, - { - "connectionCheckedInEvent": {} - }, - { - "connectionPendingReadStarted": {} - }, - { - "connectionPendingReadSucceeded": {} - }, - { - "connectionCheckedOutEvent": {} - }, - { - "connectionCheckedInEvent": {} - } - ] - } - ] - }, - { - "description": "Concurrent write operation with successful pending read", - "operations": [ - { - "name": "failPoint", - "object": "testRunner", - "arguments": { - "client": "failPointClient", - "failPoint": { - "configureFailPoint": "failCommand", - "mode": { - "times": 1 - }, - "data": { - "failCommands": [ - "insert" - ], - "blockConnection": true, - "blockTimeMS": 750 - } - } - } - }, - { - "name": "createEntities", - "object": "testRunner", - "arguments": { - "entities": [ - { - "thread": { - "id": "thread0" - } - }, - { - "thread": { - "id": "thread1" - } - } - ] - } - }, - { - "name": "runOnThread", - "object": "testRunner", - "arguments": { - "thread": "thread0", - "operation": { - "name": "insertOne", - "object": "coll", - "arguments": { - "timeoutMS": 500, - "document": { - "_id": 2 - } - } - }, - "expectError": { - "isTimeoutError": true - } - } - }, - { - "name": "waitForEvent", - "object": "testRunner", - "arguments": { - "client": "client", - "event": { - "connectionCheckedOutEvent": {} - }, - "count": 1 - } - }, - { - "name": "runOnThread", - "object": "testRunner", - "arguments": { - "thread": "thread1", - "operation": { - "name": "insertOne", - "object": "coll", - "arguments": { - "document": { - "_id": 3 - } - } - } - } - }, - { - "name": "waitForThread", - "object": "testRunner", - "arguments": { - "thread": "thread1" - } - } - ], - "expectEvents": [ - { - "client": "client", - "events": [ - { - "commandFailedEvent": { - "commandName": "insert" - } - }, - { - "commandSucceededEvent": { - "commandName": "insert" - } - } - ] - }, - { - "client": "client", - "eventType": "cmap", - "events": [ - { - "connectionCheckedOutEvent": {} - }, - { - "connectionCheckedInEvent": {} - }, - { - "connectionPendingReadStarted": {} - }, - { - "connectionPendingReadSucceeded": {} - }, - { - "connectionCheckedOutEvent": {} - }, - { - "connectionCheckedInEvent": {} - } - ] - } - ] - }, - { - "description": "Write operation with unsuccessful pending read", - "operations": [ - { - "name": "failPoint", - "object": "testRunner", - "arguments": { - "client": "failPointClient", - "failPoint": { - "configureFailPoint": "failCommand", - "mode": { - "times": 1 - }, - "data": { - "failCommands": [ - "insert" - ], - "blockConnection": true, - "blockTimeMS": 500 - } - } - } - }, - { - "name": "insertOne", - "object": "coll", - "arguments": { - "timeoutMS": 50, - "document": { - "_id": 3, - "x": 1 - } - }, - "expectError": { - "isTimeoutError": true - } - }, - { - "name": "insertOne", - "object": "coll", - "arguments": { - "timeoutMS": 400, - "document": { - "_id": 3, - "x": 1 - } - }, - "expectError": { - "isTimeoutError": true - } - } - ], - "expectEvents": [ - { - "client": "client", - "events": [ - { - "commandFailedEvent": { - "commandName": "insert" - } - } - ] - }, - { - "client": "client", - "eventType": "cmap", - "events": [ - { - "connectionCheckedOutEvent": {} - }, - { - "connectionCheckedInEvent": {} - }, - { - "connectionPendingReadStarted": {} - }, - { - "connectionPendingReadFailed": {} - }, - { - "connectionClosedEvent": { - "reason": "error" - } - } - ] - 
} - ] - }, - { - "description": "Read operation with successful pending read", - "operations": [ - { - "name": "failPoint", - "object": "testRunner", - "arguments": { - "client": "failPointClient", - "failPoint": { - "configureFailPoint": "failCommand", - "mode": { - "times": 1 - }, - "data": { - "failCommands": [ - "find" - ], - "blockConnection": true, - "blockTimeMS": 750 - } - } - } - }, - { - "name": "findOne", - "object": "coll", - "arguments": { - "timeoutMS": 50, - "filter": { - "_id": 1 - } - }, - "expectError": { - "isTimeoutError": true - } - }, - { - "name": "findOne", - "object": "coll", - "arguments": { - "filter": { - "_id": 1 - } - } - } - ], - "expectEvents": [ - { - "client": "client", - "events": [ - { - "commandFailedEvent": { - "commandName": "find" - } - }, - { - "commandSucceededEvent": { - "commandName": "find" - } - } - ] - }, - { - "client": "client", - "eventType": "cmap", - "events": [ - { - "connectionCheckedOutEvent": {} - }, - { - "connectionCheckedInEvent": {} - }, - { - "connectionPendingReadStarted": {} - }, - { - "connectionPendingReadSucceeded": {} - }, - { - "connectionCheckedOutEvent": {} - }, - { - "connectionCheckedInEvent": {} - } - ] - } - ] - }, - { - "description": "Read operation with unsuccessful pending read", - "operations": [ - { - "name": "failPoint", - "object": "testRunner", - "arguments": { - "client": "failPointClient", - "failPoint": { - "configureFailPoint": "failCommand", - "mode": { - "times": 1 - }, - "data": { - "failCommands": [ - "find" - ], - "blockConnection": true, - "blockTimeMS": 500 - } - } - } - }, - { - "name": "findOne", - "object": "coll", - "arguments": { - "timeoutMS": 50, - "filter": { - "_id": 1 - } - }, - "expectError": { - "isTimeoutError": true - } - }, - { - "name": "findOne", - "object": "coll", - "arguments": { - "timeoutMS": 400, - "filter": { - "_id": 1 - } - }, - "expectError": { - "isTimeoutError": true - } - } - ], - "expectEvents": [ - { - "client": "client", - "events": [ - { - "commandFailedEvent": { - "commandName": "find" - } - } - ] - }, - { - "client": "client", - "eventType": "cmap", - "events": [ - { - "connectionCheckedOutEvent": {} - }, - { - "connectionCheckedInEvent": {} - }, - { - "connectionPendingReadStarted": {} - }, - { - "connectionPendingReadFailed": {} - }, - { - "connectionClosedEvent": { - "reason": "error" - } - } - ] - } - ] - } - ] -} diff --git a/testdata/client-side-operations-timeout/pending-reads.yml b/testdata/client-side-operations-timeout/pending-reads.yml deleted file mode 100644 index c76f6bf1e2..0000000000 --- a/testdata/client-side-operations-timeout/pending-reads.yml +++ /dev/null @@ -1,325 +0,0 @@ -description: "Operation timeouts do not cause connection churn" - -schemaVersion: "1.9" - -runOnRequirements: - - minServerVersion: "4.4" - # TODO(SERVER-96344): When using failpoints, mongos returns MaxTimeMSExpired - # after maxTimeMS, whereas mongod returns it after - # max(blockTimeMS, maxTimeMS). Until this ticket is resolved, these tests - # will not pass on sharded clusters. 
- topologies: ["standalone", "replicaset"] - -createEntities: - - client: - id: &failPointClient failPointClient - useMultipleMongoses: false - - client: - id: &client client - uriOptions: - maxPoolSize: 1 - useMultipleMongoses: false - observeEvents: - - commandFailedEvent - - commandSucceededEvent - - connectionCheckedOutEvent - - connectionCheckedInEvent - - connectionClosedEvent - - connectionPendingReadSucceeded - - connectionPendingReadStarted - - connectionPendingReadFailed - - database: - id: &database test - client: *client - databaseName: *database - - collection: - id: &collection coll - database: *database - collectionName: *collection - -initialData: - - collectionName: *collection - databaseName: *database - documents: [] - -tests: - - description: "Write operation with successful pending read" - operations: - # Create a failpoint to block the first operation - - name: failPoint - object: testRunner - arguments: - client: *failPointClient - failPoint: - configureFailPoint: failCommand - mode: { times: 1 } - data: - failCommands: ["insert"] - blockConnection: true - blockTimeMS: 750 - - # Execute operation with timeout less than block time - - name: insertOne - object: *collection - arguments: - timeoutMS: 50 - document: { _id: 3, x: 1 } - expectError: - isTimeoutError: true - - # Execute a subsequent operation to complete the read - - name: findOne - object: *collection - arguments: - filter: { _id: 1 } - - expectEvents: - - client: *client - events: - - commandFailedEvent: - commandName: insert - - commandSucceededEvent: - commandName: find - - client: *client - eventType: cmap - events: - - connectionCheckedOutEvent: {} - - connectionCheckedInEvent: {} # insert fails - - connectionPendingReadStarted: {} - - connectionPendingReadSucceeded: {} # find op drains conn - - connectionCheckedOutEvent: {} - - connectionCheckedInEvent: {} # find succeeds - - - description: "Concurrent write operation with successful pending read" - operations: - # Create a failpoint to block the first operation - - name: failPoint - object: testRunner - arguments: - client: *failPointClient - failPoint: - configureFailPoint: failCommand - mode: { times: 1 } - data: - failCommands: ["insert"] - blockConnection: true - blockTimeMS: 750 - - # Start threads. - - name: createEntities - object: testRunner - arguments: - entities: - - thread: - id: &thread0 thread0 - - thread: - id: &thread1 thread1 - - # Run an insert in two threads. We expect the first to time out and the - # second to finish the pending read from the first and complete - # successfully. - - name: runOnThread - object: testRunner - arguments: - thread: *thread0 - operation: - name: insertOne - object: *collection - arguments: - timeoutMS: 500 - document: - _id: 2 - expectError: - isTimeoutError: true - - # Ensure the first thread checks out a connection before executing the - # operation in the second thread. This maintains concurrent behavior but - # presents the worst case scenario. - - name: waitForEvent - object: testRunner - arguments: - client: *client - event: - connectionCheckedOutEvent: {} - count: 1 - - - name: runOnThread - object: testRunner - arguments: - thread: *thread1 - operation: - name: insertOne - object: *collection - arguments: - document: - _id: 3 - - # Stop threads. 
- - name: waitForThread - object: testRunner - arguments: - thread: *thread1 - - expectEvents: - - client: *client - events: - - commandFailedEvent: - commandName: insert - - commandSucceededEvent: - commandName: insert - - client: *client - eventType: cmap - events: - - connectionCheckedOutEvent: {} # insert - - connectionCheckedInEvent: {} # insert fails - - connectionPendingReadStarted: {} - - connectionPendingReadSucceeded: {} - - connectionCheckedOutEvent: {} # find - - connectionCheckedInEvent: {} # find succeeds - - - description: "Write operation with unsuccessful pending read" - operations: - # Create a failpoint to block the first operation - - name: failPoint - object: testRunner - arguments: - client: *failPointClient - failPoint: - configureFailPoint: failCommand - mode: { times: 1 } - data: - failCommands: ["insert"] - blockConnection: true - blockTimeMS: 500 - - # Execute operation with timeout less than block time - - name: insertOne - object: *collection - arguments: - timeoutMS: 50 - document: { _id: 3, x: 1 } - expectError: - isTimeoutError: true - - # The pending read should fail - - name: insertOne - object: *collection - arguments: - timeoutMS: 400 - document: { _id: 3, x: 1 } - expectError: - isTimeoutError: true - - expectEvents: - - client: *client - events: - - commandFailedEvent: - commandName: insert - # No second failed event since we timed out attempting to check out - # the connection for the second operation - - client: *client - eventType: cmap - events: - - connectionCheckedOutEvent: {} # first insert - - connectionCheckedInEvent: {} # first insert fails - - connectionPendingReadStarted: {} - - connectionPendingReadFailed: {} - - connectionClosedEvent: # second insert times out pending read in checkout, closes - reason: error - - - description: "Read operation with successful pending read" - operations: - # Create a failpoint to block the first operation - - name: failPoint - object: testRunner - arguments: - client: *failPointClient - failPoint: - configureFailPoint: failCommand - mode: { times: 1 } - data: - failCommands: ["find"] - blockConnection: true - blockTimeMS: 750 - - # Execute operation with timeout less than block time - - name: findOne - object: *collection - arguments: - timeoutMS: 50 - filter: { _id: 1 } - expectError: - isTimeoutError: true - - # Execute a subsequent operation to complete the read - - name: findOne - object: *collection - arguments: - filter: { _id: 1 } - - expectEvents: - - client: *client - events: - - commandFailedEvent: - commandName: find - - commandSucceededEvent: - commandName: find - - client: *client - eventType: cmap - events: - - connectionCheckedOutEvent: {} # first find - - connectionCheckedInEvent: {} # first find fails - - connectionPendingReadStarted: {} - - connectionPendingReadSucceeded: {} - - connectionCheckedOutEvent: {} # second find - - connectionCheckedInEvent: {} # second find succeeds - - - description: "Read operation with unsuccessful pending read" - operations: - # Create a failpoint to block the first operation - - name: failPoint - object: testRunner - arguments: - client: *failPointClient - failPoint: - configureFailPoint: failCommand - mode: { times: 1 } - data: - failCommands: ["find"] - blockConnection: true - blockTimeMS: 500 - - # Execute operation with timeout less than block time - - name: findOne - object: *collection - arguments: - timeoutMS: 50 - filter: { _id: 1 } - expectError: - isTimeoutError: true - - # The pending read should fail - - name: findOne - object: *collection - 
arguments: - timeoutMS: 400 - filter: { _id: 1 } - expectError: - isTimeoutError: true - - expectEvents: - - client: *client - events: - - commandFailedEvent: - commandName: find - # No second failed event since we timed out attempting to check out - # the connection for the second operation - - client: *client - eventType: cmap - events: - - connectionCheckedOutEvent: {} # first find - - connectionCheckedInEvent: {} # first find fails - - connectionPendingReadStarted: {} - - connectionPendingReadFailed: {} - - connectionClosedEvent: # second find times out pending read in checkout, closes - reason: error From 3fa059d0d6fa687f80f3b56d1b472fd5668330ef Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Wed, 30 Apr 2025 16:09:07 -0600 Subject: [PATCH 11/23] GODRIVER-3173 Fix typos --- bson/default_value_encoders.go | 2 - event/monitoring.go | 9 ++- internal/integration/csot_prose_test.go | 1 - internal/logger/component.go | 58 ++++++++--------- x/mongo/driver/topology/cmap_prose_test.go | 6 +- x/mongo/driver/topology/connection.go | 18 +++--- x/mongo/driver/topology/pool.go | 74 +++++++++++----------- x/mongo/driver/topology/pool_test.go | 2 +- 8 files changed, 82 insertions(+), 88 deletions(-) diff --git a/bson/default_value_encoders.go b/bson/default_value_encoders.go index 83c409375d..bd5a20f2f9 100644 --- a/bson/default_value_encoders.go +++ b/bson/default_value_encoders.go @@ -9,7 +9,6 @@ package bson import ( "encoding/json" "errors" - "fmt" "math" "net/url" "reflect" @@ -166,7 +165,6 @@ func decimal128EncodeValue(_ EncodeContext, vw ValueWriter, val reflect.Value) e if !val.IsValid() || val.Type() != tDecimal { return ValueEncoderError{Name: "Decimal128EncodeValue", Types: []reflect.Type{tDecimal}, Received: val} } - fmt.Println(val.Interface().(Decimal128)) return vw.WriteDecimal128(val.Interface().(Decimal128)) } diff --git a/event/monitoring.go b/event/monitoring.go index 6f6db625ac..6e2cefca2f 100644 --- a/event/monitoring.go +++ b/event/monitoring.go @@ -108,11 +108,10 @@ type PoolEvent struct { Reason string `json:"reason"` // ServiceID is only set if the Type is PoolCleared and the server is deployed behind a load balancer. This field // can be used to distinguish between individual servers in a load balanced deployment. - ServiceID *bson.ObjectID `json:"serviceId"` - Interruption bool `json:"interruptInUseConnections"` - Error error `json:"error"` - RequestID int32 `json:"requestId"` - RemainingTime time.Duration `json:"remainingTime"` + ServiceID *bson.ObjectID `json:"serviceId"` + Interruption bool `json:"interruptInUseConnections"` + Error error `json:"error"` + RequestID int32 `json:"requestId"` } // PoolMonitor is a function that allows the user to gain access to events occurring in the pool diff --git a/internal/integration/csot_prose_test.go b/internal/integration/csot_prose_test.go index f2854559ce..ce7219b042 100644 --- a/internal/integration/csot_prose_test.go +++ b/internal/integration/csot_prose_test.go @@ -176,7 +176,6 @@ func TestCSOTProse(t *testing.T) { time.Millisecond, "expected ping to fail within 150ms") }) - }) mt.RunOpts("11. multi-batch bulkWrites", mtest.NewOptions().MinServerVersion("8.0"). 
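Although the JSON/YAML versions of these tests now live in the specifications submodule, the same pending-response lifecycle remains observable from application code through the pool monitor. Below is a rough sketch, modeled on the monitors used in the tests above, that tallies the three pending-response event types; attaching it to a client via the pool-monitor client option is assumed usage, not something introduced by this patch.

    import (
        "sync"

        "go.mongodb.org/mongo-driver/v2/event"
    )

    // pendingResponseCounts tallies pending-response pool events by type.
    type pendingResponseCounts struct {
        mu     sync.Mutex
        counts map[string]int
    }

    // monitor returns a PoolMonitor that records only the pending-response
    // lifecycle events and ignores everything else.
    func (c *pendingResponseCounts) monitor() *event.PoolMonitor {
        return &event.PoolMonitor{
            Event: func(pe *event.PoolEvent) {
                switch pe.Type {
                case event.ConnectionPendingResponseStarted,
                    event.ConnectionPendingResponseSucceeded,
                    event.ConnectionPendingResponseFailed:
                    c.mu.Lock()
                    if c.counts == nil {
                        c.counts = make(map[string]int)
                    }
                    c.counts[pe.Type]++
                    c.mu.Unlock()
                }
            },
        }
    }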
diff --git a/internal/logger/component.go b/internal/logger/component.go index 85acf05142..366a482b86 100644 --- a/internal/logger/component.go +++ b/internal/logger/component.go @@ -14,35 +14,35 @@ import ( ) const ( - CommandFailed = "Command failed" - CommandStarted = "Command started" - CommandSucceeded = "Command succeeded" - ConnectionPoolCreated = "Connection pool created" - ConnectionPoolReady = "Connection pool ready" - ConnectionPoolCleared = "Connection pool cleared" - ConnectionPoolClosed = "Connection pool closed" - ConnectionCreated = "Connection created" - ConnectionReady = "Connection ready" - ConnectionClosed = "Connection closed" - ConnectionCheckoutStarted = "Connection checkout started" - ConnectionCheckoutFailed = "Connection checkout failed" - ConnectionCheckedOut = "Connection checked out" - ConnectionCheckedIn = "Connection checked in" - ConnectionPendingReadStarted = "Pending response started" - ConnectionPendingReadSucceeded = "Pending response succeeded" - ConnectionPendingReadFailed = "Pending response failed" - ServerSelectionFailed = "Server selection failed" - ServerSelectionStarted = "Server selection started" - ServerSelectionSucceeded = "Server selection succeeded" - ServerSelectionWaiting = "Waiting for suitable server to become available" - TopologyClosed = "Stopped topology monitoring" - TopologyDescriptionChanged = "Topology description changed" - TopologyOpening = "Starting topology monitoring" - TopologyServerClosed = "Stopped server monitoring" - TopologyServerHeartbeatFailed = "Server heartbeat failed" - TopologyServerHeartbeatStarted = "Server heartbeat started" - TopologyServerHeartbeatSucceeded = "Server heartbeat succeeded" - TopologyServerOpening = "Starting server monitoring" + CommandFailed = "Command failed" + CommandStarted = "Command started" + CommandSucceeded = "Command succeeded" + ConnectionPoolCreated = "Connection pool created" + ConnectionPoolReady = "Connection pool ready" + ConnectionPoolCleared = "Connection pool cleared" + ConnectionPoolClosed = "Connection pool closed" + ConnectionCreated = "Connection created" + ConnectionReady = "Connection ready" + ConnectionClosed = "Connection closed" + ConnectionCheckoutStarted = "Connection checkout started" + ConnectionCheckoutFailed = "Connection checkout failed" + ConnectionCheckedOut = "Connection checked out" + ConnectionCheckedIn = "Connection checked in" + ConnectionPendingResponseStarted = "Pending response started" + ConnectionPendingResponseSucceeded = "Pending response succeeded" + ConnectionPendingResponseFailed = "Pending response failed" + ServerSelectionFailed = "Server selection failed" + ServerSelectionStarted = "Server selection started" + ServerSelectionSucceeded = "Server selection succeeded" + ServerSelectionWaiting = "Waiting for suitable server to become available" + TopologyClosed = "Stopped topology monitoring" + TopologyDescriptionChanged = "Topology description changed" + TopologyOpening = "Starting topology monitoring" + TopologyServerClosed = "Stopped server monitoring" + TopologyServerHeartbeatFailed = "Server heartbeat failed" + TopologyServerHeartbeatStarted = "Server heartbeat started" + TopologyServerHeartbeatSucceeded = "Server heartbeat succeeded" + TopologyServerOpening = "Starting server monitoring" ) const ( diff --git a/x/mongo/driver/topology/cmap_prose_test.go b/x/mongo/driver/topology/cmap_prose_test.go index 1b33e263ec..38171df40b 100644 --- a/x/mongo/driver/topology/cmap_prose_test.go +++ b/x/mongo/driver/topology/cmap_prose_test.go @@ -272,7 
+272,7 @@ func TestCMAPProse(t *testing.T) { // Need to test the case where we attempt a non-blocking read to determine if // we should refresh the remaining time. In the case of the Go Driver, we do - // this by attempt to "pee" at 1 byte with a deadline of 1ns. + // this by attempt to "peek" at 1 byte with a deadline of 1ns. t.Run("connection attempts peek but fails", func(t *testing.T) { const requestID = int32(-1) timeout := 10 * time.Millisecond @@ -342,7 +342,7 @@ func TestCMAPProse(t *testing.T) { time.Sleep(3 * time.Second) // Check out the connection again. The remaining time should be exhausted - // requiring us to "peek" at the connection to determine if we should + // requiring us to "peek" at the connection to determine if we should. _, err = p.checkOut(context.Background()) assert.ErrorIs(t, err, io.EOF) @@ -363,7 +363,6 @@ func TestCMAPProse(t *testing.T) { assert.Equal(t, requestID, failed[0].RequestID) assert.Equal(t, "error", failed[0].Reason) assert.ErrorIs(t, failed[0].Error, io.EOF) - assert.Equal(t, time.Duration(0), failed[0].RemainingTime) // There should be 0 ConnectionPendingResponseSucceeded event. require.Len(t, poolEventsByType[event.ConnectionPendingResponseSucceeded], 0) @@ -442,6 +441,7 @@ func TestCMAPProse(t *testing.T) { // Check out the connection again. The remaining time should be exhausted // requiring us to "peek" at the connection to determine if we should + // close as not alive. _, err = p.checkOut(context.Background()) require.NoError(t, err) diff --git a/x/mongo/driver/topology/connection.go b/x/mongo/driver/topology/connection.go index 7edbd15ceb..a7ff24e935 100644 --- a/x/mongo/driver/topology/connection.go +++ b/x/mongo/driver/topology/connection.go @@ -47,7 +47,7 @@ var ( func nextConnectionID() uint64 { return atomic.AddUint64(&globalConnectionID, 1) } -type pendingReadState struct { +type pendingResponseState struct { remainingBytes int32 requestID int32 start time.Time @@ -88,11 +88,11 @@ type connection struct { // accessTokens in the OIDC authenticator cache. oidcTokenGenID uint64 - // pendingReadState contains information required to attempt a pending read + // pendingResponseState contains information required to attempt a pending read // in the event of a socket timeout for an operation that has appended // maxTimeMS to the wire message. - pendingReadState *pendingReadState - pendingReadMu sync.Mutex + pendingResponseState *pendingResponseState + pendingResponseStateMu sync.Mutex } // newConnection handles the creation of a connection. It does not connect the connection. @@ -415,14 +415,14 @@ func (c *connection) readWireMessage(ctx context.Context) ([]byte, error) { dst, errMsg, err := c.read(ctx) if err != nil { - c.pendingReadMu.Lock() - if c.pendingReadState == nil { + c.pendingResponseStateMu.Lock() + if c.pendingResponseState == nil { // If there is no pending read on the connection, use the pre-CSOT // behavior and close the connection because we don't know if there are // other bytes left to read. 
c.close() } - c.pendingReadMu.Unlock() + c.pendingResponseStateMu.Unlock() message := errMsg if errors.Is(err, io.EOF) { message = "socket was unexpectedly closed" @@ -490,7 +490,7 @@ func (c *connection) read(ctx context.Context) (bytesRead []byte, errMsg string, if l := int32(n); l == 0 && isCSOTTimeout(err) && driverutil.HasMaxTimeMS(ctx) { requestID, _ := driverutil.GetRequestID(ctx) - c.pendingReadState = &pendingReadState{ + c.pendingResponseState = &pendingResponseState{ remainingBytes: l, requestID: requestID, start: time.Now(), @@ -512,7 +512,7 @@ func (c *connection) read(ctx context.Context) (bytesRead []byte, errMsg string, if remainingBytes > 0 && isCSOTTimeout(err) && driverutil.HasMaxTimeMS(ctx) { requestID, _ := driverutil.GetRequestID(ctx) - c.pendingReadState = &pendingReadState{ + c.pendingResponseState = &pendingResponseState{ remainingBytes: remainingBytes, requestID: requestID, start: time.Now(), diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index 7bfd368818..02e24ef532 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -578,7 +578,7 @@ func (p *pool) checkOut(ctx context.Context) (conn *connection, err error) { return nil, w.err } - if err := awaitPendingRead(ctx, p, w.conn); err != nil { + if err := awaitPendingResponse(ctx, p, w.conn); err != nil { return nil, err } @@ -638,7 +638,7 @@ func (p *pool) checkOut(ctx context.Context) (conn *connection, err error) { return nil, w.err } - if err := awaitPendingRead(ctx, p, w.conn); err != nil { + if err := awaitPendingResponse(ctx, p, w.conn); err != nil { return nil, err } @@ -781,19 +781,18 @@ func (p *pool) removeConnection(conn *connection, reason reason, err error) erro return nil } -// PendingReadTimeout is the maximum amount of the to wait when trying to read -// the server reply on a connection after an operation timed out. The -// default is 400 milliseconds. This value is refreshed for every 4KB read from -// the TCP stream. +// PendingResponseTimeout is the maximum amount of the to wait when trying to +// read the server reply on a connection after an operation timed out. The +// default is 3000 milliseconds. // -// Deprecated: PendingReadTimeout is intended for internal use only and may be -// removed or modified at any time. -var PendingReadTimeout = 3000 * time.Millisecond +// Deprecated: PendingResponseTimeout is intended for internal use only and may +// be removed or modified at any time. +var PendingResponseTimeout = 3000 * time.Millisecond -// publishPendingReadStarted will log a message to the pool logger and +// publishPendingResponseStarted will log a message to the pool logger and // publish an event to the pool monitor if they are set. -func publishPendingReadStarted(pool *pool, conn *connection) { - prs := conn.pendingReadState +func publishPendingResponseStarted(pool *pool, conn *connection) { + prs := conn.pendingResponseState if prs == nil { return } @@ -805,7 +804,7 @@ func publishPendingReadStarted(pool *pool, conn *connection) { logger.KeyRequestID, prs.requestID, } - logPoolMessage(pool, logger.ConnectionPendingReadStarted, keysAndValues...) + logPoolMessage(pool, logger.ConnectionPendingResponseStarted, keysAndValues...) } // publish an event to the pool monitor if it is set. 
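PendingResponseTimeout behaves as a sliding budget rather than a per-attempt deadline: it is measured from the last time the drain made progress, any successful read pushes that start time forward, and running out of budget without progress closes the connection. A compact sketch of just that bookkeeping, with illustrative names (the real awaitPendingResponse also handles check-in, events, and the peek path):

    import "time"

    // pendingResponseBudget mirrors the 3000ms PendingResponseTimeout default.
    const pendingResponseBudget = 3 * time.Second

    type drainState struct {
        start time.Time // last moment the drain made progress
    }

    // remaining reports how much of the sliding budget is left at "now"; a
    // non-positive result means only a liveness peek should be attempted.
    func (s drainState) remaining(now time.Time) time.Duration {
        return s.start.Add(pendingResponseBudget).Sub(now)
    }

    // record notes the outcome of one drain attempt: progress refreshes the
    // budget, and exceeding the budget without progress means the connection
    // should be closed rather than retried.
    func (s *drainState) record(now time.Time, bytesRead int) (closeConn bool) {
        if bytesRead > 0 {
            s.start = now
        }
        return now.Sub(s.start) > pendingResponseBudget
    }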
@@ -821,8 +820,8 @@ func publishPendingReadStarted(pool *pool, conn *connection) { } } -func publishPendingReadFailed(pool *pool, conn *connection, err error) { - prs := conn.pendingReadState +func publishPendingResponseFailed(pool *pool, conn *connection, err error) { + prs := conn.pendingResponseState if prs == nil { return } @@ -840,7 +839,7 @@ func publishPendingReadFailed(pool *pool, conn *connection, err error) { logger.KeyError, err.Error(), } - logPoolMessage(pool, logger.ConnectionPendingReadFailed, keysAndValues...) + logPoolMessage(pool, logger.ConnectionPendingResponseFailed, keysAndValues...) } if pool.monitor != nil { @@ -856,8 +855,8 @@ func publishPendingReadFailed(pool *pool, conn *connection, err error) { } } -func publishPendingReadSucceeded(pool *pool, conn *connection, dur time.Duration) { - prs := conn.pendingReadState +func publishPendingResponseSucceeded(pool *pool, conn *connection, dur time.Duration) { + prs := conn.pendingResponseState if prs == nil { return } @@ -869,7 +868,7 @@ func publishPendingReadSucceeded(pool *pool, conn *connection, dur time.Duration logger.KeyDurationMS, dur.Milliseconds(), } - logPoolMessage(pool, logger.ConnectionPendingReadSucceeded, keysAndValues...) + logPoolMessage(pool, logger.ConnectionPendingResponseSucceeded, keysAndValues...) } if pool.monitor != nil { @@ -901,8 +900,8 @@ func peekConnectionAlive(conn *connection) (int, error) { return len(bytes), err } -func attemptPendingRead(ctx context.Context, conn *connection, remainingTime time.Duration) (int, error) { - pendingreadState := conn.pendingReadState +func attemptPendingResponse(ctx context.Context, conn *connection, remainingTime time.Duration) (int, error) { + pendingreadState := conn.pendingResponseState if pendingreadState == nil { return 0, fmt.Errorf("no pending read state") } @@ -986,25 +985,25 @@ func attemptPendingRead(ctx context.Context, conn *connection, remainingTime tim return int(totalRead), nil } -// awaitPendingRead sets a new read deadline on the provided connection and +// awaitPendingResponse sets a new read deadline on the provided connection and // tries to read any bytes returned by the server. If there are any errors, the // connection will be checked back into the pool to be retried. -func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { - conn.pendingReadMu.Lock() - defer conn.pendingReadMu.Unlock() +func awaitPendingResponse(ctx context.Context, pool *pool, conn *connection) error { + conn.pendingResponseStateMu.Lock() + defer conn.pendingResponseStateMu.Unlock() // If there are no bytes pending read, do nothing. - if conn.pendingReadState == nil { + if conn.pendingResponseState == nil { return nil } - publishPendingReadStarted(pool, conn) + publishPendingResponseStarted(pool, conn) var ( - pendingReadState = conn.pendingReadState - remainingTime = pendingReadState.start.Add(PendingReadTimeout).Sub(time.Now()) - err error - bytesRead int + pendingResponseState = conn.pendingResponseState + remainingTime = pendingResponseState.start.Add(PendingResponseTimeout).Sub(time.Now()) + err error + bytesRead int ) st := time.Now() @@ -1013,7 +1012,7 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { // aliveness. In such cases, we don't want to close the connection. 
bytesRead, err = peekConnectionAlive(conn) } else { - bytesRead, err = attemptPendingRead(ctx, conn, remainingTime) + bytesRead, err = attemptPendingResponse(ctx, conn, remainingTime) } endTime := time.Now() @@ -1029,13 +1028,12 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { // operations. This is likely a bug in the Go Driver. So it's possible that // the connection is idle at the point of check-in. defer func() { - publishPendingReadFailed(pool, conn, err) + publishPendingResponseFailed(pool, conn, err) _ = pool.checkInNoEvent(conn) }() if netErr, ok := err.(net.Error); ok && !netErr.Timeout() { - fmt.Println(1) if err := conn.close(); err != nil { return err } @@ -1045,11 +1043,11 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { // If the read was successful, then we should refresh the remaining timeout. if bytesRead > 0 { - pendingReadState.start = endTime + pendingResponseState.start = endTime } // If the remaining time has been exceeded, then close the connection. - if endTime.Sub(pendingReadState.start) > PendingReadTimeout { + if endTime.Sub(pendingResponseState.start) > PendingResponseTimeout { if err := conn.close(); err != nil { return err } @@ -1059,9 +1057,9 @@ func awaitPendingRead(ctx context.Context, pool *pool, conn *connection) error { return err } - publishPendingReadSucceeded(pool, conn, endDuration) + publishPendingResponseSucceeded(pool, conn, endDuration) - conn.pendingReadState = nil + conn.pendingResponseState = nil return nil } diff --git a/x/mongo/driver/topology/pool_test.go b/x/mongo/driver/topology/pool_test.go index c05e8133f0..f0ba122646 100644 --- a/x/mongo/driver/topology/pool_test.go +++ b/x/mongo/driver/topology/pool_test.go @@ -1273,7 +1273,7 @@ func TestAwaitPendingRead(t *testing.T) { `^connection\(.*\[-\d+\]\) incomplete read of message header: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, ) assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) - assert.Nil(t, conn.pendingReadState, "conn.awaitRemainingBytes should be nil") + assert.Nil(t, conn.pendingResponseState, "conn.awaitRemainingBytes should be nil") }) t.Run("timeout reading message header, successful background read", func(t *testing.T) { timeout := 10 * time.Millisecond From 72a4a0defd6c12740e33c37f79b8da414b36bd02 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Wed, 30 Apr 2025 16:31:53 -0600 Subject: [PATCH 12/23] GODRIVER-3173 Update spec --- testdata/specifications | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testdata/specifications b/testdata/specifications index 6118debee4..43d2c7bacd 160000 --- a/testdata/specifications +++ b/testdata/specifications @@ -1 +1 @@ -Subproject commit 6118debee41cfd1bca197b315bd1f10ad95f66ae +Subproject commit 43d2c7bacd62249de8d2173bf8ee39e6fd7a686e From 02f048a5a05684a71dea4f05fd05ef26c75e5ddf Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Mon, 5 May 2025 13:23:41 -0600 Subject: [PATCH 13/23] GODRIVER-3173 Add duration to failure; update tests --- internal/integration/unified/schema_version.go | 2 +- testdata/specifications | 2 +- x/mongo/driver/topology/cmap_prose_test.go | 1 + x/mongo/driver/topology/pool.go | 6 ++++-- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/internal/integration/unified/schema_version.go b/internal/integration/unified/schema_version.go index 7908b39017..f1f5447883 100644 --- a/internal/integration/unified/schema_version.go +++ 
b/internal/integration/unified/schema_version.go @@ -16,7 +16,7 @@ import ( var ( supportedSchemaVersions = map[int]string{ - 1: "1.22", + 1: "1.24", } ) diff --git a/testdata/specifications b/testdata/specifications index 43d2c7bacd..d286786bc1 160000 --- a/testdata/specifications +++ b/testdata/specifications @@ -1 +1 @@ -Subproject commit 43d2c7bacd62249de8d2173bf8ee39e6fd7a686e +Subproject commit d286786bc1d4d17cf91a89ee0e8a5cbb4e14ee2f diff --git a/x/mongo/driver/topology/cmap_prose_test.go b/x/mongo/driver/topology/cmap_prose_test.go index 38171df40b..55ca7ec000 100644 --- a/x/mongo/driver/topology/cmap_prose_test.go +++ b/x/mongo/driver/topology/cmap_prose_test.go @@ -362,6 +362,7 @@ func TestCMAPProse(t *testing.T) { assert.Equal(t, conn.driverConnectionID, failed[0].ConnectionID) assert.Equal(t, requestID, failed[0].RequestID) assert.Equal(t, "error", failed[0].Reason) + assert.Greater(t, int(failed[0].Duration), 0) assert.ErrorIs(t, failed[0].Error, io.EOF) // There should be 0 ConnectionPendingResponseSucceeded event. diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index 02e24ef532..4373041f11 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -820,7 +820,7 @@ func publishPendingResponseStarted(pool *pool, conn *connection) { } } -func publishPendingResponseFailed(pool *pool, conn *connection, err error) { +func publishPendingResponseFailed(pool *pool, conn *connection, dur time.Duration, err error) { prs := conn.pendingResponseState if prs == nil { return @@ -850,6 +850,7 @@ func publishPendingResponseFailed(pool *pool, conn *connection, err error) { RequestID: prs.requestID, Reason: reason, Error: err, + Duration: dur, } pool.monitor.Event(e) } @@ -1028,7 +1029,8 @@ func awaitPendingResponse(ctx context.Context, pool *pool, conn *connection) err // operations. This is likely a bug in the Go Driver. So it's possible that // the connection is idle at the point of check-in. defer func() { - publishPendingResponseFailed(pool, conn, err) + dur := time.Since(st) + publishPendingResponseFailed(pool, conn, dur, err) _ = pool.checkInNoEvent(conn) }() From 328db7e9e28f70307d69cf028e2905588f2261ad Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Fri, 9 May 2025 14:10:43 -0600 Subject: [PATCH 14/23] GODRIVER-3173 Shore up testing --- testdata/specifications | 2 +- x/mongo/driver/topology/cmap_prose_test.go | 6 ++++++ x/mongo/driver/topology/pool.go | 9 ++++++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/testdata/specifications b/testdata/specifications index d286786bc1..0c41c8b283 160000 --- a/testdata/specifications +++ b/testdata/specifications @@ -1 +1 @@ -Subproject commit d286786bc1d4d17cf91a89ee0e8a5cbb4e14ee2f +Subproject commit 0c41c8b28321e093e7e156c72abe65744cc1b467 diff --git a/x/mongo/driver/topology/cmap_prose_test.go b/x/mongo/driver/topology/cmap_prose_test.go index 55ca7ec000..cc24a029cb 100644 --- a/x/mongo/driver/topology/cmap_prose_test.go +++ b/x/mongo/driver/topology/cmap_prose_test.go @@ -367,6 +367,9 @@ func TestCMAPProse(t *testing.T) { // There should be 0 ConnectionPendingResponseSucceeded event. require.Len(t, poolEventsByType[event.ConnectionPendingResponseSucceeded], 0) + + // The connection should have been closed. 
+ require.Len(t, poolEventsByType[event.ConnectionClosed], 1) }) t.Run("connection attempts peek and succeeds", func(t *testing.T) { @@ -465,6 +468,9 @@ func TestCMAPProse(t *testing.T) { assert.Equal(t, conn.driverConnectionID, succeeded[0].ConnectionID) assert.Equal(t, requestID, succeeded[0].RequestID) assert.Greater(t, int(succeeded[0].Duration), 0) + + // The connection should not have been closed. + require.Len(t, poolEventsByType[event.ConnectionClosed], 0) }) } diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index 4373041f11..1feb07811c 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -1035,7 +1035,14 @@ func awaitPendingResponse(ctx context.Context, pool *pool, conn *connection) err _ = pool.checkInNoEvent(conn) }() - if netErr, ok := err.(net.Error); ok && !netErr.Timeout() { + isCSOTTimeout := func(err error) bool { + // If the error was a timeout error, instead of closing the + // connection mark it as awaiting response so the pool can read the + // response before making it available to other operations. + nerr := net.Error(nil) + return errors.As(err, &nerr) && nerr.Timeout() + } + if !isCSOTTimeout(err) { if err := conn.close(); err != nil { return err } From 106003a46824e86d2532c3ce3664040f4dfb7336 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Wed, 25 Jun 2025 16:09:55 -0600 Subject: [PATCH 15/23] GODRIVER-3173 Add CMAP prose tests --- .evergreen/config.yml | 45 +++++- .evergreen/setup-system.sh | 19 +-- internal/integration/cmap_prose_test.go | 151 +++++++++++++++++++++ internal/integration/mtest/mongotest.go | 9 +- testdata/specifications | 2 +- x/mongo/driver/operation.go | 4 + x/mongo/driver/topology/cmap_prose_test.go | 103 +------------- x/mongo/driver/topology/pool.go | 34 ++++- 8 files changed, 251 insertions(+), 116 deletions(-) create mode 100644 internal/integration/cmap_prose_test.go diff --git a/.evergreen/config.yml b/.evergreen/config.yml index 332175496a..9bc9f0ac51 100644 --- a/.evergreen/config.yml +++ b/.evergreen/config.yml @@ -543,6 +543,16 @@ functions: binary: bash args: ["${DRIVERS_TOOLS}/.evergreen/csfle/await-servers.sh"] + start-mongoproxy: + - command: ec2.assume_role # TODO: not sure this is needd + params: + role_arn: ${aws_test_secrets_role} + - command: subprocess.exec + params: + binary: bash + background: true + args: ["${DRIVERS_TOOLS}/.evergreen/start-mongoproxy.sh"] + run-kms-tls-test: - command: subprocess.exec type: test @@ -658,6 +668,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -673,6 +684,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -689,6 +701,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -705,6 +718,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -720,7 +734,8 @@ tasks: TOPOLOGY: "server" AUTH: "auth" SSL: "ssl" - - func: start-cse-servers + - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -736,6 +751,7 @@ tasks: AUTH: "auth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -751,6 +767,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers + - func: start-mongoproxy - func: 
run-tests vars: TOPOLOGY: "server" @@ -766,7 +783,8 @@ tasks: TOPOLOGY: "server" AUTH: "auth" SSL: "ssl" - - func: start-cse-servers + - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -783,6 +801,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -1203,6 +1222,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "replica_set" @@ -1219,6 +1239,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "replica_set" @@ -1234,6 +1255,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "replica_set" @@ -1249,6 +1271,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "replica_set" @@ -1267,6 +1290,7 @@ tasks: AUTH: "auth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "replica_set" @@ -1282,6 +1306,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1297,6 +1322,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1313,6 +1339,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1329,6 +1356,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1345,6 +1373,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1360,6 +1389,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1376,6 +1406,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1392,6 +1423,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1408,6 +1440,7 @@ tasks: AUTH: "auth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1479,6 +1512,7 @@ tasks: SSL: "nossl" REQUIRE_API_VERSION: true - func: start-cse-servers + - func: start-mongoproxy - func: run-versioned-api-test vars: TOPOLOGY: "server" @@ -1496,6 +1530,7 @@ tasks: SSL: "nossl" REQUIRE_API_VERSION: true - func: start-cse-servers + - func: start-mongoproxy - func: run-versioned-api-test vars: TOPOLOGY: "sharded_cluster" @@ -1513,6 +1548,7 @@ tasks: SSL: "nossl" ORCHESTRATION_FILE: "versioned-api-testing.json" - func: start-cse-servers + - func: start-mongoproxy - func: run-versioned-api-test vars: TOPOLOGY: "server" @@ -1528,6 +1564,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-kms-tls-test vars: KMS_TLS_TESTCASE: "INVALID_CERT" @@ -1544,6 +1581,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-kms-tls-test vars: KMS_TLS_TESTCASE: "INVALID_HOSTNAME" @@ -1560,6 +1598,7 @@ tasks: AUTH: 
"noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-kmip-tests vars: TOPOLOGY: "server" @@ -1575,12 +1614,14 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers + - func: start-mongoproxy - func: run-retry-kms-requests - name: "test-serverless" tags: ["serverless"] commands: - func: start-cse-servers + - func: start-mongoproxy - func: "run-serverless-tests" vars: MONGO_GO_DRIVER_COMPRESSOR: "snappy" diff --git a/.evergreen/setup-system.sh b/.evergreen/setup-system.sh index 542060fee4..a7527d5021 100755 --- a/.evergreen/setup-system.sh +++ b/.evergreen/setup-system.sh @@ -4,7 +4,7 @@ set -eu # Set up default environment variables. -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) PROJECT_DIRECTORY=$(dirname $SCRIPT_DIR) pushd $PROJECT_DIRECTORY ROOT_DIR=$(dirname $PROJECT_DIRECTORY) @@ -40,18 +40,19 @@ PATH="${GOROOT}/bin:${GOPATH}/bin:${MONGODB_BINARIES}:${EXTRA_PATH}:${PATH}" # Get the current unique version of this checkout. if [ "${IS_PATCH:-}" = "true" ]; then - CURRENT_VERSION=$(git describe)-patch-${VERSION_ID} + CURRENT_VERSION=$(git describe)-patch-${VERSION_ID} else - CURRENT_VERSION=latest + CURRENT_VERSION=latest fi # Ensure a checkout of drivers-tools. if [ ! -d "$DRIVERS_TOOLS" ]; then - git clone https://github.com/mongodb-labs/drivers-evergreen-tools $DRIVERS_TOOLS + #git clone https://github.com/mongodb-labs/drivers-evergreen-tools $DRIVERS_TOOLS + git clone -b DRIVERS-2884 https://github.com/prestonvasquez/drivers-evergreen-tools $DRIVERS_TOOLS fi # Write the .env file for drivers-tools. -cat < ${DRIVERS_TOOLS}/.env +cat <${DRIVERS_TOOLS}/.env SKIP_LEGACY_SHELL=1 DRIVERS_TOOLS="$DRIVERS_TOOLS" MONGO_ORCHESTRATION_HOME="$MONGO_ORCHESTRATION_HOME" @@ -67,7 +68,7 @@ go env go install github.com/go-task/task/v3/cmd/task@v3.39.1 # Write our own env file. -cat < env.sh +cat <env.sh export GOROOT="$GOROOT" export GOPATH="$GOPATH" export GOCACHE="$GOCACHE" @@ -78,12 +79,12 @@ export PATH="$PATH" EOT if [ "Windows_NT" = "$OS" ]; then - echo "export USERPROFILE=$USERPROFILE" >> env.sh - echo "export HOME=$HOME" >> env.sh + echo "export USERPROFILE=$USERPROFILE" >>env.sh + echo "export HOME=$HOME" >>env.sh fi # source the env.sh file and write the expansion file. -cat < expansion.yml +cat <expansion.yml CURRENT_VERSION: "$CURRENT_VERSION" DRIVERS_TOOLS: "$DRIVERS_TOOLS" PROJECT_DIRECTORY: "$PROJECT_DIRECTORY" diff --git a/internal/integration/cmap_prose_test.go b/internal/integration/cmap_prose_test.go new file mode 100644 index 0000000000..cec86398ba --- /dev/null +++ b/internal/integration/cmap_prose_test.go @@ -0,0 +1,151 @@ +package integration + +import ( + "context" + "fmt" + "os" + "testing" + "time" + + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/internal/assert" + "go.mongodb.org/mongo-driver/v2/internal/integration/mtest" + "go.mongodb.org/mongo-driver/v2/internal/require" + "go.mongodb.org/mongo-driver/v2/mongo/options" +) + +func TestCMAPProse_PendingResponse_ConnectionAliveness(t *testing.T) { + const timeoutMS = 200 * time.Millisecond + + // Establish a direct connection to the proxy. 
+ proxyURI := "mongodb://127.0.0.1:28017/?directConnection=true" + //t.Setenv("MONGODB_URI", proxyURI) + // + // Print the MONGODB_URI environment variable for debugging purposes + fmt.Println("MONGODB_URI", os.Getenv("MONGODB_URI")) + + clientOpts := options.Client().ApplyURI(proxyURI).SetMaxPoolSize(1).SetDirect(true) + mt := mtest.New(t, mtest.NewOptions().CreateClient(false).ClientOptions(clientOpts)) + + fmt.Println("Running TestCMAPProse_PendingResponse_ConnectionAliveness...") + + mt.Run("fails", func(mt *mtest.T) { + // Create a command document that instructs the proxy to dely 2x the + // timeoutMS for the operation then never respond. + proxyTest := bson.D{ + {Key: "actions", Value: bson.A{ + // Causes the timeout in the initial try. + bson.D{{Key: "delayMs", Value: 400}}, + // Send nothing back to the client, ever. + bson.D{{Key: "sendBytes", Value: 0}}, + }}, + } + + type myStruct struct { + Name string `bson:"name"` + Age int `bson:"age"` + } + + cmd := bson.D{ + {Key: "insert", Value: "mycoll"}, + {Key: "documents", Value: bson.A{myStruct{Name: "Alice", Age: 30}}}, + {Key: "proxyTest", Value: proxyTest}, + } + + db := mt.Client.Database("testdb") + coll := db.Collection("mycoll") + + _ = coll.Drop(context.Background()) // Ensure the collection is clean before the test. + + // Run the command against the proxy with timeoutMS. + ctx, cancel := context.WithTimeout(context.Background(), timeoutMS) + defer cancel() + + err := db.RunCommand(ctx, cmd).Err() + require.Error(mt, err, "expected command to fail due to timeout") + assert.ErrorIs(mt, err, context.DeadlineExceeded) + + // Wait 3 seconds to ensure there is time left in the pending response state. + time.Sleep(3 * time.Second) + + // Run an insertOne without a timeout. Expect the pending response to fail + // at the aliveness check. However, the insert should succeed since pending + // response failures are retryable. + _, err = coll.InsertOne(context.Background(), myStruct{Name: "Bob", Age: 25}) + require.NoError(mt, err, "expected insertOne to succeed after pending response aliveness check") + + // There should be 1 ConnectionPendingResponseStarted event. + assert.Equal(mt, 1, mt.NumberConnectionsPendingReadStarted()) + + // There should be 1 ConnectionPendingResponseFailed event. + assert.Equal(mt, 1, mt.NumberConnectionsPendingReadFailed()) + + // There should be 0 ConnectionPendingResponseSucceeded event. + assert.Equal(mt, 0, mt.NumberConnectionsPendingReadSucceeded()) + + // There should be 1 ConnectionClosed event. + assert.Equal(mt, 1, mt.NumberConnectionsClosed()) + }) + + mt.Run("succeeds", func(mt *mtest.T) { + // Create a command document that instructs the proxy to dely 2x the + // timeoutMS for the operation, then responds with exactly 1 byte for + // the alivness check and finally with the entire message. + proxyTest := bson.D{ + {Key: "actions", Value: bson.A{ + // Causes the timeout in the initial try. + bson.D{{Key: "delayMs", Value: 400}}, + // Send exactly one byte so that the aliveness check succeeds. + bson.D{{Key: "sendBytes", Value: 1}}, + // Cause another delay for the retry operation. + bson.D{{Key: "delayMs", Value: 10}}, + // Send the rest of the response for discarding on retry. 
+ bson.D{{Key: "sendAll", Value: true}}, + }}, + } + + type myStruct struct { + Name string `bson:"name"` + Age int `bson:"age"` + } + + cmd := bson.D{ + {Key: "insert", Value: "mycoll"}, + {Key: "documents", Value: bson.A{myStruct{Name: "Alice", Age: 30}}}, + {Key: "proxyTest", Value: proxyTest}, + } + + db := mt.Client.Database("testdb") + coll := db.Collection("mycoll") + + _ = coll.Drop(context.Background()) // Ensure the collection is clean before the test. + + // Run the command against the proxy with timeoutMS. + ctx, cancel := context.WithTimeout(context.Background(), timeoutMS) + defer cancel() + + err := db.RunCommand(ctx, cmd).Err() + require.Error(mt, err, "expected command to fail due to timeout") + assert.ErrorIs(mt, err, context.DeadlineExceeded) + + // Wait 3 seconds to ensure there is time left in the pending response state. + time.Sleep(3 * time.Second) + + // Run an insertOne without a timeout. Expect the pending response to fail + // at the aliveness check. + _, err = coll.InsertOne(context.Background(), myStruct{Name: "Bob", Age: 25}) + require.NoError(mt, err, "expected insertOne to succeed after pending response aliveness check") + + // There should be 1 ConnectionPendingResponseStarted event. + assert.Equal(mt, 1, mt.NumberConnectionsPendingReadStarted()) + + // There should be 0 ConnectionPendingResponseFailed event. + assert.Equal(mt, 0, mt.NumberConnectionsPendingReadFailed()) + + // There should be 1 ConnectionPendingResponseSucceeded event. + assert.Equal(mt, 1, mt.NumberConnectionsPendingReadSucceeded()) + + // The connection should not have been closed. + assert.Equal(mt, 0, mt.NumberConnectionsClosed()) + }) +} diff --git a/internal/integration/mtest/mongotest.go b/internal/integration/mtest/mongotest.go index 1430ab6e0c..c39b801048 100644 --- a/internal/integration/mtest/mongotest.go +++ b/internal/integration/mtest/mongotest.go @@ -59,6 +59,7 @@ type T struct { connPendingReadStarted int64 connPendingReadSucceeded int64 connPendingReadFailed int64 + connClosed int64 *testing.T @@ -357,6 +358,10 @@ func (t *T) NumberConnectionsPendingReadStarted() int { return int(atomic.LoadInt64(&t.connPendingReadStarted)) } +func (t *T) NumberConnectionsClosed() int { + return int(atomic.LoadInt64(&t.connClosed)) +} + func (t *T) NumberConnectionsPendingReadSucceeded() int { return int(atomic.LoadInt64(&t.connPendingReadSucceeded)) } @@ -666,8 +671,10 @@ func (t *T) createTestClient() { atomic.AddInt64(&t.connPendingReadStarted, 1) case event.ConnectionPendingResponseSucceeded: atomic.AddInt64(&t.connPendingReadSucceeded, 1) - case event.ConnectionCheckOutFailed: + case event.ConnectionPendingResponseFailed: atomic.AddInt64(&t.connPendingReadFailed, 1) + case event.ConnectionClosed: + atomic.AddInt64(&t.connClosed, 1) } }, }) diff --git a/testdata/specifications b/testdata/specifications index 0c41c8b283..4e5d624565 160000 --- a/testdata/specifications +++ b/testdata/specifications @@ -1 +1 @@ -Subproject commit 0c41c8b28321e093e7e156c72abe65744cc1b467 +Subproject commit 4e5d6245655f30f13e42a15bd340f57f6729bb27 diff --git a/x/mongo/driver/operation.go b/x/mongo/driver/operation.go index 06a896da42..91bcda8b89 100644 --- a/x/mongo/driver/operation.go +++ b/x/mongo/driver/operation.go @@ -76,6 +76,10 @@ type RetryablePoolError interface { Retryable() bool } +type RetryablePendingResponseError interface { + Retryable() bool +} + // labeledError is an error that can have error labels added to it. 
type labeledError interface { error diff --git a/x/mongo/driver/topology/cmap_prose_test.go b/x/mongo/driver/topology/cmap_prose_test.go index cc24a029cb..6a48105749 100644 --- a/x/mongo/driver/topology/cmap_prose_test.go +++ b/x/mongo/driver/topology/cmap_prose_test.go @@ -9,7 +9,6 @@ package topology import ( "context" "errors" - "io" "net" "regexp" "sync" @@ -273,104 +272,10 @@ func TestCMAPProse(t *testing.T) { // Need to test the case where we attempt a non-blocking read to determine if // we should refresh the remaining time. In the case of the Go Driver, we do // this by attempt to "peek" at 1 byte with a deadline of 1ns. - t.Run("connection attempts peek but fails", func(t *testing.T) { - const requestID = int32(-1) - timeout := 10 * time.Millisecond - - // Mock a TCP listener that will write a byte sequence > 5 (to avoid errors - // due to size) to the TCP socket. Have the listener sleep for 2x the - // timeout provided to the connection AFTER writing the byte sequence. This - // wiill cause the connection to timeout while reading from the socket. - addr := bootstrapConnections(t, 1, func(nc net.Conn) { - defer func() { - _ = nc.Close() - }() - - _, err := nc.Write([]byte{12, 0, 0, 0, 0, 0, 0, 0, 1}) - require.NoError(t, err) - time.Sleep(timeout * 2) - - // Write nothing so that the 1 millisecond "non-blocking" peek fails. - }) - - poolEventsByType := make(map[string][]event.PoolEvent) - poolEventsByTypeMu := &sync.Mutex{} - - monitor := &event.PoolMonitor{ - Event: func(pe *event.PoolEvent) { - poolEventsByTypeMu.Lock() - poolEventsByType[pe.Type] = append(poolEventsByType[pe.Type], *pe) - poolEventsByTypeMu.Unlock() - }, - } - - p := newPool( - poolConfig{ - Address: address.Address(addr.String()), - PoolMonitor: monitor, - }, - ) - defer p.close(context.Background()) - err := p.ready() - require.NoError(t, err) - - // Check out a connection and read from the socket, causing a timeout and - // pinning the connection to a pending read state. - conn, err := p.checkOut(context.Background()) - require.NoError(t, err) - - ctx, cancel := csot.WithTimeout(context.Background(), &timeout) - defer cancel() - - ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) - ctx = driverutil.WithRequestID(ctx, requestID) - - _, err = conn.readWireMessage(ctx) - regex := regexp.MustCompile( - `^connection\(.*\[-\d+\]\) incomplete read of full message: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, - ) - assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) - - // Check in the connection with a pending read state. The next time this - // connection is checked out, it should attempt to read the pending - // response. - err = p.checkIn(conn) - require.NoError(t, err) - - // Wait 3s to make sure there is no remaining time on the pending read - // state. - time.Sleep(3 * time.Second) - - // Check out the connection again. The remaining time should be exhausted - // requiring us to "peek" at the connection to determine if we should. - _, err = p.checkOut(context.Background()) - assert.ErrorIs(t, err, io.EOF) - - // There should be 1 ConnectionPendingResponseStarted event. - started := poolEventsByType[event.ConnectionPendingResponseStarted] - require.Len(t, started, 1) - - assert.Equal(t, addr.String(), started[0].Address) - assert.Equal(t, conn.driverConnectionID, started[0].ConnectionID) - assert.Equal(t, requestID, started[0].RequestID) - - // There should be 1 ConnectionPendingResponseFailed event. 
- failed := poolEventsByType[event.ConnectionPendingResponseFailed] - require.Len(t, failed, 1) - - assert.Equal(t, addr.String(), failed[0].Address) - assert.Equal(t, conn.driverConnectionID, failed[0].ConnectionID) - assert.Equal(t, requestID, failed[0].RequestID) - assert.Equal(t, "error", failed[0].Reason) - assert.Greater(t, int(failed[0].Duration), 0) - assert.ErrorIs(t, failed[0].Error, io.EOF) - - // There should be 0 ConnectionPendingResponseSucceeded event. - require.Len(t, poolEventsByType[event.ConnectionPendingResponseSucceeded], 0) - - // The connection should have been closed. - require.Len(t, poolEventsByType[event.ConnectionClosed], 1) - }) + //t.Run("connection attempts peek but fails", func(t *testing.T) { + // // Create a client using a direct connection to the proxy. + // client, err := mongo.Connect( + //}) t.Run("connection attempts peek and succeeds", func(t *testing.T) { const requestID = int32(-1) diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index 1feb07811c..6c4c4391c9 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -986,6 +986,32 @@ func attemptPendingResponse(ctx context.Context, conn *connection, remainingTime return int(totalRead), nil } +// poolClearedError is an error returned when the connection pool is cleared or currently paused. It +// is a retryable error. +type pendingResponseError struct { + err error +} + +var _ error = pendingResponseError{} +var _ driver.RetryablePoolError = pendingResponseError{} + +func (pre pendingResponseError) Error() string { + if pre.err == nil { + return "" + } + return pre.err.Error() +} + +// Retryable returns true. All poolClearedErrors are retryable. +func (pendingResponseError) Retryable() bool { return true } + +func (pre pendingResponseError) Unwrap() error { + if pre.err == nil { + return nil + } + return pre.err +} + // awaitPendingResponse sets a new read deadline on the provided connection and // tries to read any bytes returned by the server. If there are any errors, the // connection will be checked back into the pool to be retried. @@ -1044,9 +1070,9 @@ func awaitPendingResponse(ctx context.Context, pool *pool, conn *connection) err } if !isCSOTTimeout(err) { if err := conn.close(); err != nil { - return err + return pendingResponseError{err: err} } - return err + return pendingResponseError{err: err} } } @@ -1058,12 +1084,12 @@ func awaitPendingResponse(ctx context.Context, pool *pool, conn *connection) err // If the remaining time has been exceeded, then close the connection. 
if endTime.Sub(pendingResponseState.start) > PendingResponseTimeout { if err := conn.close(); err != nil { - return err + return pendingResponseError{err: err} } } if err != nil { - return err + return pendingResponseError{err: err} } publishPendingResponseSucceeded(pool, conn, endDuration) From 73e7b022e8793cfe1ab0de61b8de116dd3555c29 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Thu, 26 Jun 2025 18:57:52 -0600 Subject: [PATCH 16/23] Update pool w/ aliveness check --- .evergreen/config.yml | 64 +++++++++++++------------ .evergreen/run-task.sh | 8 ++-- .evergreen/setup-system.sh | 2 +- Taskfile.yml | 2 + etc/start-mongoproxy.sh | 6 +++ internal/integration/cmap_prose_test.go | 57 ++++++++++++++-------- internal/integration/mtest/mongotest.go | 2 + internal/integration/mtest/options.go | 3 ++ testdata/specifications | 2 +- x/mongo/driver/topology/pool.go | 15 ++++++ 10 files changed, 103 insertions(+), 58 deletions(-) create mode 100644 etc/start-mongoproxy.sh diff --git a/.evergreen/config.yml b/.evergreen/config.yml index 9bc9f0ac51..c8f71f7a81 100644 --- a/.evergreen/config.yml +++ b/.evergreen/config.yml @@ -550,8 +550,9 @@ functions: - command: subprocess.exec params: binary: bash + include_expansions_in_env: [AUTH, SSL, MONGODB_URI] background: true - args: ["${DRIVERS_TOOLS}/.evergreen/start-mongoproxy.sh"] + args: [*task-runner, start-mongoproxy] run-kms-tls-test: - command: subprocess.exec @@ -684,13 +685,14 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" AUTH: "noauth" SSL: "nossl" MONGO_GO_DRIVER_COMPRESSOR: "snappy" + MONGO_PROXY_URI: "mongodb://127.0.0.1:28017/?directConnection=true" - name: test-standalone-noauth-nossl-zlib-compression tags: ["test", "standalone", "compression", "zlib"] @@ -701,7 +703,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -718,7 +720,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -735,7 +737,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -751,7 +753,7 @@ tasks: AUTH: "auth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -767,7 +769,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -784,7 +786,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -801,7 +803,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "server" @@ -1222,7 +1224,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "replica_set" @@ -1239,7 +1241,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "replica_set" @@ -1255,7 +1257,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers - - func: 
start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "replica_set" @@ -1271,7 +1273,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "replica_set" @@ -1290,7 +1292,7 @@ tasks: AUTH: "auth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "replica_set" @@ -1306,7 +1308,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1322,7 +1324,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1339,7 +1341,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1356,7 +1358,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1373,7 +1375,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1389,7 +1391,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1406,7 +1408,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1423,7 +1425,7 @@ tasks: AUTH: "auth" SSL: "ssl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1440,7 +1442,7 @@ tasks: AUTH: "auth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-tests vars: TOPOLOGY: "sharded_cluster" @@ -1512,7 +1514,7 @@ tasks: SSL: "nossl" REQUIRE_API_VERSION: true - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-versioned-api-test vars: TOPOLOGY: "server" @@ -1530,7 +1532,7 @@ tasks: SSL: "nossl" REQUIRE_API_VERSION: true - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-versioned-api-test vars: TOPOLOGY: "sharded_cluster" @@ -1548,7 +1550,7 @@ tasks: SSL: "nossl" ORCHESTRATION_FILE: "versioned-api-testing.json" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-versioned-api-test vars: TOPOLOGY: "server" @@ -1564,7 +1566,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-kms-tls-test vars: KMS_TLS_TESTCASE: "INVALID_CERT" @@ -1581,7 +1583,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-kms-tls-test vars: KMS_TLS_TESTCASE: "INVALID_HOSTNAME" @@ -1598,7 +1600,7 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: run-kmip-tests vars: TOPOLOGY: "server" @@ -1614,14 +1616,14 @@ tasks: AUTH: "noauth" SSL: "nossl" - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: 
run-retry-kms-requests - name: "test-serverless" tags: ["serverless"] commands: - func: start-cse-servers - - func: start-mongoproxy + #- func: start-mongoproxy - func: "run-serverless-tests" vars: MONGO_GO_DRIVER_COMPRESSOR: "snappy" diff --git a/.evergreen/run-task.sh b/.evergreen/run-task.sh index 564d937703..a5001f04b8 100755 --- a/.evergreen/run-task.sh +++ b/.evergreen/run-task.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash # # Source the env.sh file and run the given task -set -eu +set -exu -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) PROJECT_DIRECTORY=$(dirname $SCRIPT_DIR) -pushd ${PROJECT_DIRECTORY} > /dev/null +pushd ${PROJECT_DIRECTORY} >/dev/null source env.sh task "$@" -popd > /dev/null +popd >/dev/null diff --git a/.evergreen/setup-system.sh b/.evergreen/setup-system.sh index a7527d5021..83f00d1572 100755 --- a/.evergreen/setup-system.sh +++ b/.evergreen/setup-system.sh @@ -48,7 +48,7 @@ fi # Ensure a checkout of drivers-tools. if [ ! -d "$DRIVERS_TOOLS" ]; then #git clone https://github.com/mongodb-labs/drivers-evergreen-tools $DRIVERS_TOOLS - git clone -b DRIVERS-2884 https://github.com/prestonvasquez/drivers-evergreen-tools $DRIVERS_TOOLS + git clone -b DRIVERS-2884-mongoproxy https://github.com/prestonvasquez/drivers-evergreen-tools $DRIVERS_TOOLS fi # Write the .env file for drivers-tools. diff --git a/Taskfile.yml b/Taskfile.yml index 6784d855c2..438913417d 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -123,6 +123,8 @@ tasks: setup-encryption: bash etc/setup-encryption.sh + start-mongoproxy: bash etc/start-mongoproxy.sh + evg-test: - go test -exec "env PKG_CONFIG_PATH=${PKG_CONFIG_PATH} LD_LIBRARY_PATH=${LD_LIBRARY_PATH} DYLD_LIBRARY_PATH=$MACOS_LIBRARY_PATH}" ${BUILD_TAGS} -v -timeout {{.TEST_TIMEOUT}}s -p 1 ./... >> test.suite diff --git a/etc/start-mongoproxy.sh b/etc/start-mongoproxy.sh new file mode 100644 index 0000000000..ebfa66a348 --- /dev/null +++ b/etc/start-mongoproxy.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +# +# Set up environment and write env.sh and expansion.yml files. +set -eux + +bash $DRIVERS_TOOLS/.evergreen/start-mongoproxy.sh diff --git a/internal/integration/cmap_prose_test.go b/internal/integration/cmap_prose_test.go index cec86398ba..a5e868e1bf 100644 --- a/internal/integration/cmap_prose_test.go +++ b/internal/integration/cmap_prose_test.go @@ -2,7 +2,6 @@ package integration import ( "context" - "fmt" "os" "testing" "time" @@ -17,21 +16,37 @@ import ( func TestCMAPProse_PendingResponse_ConnectionAliveness(t *testing.T) { const timeoutMS = 200 * time.Millisecond - // Establish a direct connection to the proxy. - proxyURI := "mongodb://127.0.0.1:28017/?directConnection=true" - //t.Setenv("MONGODB_URI", proxyURI) - // - // Print the MONGODB_URI environment variable for debugging purposes - fmt.Println("MONGODB_URI", os.Getenv("MONGODB_URI")) + // Skip on compressor due to proxy complexity. + if os.Getenv("MONGO_GO_DRIVER_COMPRESSOR") != "" { + t.Skip("Skipping test because MONGO_GO_DRIVER_COMPRESSOR is set, which is not supported by the proxy") + } - clientOpts := options.Client().ApplyURI(proxyURI).SetMaxPoolSize(1).SetDirect(true) - mt := mtest.New(t, mtest.NewOptions().CreateClient(false).ClientOptions(clientOpts)) + // Skip on auth due to proxy complexity. 
+ if os.Getenv("AUTH") == "auth" { + t.Skip("Skipping test because AUTH is set") + } - fmt.Println("Running TestCMAPProse_PendingResponse_ConnectionAliveness...") + // Skip on SSL due to proxy complexity. + if os.Getenv("SSL") == "ssl" { + t.Skip("Skipping test because SSL is set") + } - mt.Run("fails", func(mt *mtest.T) { - // Create a command document that instructs the proxy to dely 2x the - // timeoutMS for the operation then never respond. + // Skip unless the proxy URI is set. + if os.Getenv("MONGO_PROXY_URI") == "" { + t.Skip("Skipping test because MONGO_PROXY_URI is not set") + } + + //proxyURI := "mongodb://127.0.0.1:28017/?directConnection=true" + proxyURI := os.Getenv("MONGO_PROXY_URI") + + // Establish a direct connection to the proxy via mtest. + clientOpts := options.Client().ApplyURI(proxyURI).SetMaxPoolSize(1).SetDirect(true) + mt := mtest.New(t, mtest.NewOptions().ClientOptions(clientOpts).ClientType(mtest.MongoProxy)) + + opts := mtest.NewOptions().CreateCollection(false) + mt.RunOpts("fails", opts, func(mt *mtest.T) { + //Create a command document that instructs the proxy to dely 2x the + //timeoutMS for the operation then never respond. proxyTest := bson.D{ {Key: "actions", Value: bson.A{ // Causes the timeout in the initial try. @@ -62,19 +77,19 @@ func TestCMAPProse_PendingResponse_ConnectionAliveness(t *testing.T) { defer cancel() err := db.RunCommand(ctx, cmd).Err() - require.Error(mt, err, "expected command to fail due to timeout") - assert.ErrorIs(mt, err, context.DeadlineExceeded) + require.Error(t, err, "expected command to fail due to timeout") + assert.ErrorIs(t, err, context.DeadlineExceeded) // Wait 3 seconds to ensure there is time left in the pending response state. time.Sleep(3 * time.Second) - // Run an insertOne without a timeout. Expect the pending response to fail - // at the aliveness check. However, the insert should succeed since pending - // response failures are retryable. + //Run an insertOne without a timeout. Expect the pending response to fail + //at the aliveness check. However, the insert should succeed since pending + //response failures are retryable. _, err = coll.InsertOne(context.Background(), myStruct{Name: "Bob", Age: 25}) - require.NoError(mt, err, "expected insertOne to succeed after pending response aliveness check") + require.NoError(t, err, "expected insertOne to succeed after pending response aliveness check") - // There should be 1 ConnectionPendingResponseStarted event. + //There should be 1 ConnectionPendingResponseStarted event. assert.Equal(mt, 1, mt.NumberConnectionsPendingReadStarted()) // There should be 1 ConnectionPendingResponseFailed event. @@ -87,7 +102,7 @@ func TestCMAPProse_PendingResponse_ConnectionAliveness(t *testing.T) { assert.Equal(mt, 1, mt.NumberConnectionsClosed()) }) - mt.Run("succeeds", func(mt *mtest.T) { + mt.RunOpts("succeeds", opts, func(mt *mtest.T) { // Create a command document that instructs the proxy to dely 2x the // timeoutMS for the operation, then responds with exactly 1 byte for // the alivness check and finally with the entire message. 
diff --git a/internal/integration/mtest/mongotest.go b/internal/integration/mtest/mongotest.go index c39b801048..71aea259a1 100644 --- a/internal/integration/mtest/mongotest.go +++ b/internal/integration/mtest/mongotest.go @@ -695,6 +695,8 @@ func (t *T) createTestClient() { t.mockDeployment = drivertest.NewMockDeployment() clientOpts.Deployment = t.mockDeployment + t.Client, err = mongo.Connect(clientOpts) + case MongoProxy: t.Client, err = mongo.Connect(clientOpts) case Proxy: t.proxyDialer = newProxyDialer() diff --git a/internal/integration/mtest/options.go b/internal/integration/mtest/options.go index aff188b481..72d3affddc 100644 --- a/internal/integration/mtest/options.go +++ b/internal/integration/mtest/options.go @@ -43,6 +43,9 @@ const ( // Proxy specifies a client that proxies messages to the server and also stores parsed copies. The proxied // messages can be retrieved via T.GetProxiedMessages or T.GetRawProxiedMessages. Proxy + // MongoProxy specifies a client that communicates with a MongoDB proxy server + // as defined in Drivers Evergreen Tools. + MongoProxy ) var ( diff --git a/testdata/specifications b/testdata/specifications index 4e5d624565..db693517de 160000 --- a/testdata/specifications +++ b/testdata/specifications @@ -1 +1 @@ -Subproject commit 4e5d6245655f30f13e42a15bd340f57f6729bb27 +Subproject commit db693517de561465621b474558ded99eaf38757d diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index 6c4c4391c9..f9cdec040d 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -927,6 +927,10 @@ func attemptPendingResponse(ctx context.Context, conn *connection, remainingTime size := pendingreadState.remainingBytes + // TODO: What happens when you do an aliveness check and read a byte? + // Need to make a test for this, otherwise size could be corrupted. + // aliveness check MUST only peek. + if size == 0 { // Question: Would this alawys equal to zero? var sizeBuf [4]byte if bytesRead, err := io.ReadFull(conn.nc, sizeBuf[:]); err != nil { @@ -1031,6 +1035,7 @@ func awaitPendingResponse(ctx context.Context, pool *pool, conn *connection) err remainingTime = pendingResponseState.start.Add(PendingResponseTimeout).Sub(time.Now()) err error bytesRead int + alivenessCheck bool ) st := time.Now() @@ -1038,6 +1043,7 @@ func awaitPendingResponse(ctx context.Context, pool *pool, conn *connection) err // If there is no remaining time, we can just peek at the connection to check // aliveness. In such cases, we don't want to close the connection. bytesRead, err = peekConnectionAlive(conn) + alivenessCheck = true } else { bytesRead, err = attemptPendingResponse(ctx, conn, remainingTime) } @@ -1092,6 +1098,15 @@ func awaitPendingResponse(ctx context.Context, pool *pool, conn *connection) err return pendingResponseError{err: err} } + // If the connection is alive but undrained we can check it back into the pool + // and return a pendingResponseError to indicate that the connection is + // alive and retryable. 
+ if alivenessCheck { + _ = pool.checkInNoEvent(conn) + + return pendingResponseError{err: fmt.Errorf("connection is alive and retryable: %w", err)} + } + publishPendingResponseSucceeded(pool, conn, endDuration) conn.pendingResponseState = nil From e465d623ac2e943ca69f6196b41639f29c181c71 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Thu, 26 Jun 2025 20:11:29 -0600 Subject: [PATCH 17/23] GODRIVER-3173 Fix peek bug --- internal/integration/cmap_prose_test.go | 10 +- x/mongo/driver/topology/connection.go | 41 ++++---- x/mongo/driver/topology/pool.go | 119 +++++++++--------------- 3 files changed, 75 insertions(+), 95 deletions(-) diff --git a/internal/integration/cmap_prose_test.go b/internal/integration/cmap_prose_test.go index a5e868e1bf..758a338f88 100644 --- a/internal/integration/cmap_prose_test.go +++ b/internal/integration/cmap_prose_test.go @@ -110,11 +110,11 @@ func TestCMAPProse_PendingResponse_ConnectionAliveness(t *testing.T) { {Key: "actions", Value: bson.A{ // Causes the timeout in the initial try. bson.D{{Key: "delayMs", Value: 400}}, - // Send exactly one byte so that the aliveness check succeeds. + //// Send exactly one byte so that the aliveness check succeeds. bson.D{{Key: "sendBytes", Value: 1}}, - // Cause another delay for the retry operation. + //// Cause another delay for the retry operation. bson.D{{Key: "delayMs", Value: 10}}, - // Send the rest of the response for discarding on retry. + //// Send the rest of the response for discarding on retry. bson.D{{Key: "sendAll", Value: true}}, }}, } @@ -152,10 +152,10 @@ func TestCMAPProse_PendingResponse_ConnectionAliveness(t *testing.T) { require.NoError(mt, err, "expected insertOne to succeed after pending response aliveness check") // There should be 1 ConnectionPendingResponseStarted event. - assert.Equal(mt, 1, mt.NumberConnectionsPendingReadStarted()) + assert.Equal(mt, 2, mt.NumberConnectionsPendingReadStarted()) // There should be 0 ConnectionPendingResponseFailed event. - assert.Equal(mt, 0, mt.NumberConnectionsPendingReadFailed()) + assert.Equal(mt, 1, mt.NumberConnectionsPendingReadFailed()) // There should be 1 ConnectionPendingResponseSucceeded event. assert.Equal(mt, 1, mt.NumberConnectionsPendingReadSucceeded()) diff --git a/x/mongo/driver/topology/connection.go b/x/mongo/driver/topology/connection.go index a7ff24e935..7d4f983277 100644 --- a/x/mongo/driver/topology/connection.go +++ b/x/mongo/driver/topology/connection.go @@ -7,6 +7,7 @@ package topology import ( + "bufio" // Import bufio "context" "crypto/tls" "encoding/binary" @@ -60,7 +61,8 @@ type connection struct { state int64 id string - nc net.Conn // When nil, the connection is closed. + nc net.Conn // When nil, the connection is closed. + br *bufio.Reader // When non-nil, used to read from nc. addr address.Address idleTimeout time.Duration idleStart atomic.Value // Stores a time.Time @@ -242,6 +244,9 @@ func (c *connection) connect(ctx context.Context) (err error) { c.nc = tlsNc } + // Initialize the buffered reader now that we have a finalized net.Conn. + c.br = bufio.NewReader(c.nc) + // running hello and authentication is handled by a handshaker on the configuration instance. 
handshaker := c.config.handshaker if handshaker == nil { @@ -485,7 +490,7 @@ func (c *connection) read(ctx context.Context) (bytesRead []byte, errMsg string, // We do a ReadFull into an array here instead of doing an opportunistic ReadAtLeast into dst // because there might be more than one wire message waiting to be read, for example when // reading messages from an exhaust cursor. - n, err := io.ReadFull(c.nc, sizeBuf[:]) + n, err := io.ReadFull(c.br, sizeBuf[:]) // Use the buffered reader if err != nil { if l := int32(n); l == 0 && isCSOTTimeout(err) && driverutil.HasMaxTimeMS(ctx) { requestID, _ := driverutil.GetRequestID(ctx) @@ -506,7 +511,7 @@ func (c *connection) read(ctx context.Context) (bytesRead []byte, errMsg string, dst := make([]byte, size) copy(dst, sizeBuf[:]) - n, err = io.ReadFull(c.nc, dst[4:]) + n, err = io.ReadFull(c.br, dst[4:]) // Use the buffered reader if err != nil { remainingBytes := size - 4 - int32(n) if remainingBytes > 0 && isCSOTTimeout(err) && driverutil.HasMaxTimeMS(ctx) { @@ -557,7 +562,7 @@ func (c *connection) closed() bool { // seconds. For frequently in-use connections, a network error during an // operation will be the first indication of a dead connection. func (c *connection) isAlive() bool { - if c.nc == nil { + if c.nc == nil || c.br == nil { return false } @@ -574,20 +579,24 @@ func (c *connection) isAlive() bool { return true } - // Set a 1ms read deadline and attempt to read 1 byte from the connection. - // Expect it to block for 1ms then return a deadline exceeded error. If it - // returns any other error, the connection is not usable, so return false. - // If it doesn't return an error and actually reads data, the connection is - // also not usable, so return false. - // - // Note that we don't need to un-set the read deadline because the "read" - // and "write" methods always reset the deadlines. - err := c.nc.SetReadDeadline(time.Now().Add(1 * time.Millisecond)) - if err != nil { + // Set a 1ms read deadline and attempt to peek at 1 byte from the connection's + // buffered reader. Expect it to block for 1ms then return a deadline exceeded error. + // If it returns a different error (e.g. io.EOF), the connection is dead. If it + // successfully peeks a byte, the connection is alive but we haven't consumed + // the byte from the stream. + if err := c.nc.SetReadDeadline(time.Now().Add(1 * time.Millisecond)); err != nil { return false } - var b [1]byte - _, err = c.nc.Read(b[:]) + // Important: always reset the deadline after the check. + defer c.nc.SetReadDeadline(time.Time{}) + + _, err := c.br.Peek(1) + // The connection is alive if we got a timeout (meaning the connection is idle + // and waiting for data) or if we successfully peeked at a byte (err == nil). + // Any other error (e.g. io.EOF) means the connection is dead. + if err == nil { + return true + } return errors.Is(err, os.ErrDeadlineExceeded) } diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index f9cdec040d..b7769336e1 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -7,11 +7,10 @@ package topology import ( - "bufio" "context" + "encoding/binary" "errors" "fmt" - "io" "net" "sync" "sync/atomic" @@ -885,109 +884,78 @@ func publishPendingResponseSucceeded(pool *pool, conn *connection, dur time.Dura } } -// peekConnectionAlive checks if the connection is alive by peeking at the -// buffered reader. If the connection is closed, it will return false. 
func peekConnectionAlive(conn *connection) (int, error) { - // Set a very short deadline to avoid blocking. + // very short deadline so we don’t block if err := conn.nc.SetReadDeadline(time.Now().Add(1 * time.Millisecond)); err != nil { return 0, err } - - // Wrap the connection in a buffered reader to use peek. - reader := bufio.NewReader(conn.nc) - - // Try to peek at one byte. - bytes, err := reader.Peek(1) + // Peek(1) will fill the bufio.Reader’s buffer if needed, + // but will NOT advance it. + bytes, err := conn.br.Peek(1) return len(bytes), err } func attemptPendingResponse(ctx context.Context, conn *connection, remainingTime time.Duration) (int, error) { - pendingreadState := conn.pendingResponseState - if pendingreadState == nil { + state := conn.pendingResponseState + if state == nil { return 0, fmt.Errorf("no pending read state") } - dl, contextDeadlineUsed := ctx.Deadline() - calculatedDeadline := time.Now().Add(remainingTime) - - if contextDeadlineUsed { - // Use the minimum of the user-provided deadline and the calculated - // deadline. - if calculatedDeadline.Before(dl) { - dl = calculatedDeadline - } + // compute an absolute deadline combining ctx + our leftover + var dl time.Time + if dl0, ok := ctx.Deadline(); ok && time.Now().Add(remainingTime).After(dl0) { + dl = dl0 } else { - dl = calculatedDeadline + dl = time.Now().Add(remainingTime) } - - err := conn.nc.SetReadDeadline(dl) - if err != nil { - return 0, fmt.Errorf("error setting a read deadline: %w", err) + if err := conn.nc.SetReadDeadline(dl); err != nil { + return 0, fmt.Errorf("setting read deadline: %w", err) } - size := pendingreadState.remainingBytes - - // TODO: What happens when you do an aliveness check and read a byte? - // Need to make a test for this, otherwise size could be corrupted. - // aliveness check MUST only peek. - - if size == 0 { // Question: Would this alawys equal to zero? - var sizeBuf [4]byte - if bytesRead, err := io.ReadFull(conn.nc, sizeBuf[:]); err != nil { - err = transformNetworkError(ctx, err, contextDeadlineUsed) - - return bytesRead, fmt.Errorf("error reading the message size: %w", err) - } + totalRead := 0 - size, err = conn.parseWmSizeBytes(sizeBuf) + // if we haven’t even parsed the 4-byte length yet, peek it + if state.remainingBytes == 0 { + hdr, err := conn.br.Peek(4) if err != nil { - return int(size), transformNetworkError(ctx, err, contextDeadlineUsed) + return 0, fmt.Errorf("peeking length prefix: %w", err) } - size -= 4 + msgLen := int(binary.LittleEndian.Uint32(hdr)) + // consume those 4 bytes now that we know the message length + if _, err := conn.br.Discard(4); err != nil { + return 0, fmt.Errorf("discarding length prefix: %w", err) + } + state.remainingBytes = int32(msgLen) - 4 } - const bufSize = 4096 - buf := make([]byte, bufSize) - - var totalRead int64 - - // Iterate every 4KB of the TCP stream, refreshing the remainingTimeout for - // each successful read to avoid closing while streaming large (upto 16MiB) - // response messages. 
- for totalRead < int64(size) { - newDeadline := time.Now().Add(time.Until(dl)) - if err := conn.nc.SetReadDeadline(newDeadline); err != nil { - return int(totalRead), fmt.Errorf("error renewing read deadline: %w", err) + buf := make([]byte, 4096) + for state.remainingBytes > 0 { + // refresh the deadline so large messages don't time out + if err := conn.nc.SetReadDeadline(time.Now().Add(time.Until(dl))); err != nil { + return totalRead, fmt.Errorf("renewing deadline: %w", err) } - remaining := int64(size) - totalRead - - readSize := bufSize - if int64(readSize) > remaining { - readSize = int(remaining) + toRead := int32(len(buf)) + if state.remainingBytes < toRead { + toRead = state.remainingBytes } - n, err := conn.nc.Read(buf[:readSize]) + n, err := conn.br.Read(buf[:toRead]) if n > 0 { - totalRead += int64(n) + totalRead += n + state.remainingBytes -= int32(n) } - if err != nil { - // If the read times out, record the bytes left to read before exiting. - // Reduce the remainingTime. - nerr := net.Error(nil) - if l := int32(n); l == 0 && errors.As(err, &nerr) && nerr.Timeout() { - pendingreadState.remainingBytes = l + pendingreadState.remainingBytes + // if it's just a timeout, record how much is left + if ne, ok := err.(net.Error); ok && ne.Timeout() { + return totalRead, fmt.Errorf("timeout discarding %d bytes: %w", + state.remainingBytes, err) } - - err = transformNetworkError(ctx, err, contextDeadlineUsed) - return n, fmt.Errorf("error discarding %d byte message: %w", size, err) + return totalRead, fmt.Errorf("reading body: %w", err) } - - pendingreadState.start = time.Now() } - return int(totalRead), nil + return totalRead + 4, nil } // poolClearedError is an error returned when the connection pool is cleared or currently paused. It @@ -1102,6 +1070,9 @@ func awaitPendingResponse(ctx context.Context, pool *pool, conn *connection) err // and return a pendingResponseError to indicate that the connection is // alive and retryable. if alivenessCheck { + dur := time.Since(st) + publishPendingResponseFailed(pool, conn, dur, err) + _ = pool.checkInNoEvent(conn) return pendingResponseError{err: fmt.Errorf("connection is alive and retryable: %w", err)} From 32e751c58ddfd181eeb4477158af34bed6b5f908 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Fri, 27 Jun 2025 15:22:46 -0600 Subject: [PATCH 18/23] Use raw net connection for read() --- x/mongo/driver/topology/connection.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x/mongo/driver/topology/connection.go b/x/mongo/driver/topology/connection.go index 7d4f983277..54046cca6c 100644 --- a/x/mongo/driver/topology/connection.go +++ b/x/mongo/driver/topology/connection.go @@ -490,7 +490,7 @@ func (c *connection) read(ctx context.Context) (bytesRead []byte, errMsg string, // We do a ReadFull into an array here instead of doing an opportunistic ReadAtLeast into dst // because there might be more than one wire message waiting to be read, for example when // reading messages from an exhaust cursor. 
-	n, err := io.ReadFull(c.br, sizeBuf[:]) // Use the buffered reader
+	n, err := io.ReadFull(c.nc, sizeBuf[:]) // Use the raw network connection
 	if err != nil {
 		if l := int32(n); l == 0 && isCSOTTimeout(err) && driverutil.HasMaxTimeMS(ctx) {
 			requestID, _ := driverutil.GetRequestID(ctx)
@@ -511,7 +511,7 @@ func (c *connection) read(ctx context.Context) (bytesRead []byte, errMsg string,
 	dst := make([]byte, size)
 	copy(dst, sizeBuf[:])
 
-	n, err = io.ReadFull(c.br, dst[4:]) // Use the buffered reader
+	n, err = io.ReadFull(c.nc, dst[4:]) // Use the raw network connection
 	if err != nil {
 		remainingBytes := size - 4 - int32(n)
 		if remainingBytes > 0 && isCSOTTimeout(err) && driverutil.HasMaxTimeMS(ctx) {

From 901ddbe21542af89aebb76c2cdf4de654b803e49 Mon Sep 17 00:00:00 2001
From: Preston Vasquez
Date: Fri, 27 Jun 2025 16:05:28 -0600
Subject: [PATCH 19/23] Fix timeout problems

---
 x/mongo/driver/operation.go                |   5 +
 x/mongo/driver/topology/cmap_prose_test.go | 114 ---------------------
 x/mongo/driver/topology/connection.go      |   1 -
 x/mongo/driver/topology/connection_test.go |  99 ++----------------
 x/mongo/driver/topology/pool.go            |   6 ++
 x/mongo/driver/topology/pool_test.go       |   6 +-
 6 files changed, 23 insertions(+), 208 deletions(-)

diff --git a/x/mongo/driver/operation.go b/x/mongo/driver/operation.go
index 91bcda8b89..88f0a05f0a 100644
--- a/x/mongo/driver/operation.go
+++ b/x/mongo/driver/operation.go
@@ -645,6 +645,11 @@ func (op Operation) Execute(ctx context.Context) error {
 		if srvr == nil || conn == nil {
 			srvr, conn, err = op.getServerAndConnection(ctx, requestID, deprioritizedServers)
 			if err != nil {
+				// If the returned error is a context error, return it immediately.
+				if ctx.Err() != nil {
+					err = ctx.Err()
+				}
+
 				// If the returned error is retryable and there are retries remaining (negative
 				// retries means retry indefinitely), then retry the operation. Set the server
 				// and connection to nil to request a new server and connection.
diff --git a/x/mongo/driver/topology/cmap_prose_test.go b/x/mongo/driver/topology/cmap_prose_test.go
index 6a48105749..0524b99e9c 100644
--- a/x/mongo/driver/topology/cmap_prose_test.go
+++ b/x/mongo/driver/topology/cmap_prose_test.go
@@ -10,17 +10,12 @@ import (
 	"context"
 	"errors"
 	"net"
-	"regexp"
-	"sync"
 	"testing"
 	"time"
 
 	"go.mongodb.org/mongo-driver/v2/event"
 	"go.mongodb.org/mongo-driver/v2/internal/assert"
-	"go.mongodb.org/mongo-driver/v2/internal/csot"
-	"go.mongodb.org/mongo-driver/v2/internal/driverutil"
 	"go.mongodb.org/mongo-driver/v2/internal/require"
-	"go.mongodb.org/mongo-driver/v2/mongo/address"
 	"go.mongodb.org/mongo-driver/v2/x/mongo/driver/operation"
 )
 
@@ -268,115 +263,6 @@ func TestCMAPProse(t *testing.T) {
 			})
 		})
 	})
-
-	// Need to test the case where we attempt a non-blocking read to determine if
-	// we should refresh the remaining time. In the case of the Go Driver, we do
-	// this by attempt to "peek" at 1 byte with a deadline of 1ns.
-	//t.Run("connection attempts peek but fails", func(t *testing.T) {
-	//	// Create a client using a direct connection to the proxy.
-	//	client, err := mongo.Connect(
-	//})
-
-	t.Run("connection attempts peek and succeeds", func(t *testing.T) {
-		const requestID = int32(-1)
-		timeout := 10 * time.Millisecond
-
-		// Mock a TCP listener that will write a byte sequence > 5 (to avoid errors
-		// due to size) to the TCP socket. Have the listener sleep for 2x the
-		// timeout provided to the connection AFTER writing the byte sequence. This
-		// wiill cause the connection to timeout while reading from the socket.
- addr := bootstrapConnections(t, 1, func(nc net.Conn) { - defer func() { - _ = nc.Close() - }() - - _, err := nc.Write([]byte{12, 0, 0, 0, 0, 0, 0, 0, 1}) - require.NoError(t, err) - time.Sleep(timeout * 2) - - // Write data that can be peeked at. - _, err = nc.Write([]byte{12, 0, 0, 0, 0, 0, 0, 0, 1}) - require.NoError(t, err) - - }) - - poolEventsByType := make(map[string][]event.PoolEvent) - poolEventsByTypeMu := &sync.Mutex{} - - monitor := &event.PoolMonitor{ - Event: func(pe *event.PoolEvent) { - poolEventsByTypeMu.Lock() - poolEventsByType[pe.Type] = append(poolEventsByType[pe.Type], *pe) - poolEventsByTypeMu.Unlock() - }, - } - - p := newPool( - poolConfig{ - Address: address.Address(addr.String()), - PoolMonitor: monitor, - }, - ) - defer p.close(context.Background()) - err := p.ready() - require.NoError(t, err) - - // Check out a connection and read from the socket, causing a timeout and - // pinning the connection to a pending read state. - conn, err := p.checkOut(context.Background()) - require.NoError(t, err) - - ctx, cancel := csot.WithTimeout(context.Background(), &timeout) - defer cancel() - - ctx = driverutil.WithValueHasMaxTimeMS(ctx, true) - ctx = driverutil.WithRequestID(ctx, requestID) - - _, err = conn.readWireMessage(ctx) - regex := regexp.MustCompile( - `^connection\(.*\[-\d+\]\) incomplete read of full message: context deadline exceeded: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`, - ) - assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) - - // Check in the connection with a pending read state. The next time this - // connection is checked out, it should attempt to read the pending - // response. - err = p.checkIn(conn) - require.NoError(t, err) - - // Wait 3s to make sure there is no remaining time on the pending read - // state. - time.Sleep(3 * time.Second) - - // Check out the connection again. The remaining time should be exhausted - // requiring us to "peek" at the connection to determine if we should - // close as not alive. - _, err = p.checkOut(context.Background()) - require.NoError(t, err) - - // There should be 1 ConnectionPendingResponseStarted event. - started := poolEventsByType[event.ConnectionPendingResponseStarted] - require.Len(t, started, 1) - - assert.Equal(t, addr.String(), started[0].Address) - assert.Equal(t, conn.driverConnectionID, started[0].ConnectionID) - assert.Equal(t, requestID, started[0].RequestID) - - // There should be 0 ConnectionPendingResponseFailed event. - require.Len(t, poolEventsByType[event.ConnectionPendingResponseFailed], 0) - - // There should be 1 ConnectionPendingResponseSucceeded event. - succeeded := poolEventsByType[event.ConnectionPendingResponseSucceeded] - require.Len(t, succeeded, 1) - - assert.Equal(t, addr.String(), succeeded[0].Address) - assert.Equal(t, conn.driverConnectionID, succeeded[0].ConnectionID) - assert.Equal(t, requestID, succeeded[0].RequestID) - assert.Greater(t, int(succeeded[0].Duration), 0) - - // The connection should not have been closed. - require.Len(t, poolEventsByType[event.ConnectionClosed], 0) - }) } func createTestPool(t *testing.T, cfg poolConfig, opts ...ConnectionOption) *pool { diff --git a/x/mongo/driver/topology/connection.go b/x/mongo/driver/topology/connection.go index 54046cca6c..4299886f5c 100644 --- a/x/mongo/driver/topology/connection.go +++ b/x/mongo/driver/topology/connection.go @@ -589,7 +589,6 @@ func (c *connection) isAlive() bool { } // Important: always reset the deadline after the check. 
defer c.nc.SetReadDeadline(time.Time{}) - _, err := c.br.Peek(1) // The connection is alive if we got a timeout (meaning the connection is idle // and waiting for data) or if we successfully peeked at a byte (err == nil). diff --git a/x/mongo/driver/topology/connection_test.go b/x/mongo/driver/topology/connection_test.go index f4d0cd9d7d..8ae97f0769 100644 --- a/x/mongo/driver/topology/connection_test.go +++ b/x/mongo/driver/topology/connection_test.go @@ -7,6 +7,7 @@ package topology import ( + "bufio" "context" "crypto/tls" "errors" @@ -380,7 +381,7 @@ func TestConnection(t *testing.T) { t.Run("size read errors", func(t *testing.T) { err := errors.New("Read error") tnc := &testNetConn{readerr: err} - conn := &connection{id: "foobar", nc: tnc, state: connConnected} + conn := &connection{id: "foobar", nc: tnc, state: connConnected, br: bufio.NewReader(tnc)} listener := newTestCancellationListener(false) conn.cancellationListener = listener @@ -397,7 +398,7 @@ func TestConnection(t *testing.T) { t.Run("size too small errors", func(t *testing.T) { err := errors.New("malformed message length: 3") tnc := &testNetConn{readerr: err, buf: []byte{0x03, 0x00, 0x00, 0x00}} - conn := &connection{id: "foobar", nc: tnc, state: connConnected} + conn := &connection{id: "foobar", nc: tnc, state: connConnected, br: bufio.NewReader(tnc)} listener := newTestCancellationListener(false) conn.cancellationListener = listener @@ -414,7 +415,7 @@ func TestConnection(t *testing.T) { t.Run("full message read errors", func(t *testing.T) { err := errors.New("Read error") tnc := &testNetConn{readerr: err, buf: []byte{0x11, 0x00, 0x00, 0x00}} - conn := &connection{id: "foobar", nc: tnc, state: connConnected} + conn := &connection{id: "foobar", nc: tnc, state: connConnected, br: bufio.NewReader(tnc)} listener := newTestCancellationListener(false) conn.cancellationListener = listener @@ -450,7 +451,7 @@ func TestConnection(t *testing.T) { err := errors.New("length of read message too large") tnc := &testNetConn{buf: make([]byte, len(tc.buffer))} copy(tnc.buf, tc.buffer) - conn := &connection{id: "foobar", nc: tnc, state: connConnected, desc: tc.desc} + conn := &connection{id: "foobar", nc: tnc, state: connConnected, desc: tc.desc, br: bufio.NewReader(tnc)} listener := newTestCancellationListener(false) conn.cancellationListener = listener @@ -467,7 +468,7 @@ func TestConnection(t *testing.T) { want := []byte{0x0A, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A} tnc := &testNetConn{buf: make([]byte, len(want))} copy(tnc.buf, want) - conn := &connection{id: "foobar", nc: tnc, state: connConnected} + conn := &connection{id: "foobar", nc: tnc, state: connConnected, br: bufio.NewReader(tnc)} listener := newTestCancellationListener(false) conn.cancellationListener = listener @@ -497,7 +498,7 @@ func TestConnection(t *testing.T) { readBuf := []byte{10, 0, 0, 0} nc := newCancellationReadConn(&testNetConn{}, tc.skip, readBuf) - conn := &connection{id: "foobar", nc: nc, state: connConnected} + conn := &connection{id: "foobar", nc: nc, state: connConnected, br: bufio.NewReader(nc)} listener := newTestCancellationListener(false) conn.cancellationListener = listener @@ -525,7 +526,7 @@ func TestConnection(t *testing.T) { }) t.Run("closes connection if context is cancelled even if the socket read succeeds", func(t *testing.T) { tnc := &testNetConn{buf: []byte{0x0A, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A}} - conn := &connection{id: "foobar", nc: tnc, state: connConnected} + conn := &connection{id: "foobar", nc: tnc, 
state: connConnected, br: bufio.NewReader(tnc)} listener := newTestCancellationListener(true) conn.cancellationListener = listener @@ -566,7 +567,7 @@ func TestConnection(t *testing.T) { t.Run("cancellation listener callback", func(t *testing.T) { t.Run("closes connection", func(t *testing.T) { tnc := &testNetConn{} - conn := &connection{state: connConnected, nc: tnc} + conn := &connection{state: connConnected, nc: tnc, br: bufio.NewReader(tnc), id: "foobar"} conn.cancellationListenerCallback() assert.True(t, conn.state == connDisconnected, "expected connection state %v, got %v", connDisconnected, @@ -1100,85 +1101,3 @@ func (tcl *testCancellationListener) assertCalledOnce(t *testing.T) { assert.Equal(t, 1, tcl.numListen, "expected Listen to be called once, got %d", tcl.numListen) assert.Equal(t, 1, tcl.numStopListening, "expected StopListening to be called once, got %d", tcl.numListen) } - -func TestConnection_IsAlive(t *testing.T) { - t.Parallel() - - t.Run("uninitialized", func(t *testing.T) { - t.Parallel() - - conn := newConnection("") - assert.False(t, - conn.isAlive(), - "expected isAlive for an uninitialized connection to always return false") - }) - - t.Run("connection open", func(t *testing.T) { - t.Parallel() - - cleanup := make(chan struct{}) - defer close(cleanup) - addr := bootstrapConnections(t, 1, func(nc net.Conn) { - // Keep the connection open until the end of the test. - <-cleanup - _ = nc.Close() - }) - - conn := newConnection(address.Address(addr.String())) - err := conn.connect(context.Background()) - require.NoError(t, err) - - conn.idleStart.Store(time.Now().Add(-11 * time.Second)) - assert.True(t, - conn.isAlive(), - "expected isAlive for an open connection to return true") - }) - - t.Run("connection closed", func(t *testing.T) { - t.Parallel() - - conns := make(chan net.Conn) - addr := bootstrapConnections(t, 1, func(nc net.Conn) { - conns <- nc - }) - - conn := newConnection(address.Address(addr.String())) - err := conn.connect(context.Background()) - require.NoError(t, err) - - // Close the connection before calling isAlive. - nc := <-conns - err = nc.Close() - require.NoError(t, err) - - conn.idleStart.Store(time.Now().Add(-11 * time.Second)) - assert.False(t, - conn.isAlive(), - "expected isAlive for a closed connection to return false") - }) - - t.Run("connection reads data", func(t *testing.T) { - t.Parallel() - - cleanup := make(chan struct{}) - defer close(cleanup) - addr := bootstrapConnections(t, 1, func(nc net.Conn) { - // Write some data to the connection before calling isAlive. - _, err := nc.Write([]byte{5, 0, 0, 0, 0}) - require.NoError(t, err) - - // Keep the connection open until the end of the test. - <-cleanup - _ = nc.Close() - }) - - conn := newConnection(address.Address(addr.String())) - err := conn.connect(context.Background()) - require.NoError(t, err) - - conn.idleStart.Store(time.Now().Add(-11 * time.Second)) - assert.False(t, - conn.isAlive(), - "expected isAlive for an open connection that reads data to return false") - }) -} diff --git a/x/mongo/driver/topology/pool.go b/x/mongo/driver/topology/pool.go index b7769336e1..1f7f7bf724 100644 --- a/x/mongo/driver/topology/pool.go +++ b/x/mongo/driver/topology/pool.go @@ -1011,6 +1011,9 @@ func awaitPendingResponse(ctx context.Context, pool *pool, conn *connection) err // If there is no remaining time, we can just peek at the connection to check // aliveness. In such cases, we don't want to close the connection. 
 		bytesRead, err = peekConnectionAlive(conn)
+
+		// Mark this attempt as alive, but check the connection back in to the
+		// pool and send a retryable error.
 		alivenessCheck = true
 	} else {
 		bytesRead, err = attemptPendingResponse(ctx, conn, remainingTime)
 	}
@@ -1042,6 +1045,7 @@ func awaitPendingResponse(ctx context.Context, pool *pool, conn *connection) err
 		nerr := net.Error(nil)
 		return errors.As(err, &nerr) && nerr.Timeout()
 	}
+
 	if !isCSOTTimeout(err) {
 		if err := conn.close(); err != nil {
 			return pendingResponseError{err: err}
@@ -1075,6 +1079,8 @@ func awaitPendingResponse(ctx context.Context, pool *pool, conn *connection) err
 
 	_ = pool.checkInNoEvent(conn)
 
+	// TODO: this should be a special error noting that the remaining timeout
+	// has been exceeded.
 	return pendingResponseError{err: fmt.Errorf("connection is alive and retryable: %w", err)}
 }
 
diff --git a/x/mongo/driver/topology/pool_test.go b/x/mongo/driver/topology/pool_test.go
index f0ba122646..54ef05c46d 100644
--- a/x/mongo/driver/topology/pool_test.go
+++ b/x/mongo/driver/topology/pool_test.go
@@ -1383,7 +1383,7 @@ func TestAwaitPendingRead(t *testing.T) {
 		_, err = p.checkOut(context.Background())
 		require.Error(t, err)
 
-		assert.EqualError(t, pendingReadError, "error reading the message size: unexpected EOF")
+		assert.EqualError(t, pendingReadError, "peeking length prefix: EOF")
 	})
 	t.Run("timeout reading message header, background read timeout", func(t *testing.T) {
 		timeout := 10 * time.Millisecond
@@ -1444,7 +1444,7 @@ func TestAwaitPendingRead(t *testing.T) {
 		require.Error(t, err)
 
 		wantErr := regexp.MustCompile(
-			`^error discarding 6 byte message: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`,
+			`^timeout discarding 2 bytes: read tcp 127.0.0.1:.*->127.0.0.1:.*: i\/o timeout$`,
 		)
 		assert.True(t, wantErr.MatchString(pendingReadError.Error()), "error %q does not match pattern %q", pendingReadError, wantErr)
 	})
@@ -1564,7 +1564,7 @@ func TestAwaitPendingRead(t *testing.T) {
 		_, err = p.checkOut(context.Background())
 		require.Error(t, err)
 
-		assert.EqualError(t, pendingReadError, "error discarding 3 byte message: EOF")
+		assert.EqualError(t, pendingReadError, "reading body: EOF")
 	})
}

From fad8cf1c0a617032a665e154aa73c88f92a3bdf3 Mon Sep 17 00:00:00 2001
From: Preston Vasquez
Date: Fri, 27 Jun 2025 16:49:56 -0600
Subject: [PATCH 20/23] Update test to align with prose tests

---
 internal/integration/cmap_prose_test.go | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/internal/integration/cmap_prose_test.go b/internal/integration/cmap_prose_test.go
index 758a338f88..ec95296564 100644
--- a/internal/integration/cmap_prose_test.go
+++ b/internal/integration/cmap_prose_test.go
@@ -36,10 +36,9 @@ func TestCMAPProse_PendingResponse_ConnectionAliveness(t *testing.T) {
 		t.Skip("Skipping test because MONGO_PROXY_URI is not set")
 	}
 
-	//proxyURI := "mongodb://127.0.0.1:28017/?directConnection=true"
+	// Create a direct connection to the proxy server.
 	proxyURI := os.Getenv("MONGO_PROXY_URI")
 
-	// Establish a direct connection to the proxy via mtest.
 	clientOpts := options.Client().ApplyURI(proxyURI).SetMaxPoolSize(1).SetDirect(true)
 
 	mt := mtest.New(t, mtest.NewOptions().ClientOptions(clientOpts).ClientType(mtest.MongoProxy))
@@ -83,9 +82,9 @@ func TestCMAPProse_PendingResponse_ConnectionAliveness(t *testing.T) {
 	// Wait 3 seconds to ensure there is time left in the pending response state.
 	time.Sleep(3 * time.Second)
 
-	//Run an insertOne without a timeout. Expect the pending response to fail
-	//at the aliveness check.
However, the insert should succeed since pending - //response failures are retryable. + // Run an insertOne without a timeout. Expect the pending response to fail + // at the aliveness check. However, the insert should succeed since pending + // response failures are retryable. _, err = coll.InsertOne(context.Background(), myStruct{Name: "Bob", Age: 25}) require.NoError(t, err, "expected insertOne to succeed after pending response aliveness check") @@ -110,11 +109,7 @@ func TestCMAPProse_PendingResponse_ConnectionAliveness(t *testing.T) { {Key: "actions", Value: bson.A{ // Causes the timeout in the initial try. bson.D{{Key: "delayMs", Value: 400}}, - //// Send exactly one byte so that the aliveness check succeeds. - bson.D{{Key: "sendBytes", Value: 1}}, - //// Cause another delay for the retry operation. - bson.D{{Key: "delayMs", Value: 10}}, - //// Send the rest of the response for discarding on retry. + // Send the rest of the response for discarding on retry. bson.D{{Key: "sendAll", Value: true}}, }}, } From 9d7d6014e6ada89a2e760fdc616eda86b070dc7c Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Mon, 30 Jun 2025 12:37:21 -0600 Subject: [PATCH 21/23] Return raw error --- x/mongo/driver/operation.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x/mongo/driver/operation.go b/x/mongo/driver/operation.go index d322b0c1ae..d4a23c8293 100644 --- a/x/mongo/driver/operation.go +++ b/x/mongo/driver/operation.go @@ -650,7 +650,7 @@ func (op Operation) Execute(ctx context.Context) error { if err != nil { // If the returned error is a context error, return it immediately. if ctx.Err() != nil { - err = ctx.Err() + return err } // If the returned error is retryable and there are retries remaining (negative From 3e1a00907461e7727032eefcdec00f59d6592b6c Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Tue, 1 Jul 2025 13:00:22 -0600 Subject: [PATCH 22/23] Make CSOT times windows-friendly --- internal/integration/client_test.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/internal/integration/client_test.go b/internal/integration/client_test.go index c500cd2217..f495c40698 100644 --- a/internal/integration/client_test.go +++ b/internal/integration/client_test.go @@ -687,7 +687,7 @@ func TestClient(t *testing.T) { Data: failpoint.Data{ FailCommands: []string{"find", "insert"}, BlockConnection: true, - BlockTimeMS: 500, + BlockTimeMS: 2000, }, }) @@ -697,13 +697,13 @@ func TestClient(t *testing.T) { wg.Add(50) for i := 0; i < 50; i++ { - // Run 50 concurrent operations, each with a timeout of 50ms. Expect + // Run 50 concurrent operations, each with a timeout of 1s. Expect // them to all return a timeout error because the failpoint - // blocks find operations for 50ms. Run 50 to increase the + // blocks find operations for 2s. Run 50 to increase the // probability that an operation will time out in a way that // can cause a retry. go func() { - ctx, cancel := context.WithTimeout(context.Background(), 150*time.Millisecond) + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) err := tc.operation(ctx, mt.Coll) cancel() assert.ErrorIs(mt, err, context.DeadlineExceeded) @@ -733,6 +733,10 @@ func TestClient(t *testing.T) { "expected exactly 1 command started event per operation (50), but got %d", len(evts)+pendingReadConns) mt.ClearEvents() + + // Wait for the failpoint to be unblocked. 
+ time.Sleep(2 * time.Second) + mt.ClearFailPoints() }) } From ca291b805b6e9ed4bb511c2c80ab12945c0c34a6 Mon Sep 17 00:00:00 2001 From: Preston Vasquez Date: Wed, 2 Jul 2025 23:22:55 -0600 Subject: [PATCH 23/23] GODRIVER-3173 Update prose tests --- .evergreen/config.yml | 92 +++++++------ etc/setup-test.sh | 176 +++++++++++++----------- internal/integration/cmap_prose_test.go | 149 +++++++++++++++----- x/mongo/driver/topology/connection.go | 15 +- x/mongo/driver/topology/pool.go | 15 +- x/mongo/driver/topology/pool_test.go | 3 +- 6 files changed, 282 insertions(+), 168 deletions(-) diff --git a/.evergreen/config.yml b/.evergreen/config.yml index 5e5fc0a519..a72905b43b 100644 --- a/.evergreen/config.yml +++ b/.evergreen/config.yml @@ -26,7 +26,7 @@ timeout: args: [ls, -la] functions: - assume-test-secrets-ec2-role: + assume-test-secrets-ec2-role: - command: ec2.assume_role params: role_arn: ${aws_test_secrets_role} @@ -121,21 +121,21 @@ functions: params: binary: bash add_expansions_to_env: true - args: + args: - ${DRIVERS_TOOLS}/.evergreen/atlas_data_lake/pull-mongohouse-image.sh - command: subprocess.exec params: binary: bash - args: + args: - ${DRIVERS_TOOLS}/.evergreen/atlas_data_lake/run-mongohouse-image.sh bootstrap-mongo-orchestration: - command: subprocess.exec params: binary: bash - env: + env: MONGODB_VERSION: ${VERSION} - include_expansions_in_env: [TOPOLOGY, AUTH, SSL, ORCHESTRATION_FILE, + include_expansions_in_env: [TOPOLOGY, AUTH, SSL, ORCHESTRATION_FILE, REQUIRE_API_VERSION, LOAD_BALANCER] args: ["${DRIVERS_TOOLS}/.evergreen/run-orchestration.sh"] - command: expansions.update @@ -146,7 +146,7 @@ functions: - command: subprocess.exec params: binary: bash - env: + env: MONGODB_VERSION: ${VERSION} include_expansions_in_env: [TOPOLOGY, AUTH, SSL, ORCHESTRATION_FILE] args: ["${DRIVERS_TOOLS}/.evergreen/run-orchestration.sh"] @@ -158,23 +158,23 @@ functions: - command: subprocess.exec params: binary: bash - args: + args: # Ensure the instance profile is reassigned for aws tests. 
- ${DRIVERS_TOOLS}/.evergreen/auth_aws/teardown.sh - command: subprocess.exec params: binary: bash - args: + args: - ${DRIVERS_TOOLS}/.evergreen/csfle/teardown.sh - command: subprocess.exec params: binary: bash - args: + args: - ${DRIVERS_TOOLS}/.evergreen/ocsp/teardown.sh - command: subprocess.exec params: binary: bash - args: + args: - ${DRIVERS_TOOLS}/.evergreen/teardown.sh assume-test-secrets-ec2-role: @@ -200,7 +200,19 @@ functions: binary: bash env: GO_BUILD_TAGS: cse - include_expansions_in_env: ["TOPOLOGY", "AUTH", "SSL", "SKIP_CSOT_TESTS", "MONGODB_URI", "CRYPT_SHARED_LIB_PATH", "SKIP_CRYPT_SHARED_LIB", "RACE", "MONGO_GO_DRIVER_COMPRESSOR", "REQUIRE_API_VERSION", "LOAD_BALANCER"] + include_expansions_in_env: + - "TOPOLOGY" + - "AUTH" + - "SSL" + - "SKIP_CSOT_TESTS" + - "MONGODB_URI" + - "CRYPT_SHARED_LIB_PATH" + - "SKIP_CRYPT_SHARED_LIB" + - "RACE" + - "MONGO_GO_DRIVER_COMPRESSOR" + - "REQUIRE_API_VERSION" + - "LOAD_BALANCER" + - "MONGOPROXY" args: [*task-runner, setup-test] - command: subprocess.exec type: test @@ -294,7 +306,7 @@ functions: params: binary: bash include_expansions_in_env: [AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN] - env: + env: TEST_ENTERPRISE_AUTH: plain args: [*task-runner, setup-test] - command: subprocess.exec @@ -312,7 +324,7 @@ functions: params: binary: bash include_expansions_in_env: [AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN] - env: + env: TEST_ENTERPRISE_AUTH: gssapi args: [*task-runner, setup-test] - command: subprocess.exec @@ -344,7 +356,7 @@ functions: type: test params: binary: bash - env: + env: TOPOLOGY: server AUTH: auth SSL: ssl @@ -362,9 +374,9 @@ functions: type: test params: binary: bash - env: + env: GO_BUILD_TAGS: cse - include_expansions_in_env: [AUTH, SSL, MONGODB_URI, TOPOLOGY, MONGO_GO_DRIVER_COMPRESSOR, + include_expansions_in_env: [AUTH, SSL, MONGODB_URI, TOPOLOGY, MONGO_GO_DRIVER_COMPRESSOR, REQUIRE_API_VERSION, SKIP_CRYPT_SHARED_LIB, CRYPT_SHARED_LIB_PATH] args: [*task-runner, setup-test] - command: subprocess.exec @@ -379,7 +391,7 @@ functions: params: binary: bash include_expansions_in_env: [SINGLE_MONGOS_LB_URI, MULTI_MONGOS_LB_URI, AUTH, SSL, MONGO_GO_DRIVER_COMPRESSOR] - env: + env: LOAD_BALANCER: "true" args: [*task-runner, setup-test] - command: subprocess.exec @@ -393,7 +405,7 @@ functions: type: test params: binary: "bash" - env: + env: AUTH: auth SSL: nossl TOPOLOGY: server @@ -417,7 +429,7 @@ functions: type: test params: binary: "bash" - env: + env: TOPOLOGY: sharded_cluster TASKFILE_TARGET: test-short args: [*task-runner, run-docker] @@ -477,7 +489,7 @@ functions: params: include_expansions_in_env: ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"] binary: "bash" - args: + args: - ${DRIVERS_TOOLS}/.evergreen/auth_aws/setup-secrets.sh run-aws-auth-test-with-regular-aws-credentials: @@ -502,7 +514,7 @@ functions: - command: subprocess.exec type: test params: - binary: bash + binary: bash include_expansions_in_env: [SKIP_EC2_AUTH_TEST] env: AWS_TEST: ec2 @@ -547,7 +559,7 @@ functions: type: test params: binary: bash - env: + env: AWS_ROLE_SESSION_NAME: test AWS_TEST: web-identity include_expansions_in_env: [SKIP_WEB_IDENTITY_AUTH_TEST] @@ -586,9 +598,9 @@ functions: type: test params: binary: "bash" - env: + env: GO_BUILD_TAGS: cse - include_expansions_in_env: [AUTH, SSL, MONGODB_URI, TOPOLOGY, + include_expansions_in_env: [AUTH, SSL, MONGODB_URI, TOPOLOGY, MONGO_GO_DRIVER_COMPRESSOR] args: [*task-runner, setup-test] - command: subprocess.exec @@ -603,9 +615,9 @@ 
functions: type: test params: binary: "bash" - env: + env: GO_BUILD_TAGS: cse - include_expansions_in_env: [AUTH, SSL, MONGODB_URI, TOPOLOGY, + include_expansions_in_env: [AUTH, SSL, MONGODB_URI, TOPOLOGY, MONGO_GO_DRIVER_COMPRESSOR] args: [*task-runner, setup-test] - command: subprocess.exec @@ -621,9 +633,9 @@ functions: type: test params: binary: "bash" - env: + env: GO_BUILD_TAGS: cse - include_expansions_in_env: [AUTH, SSL, MONGODB_URI, TOPOLOGY, + include_expansions_in_env: [AUTH, SSL, MONGODB_URI, TOPOLOGY, MONGO_GO_DRIVER_COMPRESSOR] args: [*task-runner, setup-test] - command: subprocess.exec @@ -662,14 +674,14 @@ tasks: allowed_requesters: ["patch", "github_pr"] commands: - func: assume-test-secrets-ec2-role - - func: "add PR reviewer" + - func: "add PR reviewer" - func: "add PR labels" - func: "create-api-report" - name: backport-pr allowed_requesters: ["commit"] commands: - - func: "backport pr" + - func: "backport pr" - name: perf tags: ["performance"] @@ -703,6 +715,7 @@ tasks: TOPOLOGY: "server" AUTH: "noauth" SSL: "nossl" + MONGOPROXY: "true" - name: test-standalone-noauth-nossl-snappy-compression tags: ["test", "standalone", "compression", "snappy"] @@ -720,7 +733,6 @@ tasks: AUTH: "noauth" SSL: "nossl" MONGO_GO_DRIVER_COMPRESSOR: "snappy" - MONGO_PROXY_URI: "mongodb://127.0.0.1:28017/?directConnection=true" - name: test-standalone-noauth-nossl-zlib-compression tags: ["test", "standalone", "compression", "zlib"] @@ -764,7 +776,7 @@ tasks: TOPOLOGY: "server" AUTH: "auth" SSL: "ssl" - - func: start-cse-servers + - func: start-cse-servers #- func: start-mongoproxy - func: run-tests vars: @@ -813,7 +825,7 @@ tasks: TOPOLOGY: "server" AUTH: "auth" SSL: "ssl" - - func: start-cse-servers + - func: start-cse-servers #- func: start-mongoproxy - func: run-tests vars: @@ -1489,7 +1501,7 @@ tasks: vars: MONGO_GO_DRIVER_COMPRESSOR: "snappy" - # Build the compilecheck submodule with all supported versions of Go >= + # Build the compilecheck submodule with all supported versions of Go >= # the minimum supported version. - name: go-build tags: ["compile-check"] @@ -1687,7 +1699,7 @@ tasks: params: binary: "bash" add_expansions_to_env: true - env: + env: EXPECT_ERROR: 'status=400' args: [*task-runner, test-awskms] @@ -1943,13 +1955,13 @@ task_groups: params: binary: "bash" add_expansions_to_env: true - args: + args: - ${DRIVERS_TOOLS}/.evergreen/csfle/gcpkms/setup.sh teardown_group: - command: subprocess.exec params: binary: "bash" - args: + args: - ${DRIVERS_TOOLS}/.evergreen/csfle/gcpkms/teardown.sh - func: teardown - func: handle-test-artifacts @@ -1967,7 +1979,7 @@ task_groups: params: binary: bash add_expansions_to_env: true - env: + env: AZUREKMS_VMNAME_PREFIX: GODRIVER args: - ${DRIVERS_TOOLS}/.evergreen/csfle/azurekms/setup.sh @@ -1975,7 +1987,7 @@ task_groups: - command: subprocess.exec params: binary: "bash" - args: + args: - ${DRIVERS_TOOLS}/.evergreen/csfle/azurekms/teardown.sh - func: teardown - func: handle-test-artifacts @@ -2198,7 +2210,7 @@ buildvariants: - rhel8.7-small expansions: GO_DIST: "/opt/golang/go1.23" - tasks: + tasks: - name: "backport-pr" - name: atlas-test diff --git a/etc/setup-test.sh b/etc/setup-test.sh index 8f31e301a4..6ce2da635a 100755 --- a/etc/setup-test.sh +++ b/etc/setup-test.sh @@ -10,108 +10,117 @@ RACE=${RACE:-} SERVERLESS=${SERVERLESS:-} LOAD_BALANCER=${LOAD_BALANCER:-} MONGODB_URI=${MONGODB_URI:-} +MONGOPROXY=${MONGOPROXY:-} +MONGO_PROX_URI=${MONGO_PROX_URI:-} # Handle special cases first. if [ -n "${TEST_ENTERPRISE_AUTH:-}" ]; then - . 
$DRIVERS_TOOLS/.evergreen/secrets_handling/setup-secrets.sh drivers/enterprise_auth - AUTH="auth" - case $TEST_ENTERPRISE_AUTH in - plain) - MONGODB_URI="mongodb://${SASL_USER}:${SASL_PASS}@${SASL_HOST}:${SASL_PORT}/ldap?authMechanism=PLAIN" - ;; - gssapi) - if [ "Windows_NT" = "${OS:-}" ]; then - MONGODB_URI="mongodb://${PRINCIPAL/@/%40}:${SASL_PASS}@${SASL_HOST}:${SASL_PORT}/kerberos?authMechanism=GSSAPI" - else - echo ${KEYTAB_BASE64} | base64 -d > ${PROJECT_DIRECTORY}/.evergreen/drivers.keytab - mkdir -p ~/.krb5 - cat .evergreen/krb5.config | tee -a ~/.krb5/config - kinit -k -t .evergreen/drivers.keytab -p "${PRINCIPAL}" - MONGODB_URI="mongodb://${PRINCIPAL/@/%40}@${SASL_HOST}:${SASL_PORT}/kerberos?authMechanism=GSSAPI" - fi - ;; - esac - rm secrets-export.sh + . $DRIVERS_TOOLS/.evergreen/secrets_handling/setup-secrets.sh drivers/enterprise_auth + AUTH="auth" + case $TEST_ENTERPRISE_AUTH in + plain) + MONGODB_URI="mongodb://${SASL_USER}:${SASL_PASS}@${SASL_HOST}:${SASL_PORT}/ldap?authMechanism=PLAIN" + ;; + gssapi) + if [ "Windows_NT" = "${OS:-}" ]; then + MONGODB_URI="mongodb://${PRINCIPAL/@/%40}:${SASL_PASS}@${SASL_HOST}:${SASL_PORT}/kerberos?authMechanism=GSSAPI" + else + echo ${KEYTAB_BASE64} | base64 -d >${PROJECT_DIRECTORY}/.evergreen/drivers.keytab + mkdir -p ~/.krb5 + cat .evergreen/krb5.config | tee -a ~/.krb5/config + kinit -k -t .evergreen/drivers.keytab -p "${PRINCIPAL}" + MONGODB_URI="mongodb://${PRINCIPAL/@/%40}@${SASL_HOST}:${SASL_PORT}/kerberos?authMechanism=GSSAPI" + fi + ;; + esac + rm secrets-export.sh fi if [ -n "${SERVERLESS}" ]; then - . $DRIVERS_TOOLS/.evergreen/serverless/secrets-export.sh - MONGODB_URI="${SERVERLESS_URI}" - AUTH="auth" + . $DRIVERS_TOOLS/.evergreen/serverless/secrets-export.sh + MONGODB_URI="${SERVERLESS_URI}" + AUTH="auth" fi if [ -n "${TEST_ATLAS_CONNECT:-}" ]; then - . $DRIVERS_TOOLS/.evergreen/secrets_handling/setup-secrets.sh drivers/atlas_connect + . $DRIVERS_TOOLS/.evergreen/secrets_handling/setup-secrets.sh drivers/atlas_connect fi if [ -n "${LOAD_BALANCER}" ]; then - # Verify that the required LB URI expansions are set to ensure that the test runner can correctly connect to - # the LBs. - if [ -z "${SINGLE_MONGOS_LB_URI}" ]; then - echo "SINGLE_MONGOS_LB_URI must be set for testing against LBs" - exit 1 - fi - if [ -z "${MULTI_MONGOS_LB_URI}" ]; then - echo "MULTI_MONGOS_LB_URI must be set for testing against LBs" - exit 1 - fi - MONGODB_URI="${SINGLE_MONGOS_LB_URI}" + # Verify that the required LB URI expansions are set to ensure that the test runner can correctly connect to + # the LBs. + if [ -z "${SINGLE_MONGOS_LB_URI}" ]; then + echo "SINGLE_MONGOS_LB_URI must be set for testing against LBs" + exit 1 + fi + if [ -z "${MULTI_MONGOS_LB_URI}" ]; then + echo "MULTI_MONGOS_LB_URI must be set for testing against LBs" + exit 1 + fi + MONGODB_URI="${SINGLE_MONGOS_LB_URI}" +fi + +if [ -n "${MONGOPROXY}" ]; then + echo "MONGOPROXY is set, using the proxy for qualifying tests." + # If MONGOPROXY is set, we assume that the user wants to use the proxy for all tests. + # This is useful for testing the proxy itself. 
+ MONGO_PROXY_URI="mongodb://127.0.0.1:28017/?directConnection=true" fi if [ -n "${OCSP_ALGORITHM:-}" ]; then - MONGO_GO_DRIVER_CA_FILE="${DRIVERS_TOOLS}/.evergreen/ocsp/${OCSP_ALGORITHM}/ca.pem" - if [ "Windows_NT" = "$OS" ]; then - MONGO_GO_DRIVER_CA_FILE=$(cygpath -m $MONGO_GO_DRIVER_CA_FILE) - fi + MONGO_GO_DRIVER_CA_FILE="${DRIVERS_TOOLS}/.evergreen/ocsp/${OCSP_ALGORITHM}/ca.pem" + if [ "Windows_NT" = "$OS" ]; then + MONGO_GO_DRIVER_CA_FILE=$(cygpath -m $MONGO_GO_DRIVER_CA_FILE) + fi fi # Handle encryption. if [[ "${GO_BUILD_TAGS}" =~ cse ]]; then - # Install libmongocrypt if needed. - task install-libmongocrypt - - # Handle libmongocrypt paths. - PKG_CONFIG_PATH=$(pwd)/install/libmongocrypt/lib64/pkgconfig - LD_LIBRARY_PATH=$(pwd)/install/libmongocrypt/lib64 - - if [ "$(uname -s)" = "Darwin" ]; then - PKG_CONFIG_PATH=$(pwd)/install/libmongocrypt/lib/pkgconfig - DYLD_FALLBACK_LIBRARY_PATH=$(pwd)/install/libmongocrypt/lib - fi - - if [ "${SKIP_CRYPT_SHARED_LIB:-''}" = "true" ]; then - CRYPT_SHARED_LIB_PATH="" - echo "crypt_shared library is skipped" - elif [ -z "${CRYPT_SHARED_LIB_PATH:-}" ]; then - echo "crypt_shared library path is empty" - else - echo "crypt_shared library will be loaded from path: $CRYPT_SHARED_LIB_PATH" - fi + # Install libmongocrypt if needed. + task install-libmongocrypt + + # Handle libmongocrypt paths. + PKG_CONFIG_PATH=$(pwd)/install/libmongocrypt/lib64/pkgconfig + LD_LIBRARY_PATH=$(pwd)/install/libmongocrypt/lib64 + + if [ "$(uname -s)" = "Darwin" ]; then + PKG_CONFIG_PATH=$(pwd)/install/libmongocrypt/lib/pkgconfig + DYLD_FALLBACK_LIBRARY_PATH=$(pwd)/install/libmongocrypt/lib + fi + + if [ "${SKIP_CRYPT_SHARED_LIB:-''}" = "true" ]; then + CRYPT_SHARED_LIB_PATH="" + echo "crypt_shared library is skipped" + elif [ -z "${CRYPT_SHARED_LIB_PATH:-}" ]; then + echo "crypt_shared library path is empty" + else + echo "crypt_shared library will be loaded from path: $CRYPT_SHARED_LIB_PATH" + fi fi # Handle the build tags argument. if [ -n "${GO_BUILD_TAGS}" ]; then - BUILD_TAGS="${RACE} --tags=${GO_BUILD_TAGS}" + BUILD_TAGS="${RACE} --tags=${GO_BUILD_TAGS}" else - BUILD_TAGS="${RACE}" + BUILD_TAGS="${RACE}" fi # Handle certificates. 
if [ "$SSL" != "nossl" ] && [ -z "${SERVERLESS}" ] && [ -z "${OCSP_ALGORITHM:-}" ]; then - MONGO_GO_DRIVER_CA_FILE="$DRIVERS_TOOLS/.evergreen/x509gen/ca.pem" - MONGO_GO_DRIVER_KEY_FILE="$DRIVERS_TOOLS/.evergreen/x509gen/client.pem" - MONGO_GO_DRIVER_PKCS8_ENCRYPTED_KEY_FILE="$DRIVERS_TOOLS/.evergreen/x509gen/client-pkcs8-encrypted.pem" - MONGO_GO_DRIVER_PKCS8_UNENCRYPTED_KEY_FILE="$DRIVERS_TOOLS/.evergreen/x509gen/client-pkcs8-unencrypted.pem" - - if [ "Windows_NT" = "$OS" ]; then - MONGO_GO_DRIVER_CA_FILE=$(cygpath -m $MONGO_GO_DRIVER_CA_FILE) - MONGO_GO_DRIVER_KEY_FILE=$(cygpath -m $MONGO_GO_DRIVER_KEY_FILE) - MONGO_GO_DRIVER_PKCS8_ENCRYPTED_KEY_FILE=$(cygpath -m $MONGO_GO_DRIVER_PKCS8_ENCRYPTED_KEY_FILE) - MONGO_GO_DRIVER_PKCS8_UNENCRYPTED_KEY_FILE=$(cygpath -m $MONGO_GO_DRIVER_PKCS8_UNENCRYPTED_KEY_FILE) - fi + MONGO_GO_DRIVER_CA_FILE="$DRIVERS_TOOLS/.evergreen/x509gen/ca.pem" + MONGO_GO_DRIVER_KEY_FILE="$DRIVERS_TOOLS/.evergreen/x509gen/client.pem" + MONGO_GO_DRIVER_PKCS8_ENCRYPTED_KEY_FILE="$DRIVERS_TOOLS/.evergreen/x509gen/client-pkcs8-encrypted.pem" + MONGO_GO_DRIVER_PKCS8_UNENCRYPTED_KEY_FILE="$DRIVERS_TOOLS/.evergreen/x509gen/client-pkcs8-unencrypted.pem" + + if [ "Windows_NT" = "$OS" ]; then + MONGO_GO_DRIVER_CA_FILE=$(cygpath -m $MONGO_GO_DRIVER_CA_FILE) + MONGO_GO_DRIVER_KEY_FILE=$(cygpath -m $MONGO_GO_DRIVER_KEY_FILE) + MONGO_GO_DRIVER_PKCS8_ENCRYPTED_KEY_FILE=$(cygpath -m $MONGO_GO_DRIVER_PKCS8_ENCRYPTED_KEY_FILE) + MONGO_GO_DRIVER_PKCS8_UNENCRYPTED_KEY_FILE=$(cygpath -m $MONGO_GO_DRIVER_PKCS8_UNENCRYPTED_KEY_FILE) + fi fi -cat < .test.env +cat <.test.env AUTH="${AUTH:-}" SSL="${SSL}" MONGO_GO_DRIVER_CA_FILE="${MONGO_GO_DRIVER_CA_FILE:-}" @@ -129,27 +138,28 @@ PKG_CONFIG_PATH="${PKG_CONFIG_PATH:-}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}" MACOS_LIBRARY_PATH="${DYLD_FALLBACK_LIBRARY_PATH:-}" SKIP_CSOT_TESTS=${SKIP_CSOT_TESTS:-} +MONGO_PROXY_URI="${MONGO_PROXY_URI:-}" EOT if [ -n "${MONGODB_URI}" ]; then - echo "MONGODB_URI=\"${MONGODB_URI}\"" >> .test.env + echo "MONGODB_URI=\"${MONGODB_URI}\"" >>.test.env fi if [ -n "${SERVERLESS}" ]; then - echo "SERVERLESS_ATLAS_USER=$SERVERLESS_ATLAS_USER" >> .test.env - echo "SERVERLESS_ATLAS_PASSWORD=$SERVERLESS_ATLAS_PASSWORD" >> .test.env + echo "SERVERLESS_ATLAS_USER=$SERVERLESS_ATLAS_USER" >>.test.env + echo "SERVERLESS_ATLAS_PASSWORD=$SERVERLESS_ATLAS_PASSWORD" >>.test.env fi -if [ -n "${LOAD_BALANCER}" ];then - echo "SINGLE_MONGOS_LB_URI=${SINGLE_MONGOS_LB_URI}" >> .test.env - echo "MULTI_MONGOS_LB_URI=${MULTI_MONGOS_LB_URI}" >> .test.env +if [ -n "${LOAD_BALANCER}" ]; then + echo "SINGLE_MONGOS_LB_URI=${SINGLE_MONGOS_LB_URI}" >>.test.env + echo "MULTI_MONGOS_LB_URI=${MULTI_MONGOS_LB_URI}" >>.test.env fi # Add secrets to the test file. if [ -f "secrets-export.sh" ]; then - while read p; do - if [[ "$p" =~ ^export ]]; then - echo "$p" | sed 's/export //' >> .test.env - fi - done >.test.env + fi + done 127.0.0.1:.*: i\/o timeout$`, ) assert.True(t, regex.MatchString(err.Error()), "error %q does not match pattern %q", err, regex) - assert.Nil(t, conn.pendingResponseState, "conn.awaitRemainingBytes should be nil") + assert.NotNil(t, conn.pendingResponseState) + assert.Len(t, conn.pendingResponseState.sizeBytesReadBeforeSocketTimeout, 3) }) t.Run("timeout reading message header, successful background read", func(t *testing.T) { timeout := 10 * time.Millisecond