router: backoff on storage being disabled

Gerold103 · Gerold103 · commit 2eb17c4e0e1c · 2021-12-21T00:33:09.000+01:00
If a storage reports that it is disabled, it will probably take some time before it can accept new requests.

This patch makes the STORAGE_IS_DISABLED error put the connection into backoff, in line with the 'access denied' and 'no such function' errors, because the reason for all three is the same - the storage is not ready to accept requests yet. Such requests are now transparently retried.

Closes #298

@TarantoolBot document
Title: vshard.storage.enable/disable()

`vshard.storage.disable()` makes most of the `vshard.storage` functions throw an error. The error is raised as a Lua exception, not returned via the `nil, err` pattern.

`vshard.storage.enable()` reverts the disable.

By default the storage is enabled.

Additionally, the storage is forcefully disabled automatically until `vshard.storage.cfg()` has finished and the instance has finished recovery (its `box.info.status` is `'running'`, for example).

Auto-disable protects from usage of vshard functions before the storage's global state is fully created.

Manual `vshard.storage.disable()` helps to achieve the same for the user's application. For instance, a user might want to do some preparatory work after `vshard.storage.cfg` before the application is ready for requests. Then the flow would be:

```Lua
vshard.storage.disable()
vshard.storage.cfg(...)
-- Do your preparatory work here ...
vshard.storage.enable()
```

The routers handle the errors signaling that the storage is disabled in a special way. They put connections to such instances into a backoff state for some time and try to use other replicas. For example, assume a replicaset has replicas 'replica_1' and 'replica_2'. Assume 'replica_1' is disabled for any reason. If a router tries to talk to 'replica_1', it will get a special error and will transparently retry the request on 'replica_2'.

When 'replica_1' is enabled again, the router will notice it too and will send requests to it again.

It all works exclusively for read-only requests.
Read-write requests can only be sent to a master, which is one per replicaset. They are not retried.
diff --git a/test/router/router2.result b/test/router/router2.result
@@ -570,6 +570,98 @@ assert(not ok and err.message:match('Unknown mode') ~= nil)
  | - true
  | ...
 
+--
+-- Storage is disabled = backoff.
+--
+test_run:switch('storage_2_a')
+ | ---
+ | - true
+ | ...
+vshard.storage.disable()
+ | ---
+ | ...
+
+test_run:switch('router_1')
+ | ---
+ | - true
+ | ...
+-- Drop old backoffs.
+fiber.sleep(vshard.consts.REPLICA_BACKOFF_INTERVAL)
+ | ---
+ | ...
+-- Success, but internally the request was retried.
+res, err = vshard.router.callro(1, 'echo', {100}, long_timeout)
+ | ---
+ | ...
+assert(res == 100)
+ | ---
+ | - true
+ | ...
+-- The best replica entered backoff state.
+util = require('util')
+ | ---
+ | ...
+storage_2 = vshard.router.static.replicasets[replicasets[2]]
+ | ---
+ | ...
+storage_2_a = storage_2.replicas[util.name_to_uuid.storage_2_a]
+ | ---
+ | ...
+assert(storage_2_a.backoff_ts ~= nil)
+ | ---
+ | - true
+ | ...
+
+test_run:switch('storage_2_b')
+ | ---
+ | - true
+ | ...
+assert(echo_count == 1)
+ | ---
+ | - true
+ | ...
+echo_count = 0
+ | ---
+ | ...
+
+test_run:switch('storage_2_a')
+ | ---
+ | - true
+ | ...
+assert(echo_count == 0)
+ | ---
+ | - true
+ | ...
+vshard.storage.enable()
+ | ---
+ | ...
+
+test_run:switch('router_1')
+ | ---
+ | - true
+ | ...
+-- Drop the backoff.
+fiber.sleep(vshard.consts.REPLICA_BACKOFF_INTERVAL)
+ | ---
+ | ...
+-- Now goes to the best replica - it is enabled again.
+res, err = vshard.router.callro(1, 'echo', {100}, long_timeout)
+ | ---
+ | ...
+assert(res == 100)
+ | ---
+ | - true
+ | ...
+
+test_run:switch('storage_2_a')
+ | ---
+ | - true
+ | ...
+assert(echo_count == 1)
+ | ---
+ | - true
+ | ...
+
 _ = test_run:switch("default")
  | ---
  | ...
diff --git a/test/router/router2.test.lua b/test/router/router2.test.lua
@@ -226,6 +226,42 @@ ok, err = rs:callro('vshard.storage.call', {1, 'badmode', 'echo', {100}},
                     long_timeout)
 assert(not ok and err.message:match('Unknown mode') ~= nil)
 
+--
+-- Storage is disabled = backoff.
+--
+test_run:switch('storage_2_a')
+vshard.storage.disable()
+
+test_run:switch('router_1')
+-- Drop old backoffs.
+fiber.sleep(vshard.consts.REPLICA_BACKOFF_INTERVAL)
+-- Success, but internally the request was retried.
+res, err = vshard.router.callro(1, 'echo', {100}, long_timeout)
+assert(res == 100)
+-- The best replica entered backoff state.
+util = require('util')
+storage_2 = vshard.router.static.replicasets[replicasets[2]]
+storage_2_a = storage_2.replicas[util.name_to_uuid.storage_2_a]
+assert(storage_2_a.backoff_ts ~= nil)
+
+test_run:switch('storage_2_b')
+assert(echo_count == 1)
+echo_count = 0
+
+test_run:switch('storage_2_a')
+assert(echo_count == 0)
+vshard.storage.enable()
+
+test_run:switch('router_1')
+-- Drop the backoff.
+fiber.sleep(vshard.consts.REPLICA_BACKOFF_INTERVAL)
+-- Now goes to the best replica - it is enabled again.
+res, err = vshard.router.callro(1, 'echo', {100}, long_timeout)
+assert(res == 100)
+
+test_run:switch('storage_2_a')
+assert(echo_count == 1)
+
 _ = test_run:switch("default")
 _ = test_run:cmd("stop server router_1")
 _ = test_run:cmd("cleanup server router_1")
diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua
@@ -347,9 +347,21 @@ local function replica_call(replica, func, args, opts)
         if opts.timeout >= replica.net_timeout then
             replica_on_failed_request(replica)
         end
+        local err = storage_status
+        -- VShard functions can throw exceptions using error() function. When
+        -- it reaches the network layer, it is wrapped into LuajitError. Try to
+        -- extract the original error if this is the case. It is not always
+        -- possible - the string representation could be truncated.
+        --
+        -- In old Tarantool versions LuajitError turned into ClientError on the
+        -- client. Check both types.
+        if func:startswith('vshard.') and (err.type == 'LuajitError' or
+           err.type == 'ClientError') then
+            err = lerror.from_string(err.message) or err
+        end
         log.error("Exception during calling '%s' on '%s': %s", func, replica,
-                  storage_status)
-        return false, nil, lerror.make(storage_status)
+                  err)
+        return false, nil, lerror.make(err)
     else
         replica_on_success_request(replica)
     end
@@ -472,6 +484,9 @@ local function can_backoff_after_error(e, func)
             return e.message:startswith("Procedure 'vshard.")
         end
     end
+    if e.type == 'ShardingError' then
+        return e.code == vshard.error.code.STORAGE_IS_DISABLED
+    end
     return false
 end