diff --git a/README.md b/README.md
index 9b5db4f1..7b7003be 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,8 @@
 lockfile: /var/run/mysync/mysync.lock
 emergefile: /var/run/mysync/mysync.emerge
 resetupfile: /var/run/mysync/mysync.resetup
+sql_modify_timeout: 30s
+
 resetup_crashed_hosts: False
 db_timeout: 2s
 db_lost_check_timeout: 5s
@@ -65,8 +67,6 @@ critical_disk_usage: 95.00
 not_critical_disk_usage: 94.76
 disable_semi_sync_replication_on_maintenance: true
 keep_super_writable_on_critical_disk_usage: true
-db_set_ro_timeout: 30s
-db_set_ro_force_timeout: 60s
 priority_choice_max_lag: 60s
 offline_mode_enable_interval: 900s
 offline_mode_enable_lag: 86400s
diff --git a/internal/app/app.go b/internal/app/app.go
index 68e4f1b5..31608b1b 100644
--- a/internal/app/app.go
+++ b/internal/app/app.go
@@ -760,7 +760,7 @@ func (app *App) checkQuorum(clusterStateFromDB, clusterStateDcs map[string]*node
 		}
 	}
 
-	managerElectionDelayAfterQuorumLoss := app.config.ManagerElectionDelayAfterQuorumLoss
+	managerElectionDelay := app.config.ManagerElectionDelayAfterQuorumLoss
 
 	if workingHANodesCount > 0 && visibleHAHostsCount <= (workingHANodesCount-1)/2 {
 		app.logger.Infof("manager lost quorum (%d/%d visible HAHosts)", visibleHAHostsCount, workingHANodesCount)
@@ -769,13 +769,13 @@
 		if app.lostQuorumTime.IsZero() {
 			app.lostQuorumTime = time.Now()
 		} else {
-			// Lost quorum less than 15 (default WaitingRecoveryNetworkTimestamp) seconds ago
+			// Lost quorum less than 30 (default ManagerElectionDelayAfterQuorumLoss) seconds ago
 			// Just wait manager recover connection
-			if lostQuorumDuration <= managerElectionDelayAfterQuorumLoss {
+			if lostQuorumDuration <= managerElectionDelay {
 				app.logger.Warnf("Quorum loss ongoing (%0.2fs): manager wait for network recovery", lostQuorumDuration.Seconds())
-			// Lost quorum more than 15 (default WaitingRecoveryNetworkTimestamp) seconds ago
-			// Manager should release lock and dont acquire lock for 45 (default ManagerElectionDelayAfterQuorumLoss) seconds
-			} else if lostQuorumDuration > managerElectionDelayAfterQuorumLoss {
+			// Lost quorum more than 30 (default ManagerElectionDelayAfterQuorumLoss) seconds ago
+			// Manager should release the lock and not acquire it for 60 (default ManagerElectionDelayAfterQuorumLoss * 2) seconds
+			} else {
 				app.logger.Warnf("Quorum loss ongoing (%0.2fs): manager release lock", lostQuorumDuration.Seconds())
 				app.dcs.ReleaseLock(pathManagerLock)
 				return stateCandidate, false
@@ -794,29 +794,27 @@ func (app *App) AcquireLock(path string) bool {
 		return app.dcs.AcquireLock(path)
 	}
 
-	managerElectionDelayAfterQuorumLoss := app.config.ManagerElectionDelayAfterQuorumLoss
-	managerLockAcquireDelayAfterQuorumLoss := app.config.ManagerLockAcquireDelayAfterQuorumLoss
+	electionDelay := app.config.ManagerElectionDelayAfterQuorumLoss
+	lockAcquireDelay := 2 * electionDelay
 	lostQuorumDuration := time.Since(app.lostQuorumTime)
 
-	if lostQuorumDuration < managerElectionDelayAfterQuorumLoss {
+	if lostQuorumDuration < electionDelay {
 		app.logger.Debug("manager try to acquire lock")
 		return app.dcs.AcquireLock(path)
-	} else if lostQuorumDuration <= managerElectionDelayAfterQuorumLoss+managerLockAcquireDelayAfterQuorumLoss {
-		// Manager cant AcquireLock in delay
+	} else if lostQuorumDuration <= electionDelay+lockAcquireDelay {
+		// Manager can't AcquireLock during the cooldown
 		app.logger.Debugf(
 			"Quorum loss ongoing (%0.2fs): manager lock acquisition blocked (%0.2fs/%0.2fs cooldown)",
 			lostQuorumDuration.Seconds(),
-			lostQuorumDuration.Seconds()-managerElectionDelayAfterQuorumLoss.Seconds(),
-			managerLockAcquireDelayAfterQuorumLoss.Seconds(),
+			lostQuorumDuration.Seconds()-electionDelay.Seconds(),
+			lockAcquireDelay.Seconds(),
 		)
 		return false
 	// Manager start to try to AcquireLock
-	} else if lostQuorumDuration > app.config.ManagerElectionDelayAfterQuorumLoss+app.config.ManagerLockAcquireDelayAfterQuorumLoss {
+	} else {
 		app.lostQuorumTime = time.Time{}
 		return app.dcs.AcquireLock(path)
 	}
-
-	return false
 }
 
 func (app *App) approveFailover(clusterState, clusterStateDcs map[string]*nodestate.NodeState, activeNodes []string, master string) error {
diff --git a/internal/config/config.go b/internal/config/config.go
index 49a0eda6..58e63ea5 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -34,6 +34,13 @@ type MySQLConfig struct {
 
 // Config contains all mysync configuration
 type Config struct {
+	// We need to group our timings, as we have too many options.
+	// Ideas:
+	//  - SQL read/write timeout
+	//  - main loop interval
+	//  - etc.
+	SQLModifyTimeout time.Duration `config:"sql_modify_timeout" yaml:"sql_modify_timeout"`
+
 	DevMode bool `config:"dev_mode" yaml:"dev_mode"`
 	SemiSync bool `config:"semi_sync" yaml:"semi_sync"`
 	SemiSyncEnableLag int64 `config:"semi_sync_enable_lag" yaml:"semi_sync_enable_lag"`
@@ -58,16 +65,12 @@ type Config struct {
 	DcsWaitTimeout time.Duration `config:"dcs_wait_timeout" yaml:"dcs_wait_timeout"`
 	DBTimeout time.Duration `config:"db_timeout" yaml:"db_timeout"`
 	DBLostCheckTimeout time.Duration `config:"db_lost_check_timeout" yaml:"db_lost_check_timeout"`
-	DBSetRoTimeout time.Duration `config:"db_set_ro_timeout" yaml:"db_set_ro_timeout"`
-	DBSetRoForceTimeout time.Duration `config:"db_set_ro_force_timeout" yaml:"db_set_ro_force_timeout"`
-	DBStopSlaveSQLThreadTimeout time.Duration `config:"db_stop_slave_sql_thread_timeout" yaml:"db_stop_slave_sql_thread_timeout"`
 	TickInterval time.Duration `config:"tick_interval" yaml:"tick_interval"`
 	HealthCheckInterval time.Duration `config:"healthcheck_interval" yaml:"healthcheck_interval"`
 	InfoFileHandlerInterval time.Duration `config:"info_file_handler_interval" yaml:"info_file_handler_interval"`
 	RecoveryCheckInterval time.Duration `config:"recoverycheck_interval" yaml:"recoverycheck_interval"`
 	ExternalCAFileCheckInterval time.Duration `config:"external_ca_file_check_interval" yaml:"external_ca_file_check_interval"`
 	ManagerElectionDelayAfterQuorumLoss time.Duration `config:"manager_election_delay_after_quorum_loss" yaml:"manager_election_delay_after_quorum_loss"`
-	ManagerLockAcquireDelayAfterQuorumLoss time.Duration `config:"manager_lock_acquire_delay_after_quorum_loss" yaml:"manager_lock_acquire_delay_after_quorum_loss"`
 	MaxAcceptableLag float64 `config:"max_acceptable_lag" yaml:"max_acceptable_lag"`
 	SlaveCatchUpTimeout time.Duration `config:"slave_catch_up_timeout" yaml:"slave_catch_up_timeout"`
 	DisableSemiSyncReplicationOnMaintenance bool `config:"disable_semi_sync_replication_on_maintenance" yaml:"disable_semi_sync_replication_on_maintenance"`
@@ -123,6 +126,7 @@ func DefaultConfig() (Config, error) {
 		return Config{}, err
 	}
 	config := Config{
+		SQLModifyTimeout: 30 * time.Second,
 		DevMode: false,
 		SemiSync: false,
 		SemiSyncEnableLag: 100 * 1024 * 1024, // 100Mb
@@ -156,18 +160,14 @@ func DefaultConfig() (Config, error) {
 		DcsWaitTimeout: 10 * time.Second,
 		DBTimeout: 5 * time.Second,
 		DBLostCheckTimeout: 5 * time.Second,
-		DBSetRoTimeout: 30 * time.Second,
-		DBSetRoForceTimeout: 30 * time.Second,
 		DisableSetReadonlyOnLost: false,
 		ResetupCrashedHosts: false,
-		DBStopSlaveSQLThreadTimeout: 30 * time.Second,
 		TickInterval: 5 * time.Second,
 		HealthCheckInterval: 5 * time.Second,
 		InfoFileHandlerInterval: 30 * time.Second,
 		RecoveryCheckInterval: 5 * time.Second,
 		ExternalCAFileCheckInterval: 5 * time.Second,
-		ManagerElectionDelayAfterQuorumLoss: 30 * time.Second, // need more than 15 sec
-		ManagerLockAcquireDelayAfterQuorumLoss: 45 * time.Second,
+		ManagerElectionDelayAfterQuorumLoss: 30 * time.Second,
 		MaxAcceptableLag: 60.0,
 		SlaveCatchUpTimeout: 30 * time.Minute,
 		DisableSemiSyncReplicationOnMaintenance: true,
diff --git a/internal/mysql/node.go b/internal/mysql/node.go
index aacc581e..402563f8 100644
--- a/internal/mysql/node.go
+++ b/internal/mysql/node.go
@@ -713,7 +713,7 @@ func (n *Node) IsReadOnly() (bool, bool, error) {
 // Setting server read-only may take a while
 // as server waits all running commits (not transactions) to be finished
 func (n *Node) SetReadOnly(superReadOnly bool) error {
-	return n.setReadonlyWithTimeout(superReadOnly, n.config.DBSetRoTimeout)
+	return n.setReadonlyWithTimeout(superReadOnly, n.config.SQLModifyTimeout)
 }
 
 func (n *Node) setReadonlyWithTimeout(superReadOnly bool, timeout time.Duration) error {
@@ -780,7 +780,7 @@ func (n *Node) SetReadOnlyWithForce(excludeUsers []string, superReadOnly bool) e
 	defer func() {
 		quit <- true
 	}()
-	return n.setReadonlyWithTimeout(superReadOnly, n.config.DBSetRoForceTimeout)
+	return n.setReadonlyWithTimeout(superReadOnly, n.config.SQLModifyTimeout)
 }
 
 // SetWritable sets MySQL Node to be writable, eg. disables read-only
@@ -796,7 +796,7 @@ func (n *Node) StopSlave() error {
 	}
 	return n.execMogrifyWithTimeout(q, map[string]any{
 		"channel": n.config.ReplicationChannel,
-	}, n.config.DBStopSlaveSQLThreadTimeout)
+	}, n.config.SQLModifyTimeout)
 }
 
 // StartSlave starts replication (both IO and SQL threads)
@@ -858,7 +858,7 @@ func (n *Node) StopSlaveSQLThread() error {
 	}
 	return n.execMogrifyWithTimeout(q, map[string]any{
 		"channel": n.config.ReplicationChannel,
-	}, n.config.DBStopSlaveSQLThreadTimeout)
+	}, n.config.SQLModifyTimeout)
 }
 
 // StartSlaveSQLThread starts SQL replication thread
diff --git a/tests/images/mysql/mysync.yaml b/tests/images/mysql/mysync.yaml
index 435795c4..2d07e93f 100644
--- a/tests/images/mysql/mysync.yaml
+++ b/tests/images/mysql/mysync.yaml
@@ -37,13 +37,12 @@ mysql:
     error_log: /var/log/mysql/error.log
 queries:
   replication_lag: $MYSYNC_REPLICATION_LAG_QUERY
+sql_modify_timeout: 30s
 disable_semi_sync_replication_on_maintenance: ${MYSYNC_DISABLE_REPLICATION_ON_MAINT:-false}
 rpl_semi_sync_master_wait_for_slave_count: ${MYSYNC_WAIT_FOR_SLAVE_COUNT:-1}
 critical_disk_usage: ${MYSYNC_CRITICAL_DISK_USAGE:-100}
 keep_super_writable_on_critical_disk_usage: ${MYSYNC_KEEP_SUPER_WRITABLE_ON_CRITICAL_DISK_USAGE:-false}
 test_disk_usage_file: /tmp/usedspace
-db_set_ro_force_timeout: 40s
-db_set_ro_timeout: ${MYSYNC_SET_RO_TIMEOUT:-30s}
 offline_mode_enable_lag: ${OFFLINE_MODE_ENABLE_LAG:-10s}
 offline_mode_disable_lag: 5s
 priority_choice_max_lag: ${MYSYNC_PRIORITY_CHOICE_LAG:-60s}
@@ -67,7 +66,6 @@ manager_switchover: ${MANAGER_SWITCHOVER:-true}
 replication_convergence_timeout_switchover: 300s
 manager_election_delay_after_quorum_loss: ${MANAGER_ELECTION_DELAY_AFTER_QUORUM_LOSS:-15s}
 resetup_host_lag: ${RESETUP_HOST_LAG:-30000s}
-manager_lock_acquire_delay_after_quorum_loss: ${MANAGER_LOCK_ACQUIRE_DELAY_AFTER_QUORUM_LOSS:-30s}
 optimization_config:
   high_replication_mark: ${HIGH_REPLICATION_MARK:-60s}
   low_replication_mark: ${LOW_REPLICATION_MARK:-60s}
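
For existing deployments, the patch boils down to a small config migration: three statement-level timeouts and the separate lock-acquire delay are removed, a single sql_modify_timeout now covers setting read-only and stopping replication threads, and the lock-acquire cooldown is derived as twice manager_election_delay_after_quorum_loss. A sketch of the migration, using the defaults removed from DefaultConfig above:

    # removed keys (old DefaultConfig values shown)
    #db_set_ro_timeout: 30s
    #db_set_ro_force_timeout: 30s
    #db_stop_slave_sql_thread_timeout: 30s
    #manager_lock_acquire_delay_after_quorum_loss: 45s

    # added: one timeout for SET read_only / STOP SLAVE [SQL_THREAD]
    sql_modify_timeout: 30s
    # unchanged, but now also sets the lock-acquire cooldown (2x this value)
    manager_election_delay_after_quorum_loss: 30s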
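The AcquireLock change above replaces a second tunable with a fixed 2x multiple of the election delay. A minimal standalone sketch of the resulting timeline (mayAcquireLock and its signature are illustrative, not mysync API; only the thresholds mirror the diff):

    // Sketch: for electionDelay after quorum loss the manager keeps trying to
    // acquire the lock, for the next 2*electionDelay it is blocked, and after
    // that the loss timestamp is reset and acquisition resumes.
    package main

    import (
    	"fmt"
    	"time"
    )

    const electionDelay = 30 * time.Second // manager_election_delay_after_quorum_loss default

    // mayAcquireLock mirrors the branch structure of App.AcquireLock above;
    // lostQuorumAt is the zero time while quorum is healthy.
    func mayAcquireLock(lostQuorumAt *time.Time, now time.Time) bool {
    	if lostQuorumAt.IsZero() {
    		return true
    	}
    	lockAcquireDelay := 2 * electionDelay // replaces the removed manager_lock_acquire_delay_after_quorum_loss
    	elapsed := now.Sub(*lostQuorumAt)
    	switch {
    	case elapsed < electionDelay:
    		return true // still waiting for the network to recover
    	case elapsed <= electionDelay+lockAcquireDelay:
    		return false // cooldown: acquisition blocked
    	default:
    		*lostQuorumAt = time.Time{} // cooldown over: forget the loss
    		return true
    	}
    }

    func main() {
    	lost := time.Now()
    	for _, d := range []time.Duration{10 * time.Second, 45 * time.Second, 2 * time.Minute} {
    		at := lost // fresh copy so the reset inside mayAcquireLock stays local
    		fmt.Printf("%v after loss: may acquire = %v\n", d, mayAcquireLock(&at, lost.Add(d)))
    	}
    }

With the 30s default this prints true at 10s, false at 45s, and true again at 2m, matching the comments introduced in checkQuorum.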