4 changes: 2 additions & 2 deletions README.md
@@ -54,6 +54,8 @@ lockfile: /var/run/mysync/mysync.lock
emergefile: /var/run/mysync/mysync.emerge
resetupfile: /var/run/mysync/mysync.resetup

sql_modify_timeout: 30s

resetup_crashed_hosts: False
db_timeout: 2s
db_lost_check_timeout: 5s
@@ -65,8 +67,6 @@ critical_disk_usage: 95.00
not_critical_disk_usage: 94.76
disable_semi_sync_replication_on_maintenance: true
keep_super_writable_on_critical_disk_usage: true
db_set_ro_timeout: 30s
db_set_ro_force_timeout: 60s
priority_choice_max_lag: 60s
offline_mode_enable_interval: 900s
offline_mode_enable_lag: 86400s
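The removed db_set_ro_timeout / db_set_ro_force_timeout knobs are folded into the new sql_modify_timeout above. The value uses Go duration syntax; a minimal sketch of how such a string parses, assuming the config loader handles durations the same way time.ParseDuration does:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// "30s" is a Go-style duration string, as used for sql_modify_timeout.
	d, err := time.ParseDuration("30s")
	if err != nil {
		panic(err)
	}
	fmt.Println(d) // 30s
}
```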
30 changes: 14 additions & 16 deletions internal/app/app.go
@@ -760,7 +760,7 @@ func (app *App) checkQuorum(clusterStateFromDB, clusterStateDcs map[string]*node
}
}

managerElectionDelayAfterQuorumLoss := app.config.ManagerElectionDelayAfterQuorumLoss
managerElectionDelay := app.config.ManagerElectionDelayAfterQuorumLoss

if workingHANodesCount > 0 && visibleHAHostsCount <= (workingHANodesCount-1)/2 {
app.logger.Infof("manager lost quorum (%d/%d visible HAHosts)", visibleHAHostsCount, workingHANodesCount)
@@ -769,13 +769,13 @@ func (app *App) checkQuorum(clusterStateFromDB, clusterStateDcs map[string]*node
if app.lostQuorumTime.IsZero() {
app.lostQuorumTime = time.Now()
} else {
// Lost quorum less than 15 (default WaitingRecoveryNetworkTimestamp) seconds ago
// Lost quorum less than 30 (default ManagerElectionDelayAfterQuorumLoss) seconds ago
// Just wait for the manager's connection to recover
if lostQuorumDuration <= managerElectionDelayAfterQuorumLoss {
if lostQuorumDuration <= managerElectionDelay {
app.logger.Warnf("Quorum loss ongoing (%0.2fs): manager wait for network recovery", lostQuorumDuration.Seconds())
// Lost quorum more than 15 (default WaitingRecoveryNetworkTimestamp) seconds ago
// Manager should release lock and dont acquire lock for 45 (default ManagerElectionDelayAfterQuorumLoss) seconds
} else if lostQuorumDuration > managerElectionDelayAfterQuorumLoss {
// Lost quorum more than 30 (default ManagerElectionDelayAfterQuorumLoss) seconds ago
// Manager should release the lock and not acquire it for 60 (default ManagerElectionDelayAfterQuorumLoss * 2) seconds
} else {
app.logger.Warnf("Quorum loss ongoing (%0.2fs): manager release lock", lostQuorumDuration.Seconds())
app.dcs.ReleaseLock(pathManagerLock)
return stateCandidate, false
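To make the logic above explicit: the manager treats quorum as lost when visibleHAHostsCount <= (workingHANodesCount-1)/2, and it only releases the DCS lock once the loss has lasted longer than ManagerElectionDelayAfterQuorumLoss. A minimal, self-contained sketch of those two rules (the helper names here are hypothetical, not part of this change):

```go
package main

import (
	"fmt"
	"time"
)

// quorumLost mirrors the check above: quorum is considered lost when
// visibleHAHosts <= (workingHANodes-1)/2.
func quorumLost(visibleHAHosts, workingHANodes int) bool {
	return workingHANodes > 0 && visibleHAHosts <= (workingHANodes-1)/2
}

// shouldReleaseLock: within the election delay the manager just waits for the
// network to recover; once the loss lasts longer, it releases the manager lock.
func shouldReleaseLock(lostSince time.Time, electionDelay time.Duration) bool {
	return !lostSince.IsZero() && time.Since(lostSince) > electionDelay
}

func main() {
	fmt.Println(quorumLost(1, 3)) // true: 1 <= (3-1)/2
	fmt.Println(quorumLost(2, 3)) // false
	lost := time.Now().Add(-40 * time.Second)
	fmt.Println(shouldReleaseLock(lost, 30*time.Second)) // true: 40s > 30s
}
```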
@@ -794,29 +794,27 @@ func (app *App) AcquireLock(path string) bool {
return app.dcs.AcquireLock(path)
}

managerElectionDelayAfterQuorumLoss := app.config.ManagerElectionDelayAfterQuorumLoss
managerLockAcquireDelayAfterQuorumLoss := app.config.ManagerLockAcquireDelayAfterQuorumLoss
electionDelay := app.config.ManagerElectionDelayAfterQuorumLoss
lockAcquireDelay := 2 * electionDelay

lostQuorumDuration := time.Since(app.lostQuorumTime)
if lostQuorumDuration < managerElectionDelayAfterQuorumLoss {
if lostQuorumDuration < electionDelay {
app.logger.Debug("manager try to acquire lock")
return app.dcs.AcquireLock(path)
} else if lostQuorumDuration <= managerElectionDelayAfterQuorumLoss+managerLockAcquireDelayAfterQuorumLoss {
// Manager cant AcquireLock in delay
} else if lostQuorumDuration <= electionDelay+lockAcquireDelay {
// Manager can't AcquireLock in delay
app.logger.Debugf(
"Quorum loss ongoing (%0.2fs): manager lock acquisition blocked (%0.2fs/%0.2fs cooldown)",
lostQuorumDuration.Seconds(),
lostQuorumDuration.Seconds()-managerElectionDelayAfterQuorumLoss.Seconds(),
managerLockAcquireDelayAfterQuorumLoss.Seconds(),
lostQuorumDuration.Seconds()-electionDelay.Seconds(),
lockAcquireDelay.Seconds(),
)
return false
// Manager starts trying to AcquireLock again
} else if lostQuorumDuration > app.config.ManagerElectionDelayAfterQuorumLoss+app.config.ManagerLockAcquireDelayAfterQuorumLoss {
} else {
app.lostQuorumTime = time.Time{}
return app.dcs.AcquireLock(path)
}

return false
}

func (app *App) approveFailover(clusterState, clusterStateDcs map[string]*nodestate.NodeState, activeNodes []string, master string) error {
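The lock-acquire cooldown is now derived as twice the election delay instead of being a separate manager_lock_acquire_delay_after_quorum_loss option. A hypothetical sketch (not part of the PR) of the three phases AcquireLock goes through after quorum loss:

```go
package main

import (
	"fmt"
	"time"
)

// lockPhase mirrors the branches in AcquireLock above.
func lockPhase(lostQuorumFor, electionDelay time.Duration) string {
	lockAcquireDelay := 2 * electionDelay
	switch {
	case lostQuorumFor < electionDelay:
		return "keep trying to acquire the lock"
	case lostQuorumFor <= electionDelay+lockAcquireDelay:
		return "acquisition blocked (cooldown)"
	default:
		return "cooldown over: reset lostQuorumTime and acquire"
	}
}

func main() {
	delay := 30 * time.Second // default manager_election_delay_after_quorum_loss
	for _, t := range []time.Duration{10 * time.Second, 60 * time.Second, 100 * time.Second} {
		fmt.Printf("%v after quorum loss: %s\n", t, lockPhase(t, delay))
	}
}
```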
18 changes: 9 additions & 9 deletions internal/config/config.go
@@ -34,6 +34,13 @@ type MySQLConfig struct {

// Config contains all mysync configuration
type Config struct {
// We need to group our timings, as we have too many options.
// Ideas:
// - SQL read/write timeout
// - main loop interval
// - etc.
SQLModifyTimeout time.Duration `config:"sql_modify_timeout" yaml:"sql_modify_timeout"`

DevMode bool `config:"dev_mode" yaml:"dev_mode"`
SemiSync bool `config:"semi_sync" yaml:"semi_sync"`
SemiSyncEnableLag int64 `config:"semi_sync_enable_lag" yaml:"semi_sync_enable_lag"`
@@ -58,16 +65,12 @@ type Config struct {
DcsWaitTimeout time.Duration `config:"dcs_wait_timeout" yaml:"dcs_wait_timeout"`
DBTimeout time.Duration `config:"db_timeout" yaml:"db_timeout"`
DBLostCheckTimeout time.Duration `config:"db_lost_check_timeout" yaml:"db_lost_check_timeout"`
DBSetRoTimeout time.Duration `config:"db_set_ro_timeout" yaml:"db_set_ro_timeout"`
DBSetRoForceTimeout time.Duration `config:"db_set_ro_force_timeout" yaml:"db_set_ro_force_timeout"`
DBStopSlaveSQLThreadTimeout time.Duration `config:"db_stop_slave_sql_thread_timeout" yaml:"db_stop_slave_sql_thread_timeout"`
TickInterval time.Duration `config:"tick_interval" yaml:"tick_interval"`
HealthCheckInterval time.Duration `config:"healthcheck_interval" yaml:"healthcheck_interval"`
InfoFileHandlerInterval time.Duration `config:"info_file_handler_interval" yaml:"info_file_handler_interval"`
RecoveryCheckInterval time.Duration `config:"recoverycheck_interval" yaml:"recoverycheck_interval"`
ExternalCAFileCheckInterval time.Duration `config:"external_ca_file_check_interval" yaml:"external_ca_file_check_interval"`
ManagerElectionDelayAfterQuorumLoss time.Duration `config:"manager_election_delay_after_quorum_loss" yaml:"manager_election_delay_after_quorum_loss"`
ManagerLockAcquireDelayAfterQuorumLoss time.Duration `config:"manager_lock_acquire_delay_after_quorum_loss" yaml:"manager_lock_acquire_delay_after_quorum_loss"`
MaxAcceptableLag float64 `config:"max_acceptable_lag" yaml:"max_acceptable_lag"`
SlaveCatchUpTimeout time.Duration `config:"slave_catch_up_timeout" yaml:"slave_catch_up_timeout"`
DisableSemiSyncReplicationOnMaintenance bool `config:"disable_semi_sync_replication_on_maintenance" yaml:"disable_semi_sync_replication_on_maintenance"`
@@ -123,6 +126,7 @@ func DefaultConfig() (Config, error) {
return Config{}, err
}
config := Config{
SQLModifyTimeout: 30 * time.Second,
DevMode: false,
SemiSync: false,
SemiSyncEnableLag: 100 * 1024 * 1024, // 100Mb
@@ -156,18 +160,14 @@ func DefaultConfig() (Config, error) {
DcsWaitTimeout: 10 * time.Second,
DBTimeout: 5 * time.Second,
DBLostCheckTimeout: 5 * time.Second,
DBSetRoTimeout: 30 * time.Second,
DBSetRoForceTimeout: 30 * time.Second,
DisableSetReadonlyOnLost: false,
ResetupCrashedHosts: false,
DBStopSlaveSQLThreadTimeout: 30 * time.Second,
TickInterval: 5 * time.Second,
HealthCheckInterval: 5 * time.Second,
InfoFileHandlerInterval: 30 * time.Second,
RecoveryCheckInterval: 5 * time.Second,
ExternalCAFileCheckInterval: 5 * time.Second,
ManagerElectionDelayAfterQuorumLoss: 30 * time.Second, // need more than 15 sec
ManagerLockAcquireDelayAfterQuorumLoss: 45 * time.Second,
ManagerElectionDelayAfterQuorumLoss: 30 * time.Second,
MaxAcceptableLag: 60.0,
SlaveCatchUpTimeout: 30 * time.Minute,
DisableSemiSyncReplicationOnMaintenance: true,
8 changes: 4 additions & 4 deletions internal/mysql/node.go
@@ -713,7 +713,7 @@ func (n *Node) IsReadOnly() (bool, bool, error) {
// Setting server read-only may take a while
// as the server waits for all running commits (not transactions) to finish
func (n *Node) SetReadOnly(superReadOnly bool) error {
return n.setReadonlyWithTimeout(superReadOnly, n.config.DBSetRoTimeout)
return n.setReadonlyWithTimeout(superReadOnly, n.config.SQLModifyTimeout)
}

func (n *Node) setReadonlyWithTimeout(superReadOnly bool, timeout time.Duration) error {
@@ -780,7 +780,7 @@ func (n *Node) SetReadOnlyWithForce(excludeUsers []string, superReadOnly bool) e

defer func() { quit <- true }()

return n.setReadonlyWithTimeout(superReadOnly, n.config.DBSetRoForceTimeout)
return n.setReadonlyWithTimeout(superReadOnly, n.config.SQLModifyTimeout)
}

// SetWritable sets MySQL Node to be writable, eg. disables read-only
@@ -796,7 +796,7 @@ func (n *Node) StopSlave() error {
}
return n.execMogrifyWithTimeout(q, map[string]any{
"channel": n.config.ReplicationChannel,
}, n.config.DBStopSlaveSQLThreadTimeout)
}, n.config.SQLModifyTimeout)
}

// StartSlave starts replication (both IO and SQL threads)
@@ -858,7 +858,7 @@ func (n *Node) StopSlaveSQLThread() error {
}
return n.execMogrifyWithTimeout(q, map[string]any{
"channel": n.config.ReplicationChannel,
}, n.config.DBStopSlaveSQLThreadTimeout)
}, n.config.SQLModifyTimeout)
}

// StartSlaveSQLThread starts SQL replication thread
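With this change SetReadOnly, SetReadOnlyWithForce, StopSlave, and StopSlaveSQLThread all share the single SQLModifyTimeout instead of three separate per-operation timeouts. A minimal sketch of the underlying pattern, assuming the *WithTimeout helpers bound each statement with a context deadline (the struct and function names below are simplified stand-ins, not mysync's actual API):

```go
package main

import (
	"context"
	"database/sql"
	"fmt"
	"time"
)

// Config keeps only the field relevant to this sketch.
type Config struct {
	SQLModifyTimeout time.Duration
}

// execWithModifyTimeout runs one state-modifying statement under the shared
// sql_modify_timeout deadline.
func execWithModifyTimeout(db *sql.DB, cfg Config, query string, args ...any) error {
	ctx, cancel := context.WithTimeout(context.Background(), cfg.SQLModifyTimeout)
	defer cancel()
	_, err := db.ExecContext(ctx, query, args...)
	return err
}

func main() {
	cfg := Config{SQLModifyTimeout: 30 * time.Second}
	fmt.Println("every modify-type statement shares the", cfg.SQLModifyTimeout, "deadline")
	_ = execWithModifyTimeout // a real caller would pass an opened *sql.DB here
}
```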
4 changes: 1 addition & 3 deletions tests/images/mysql/mysync.yaml
@@ -37,13 +37,12 @@ mysql:
error_log: /var/log/mysql/error.log
queries:
replication_lag: $MYSYNC_REPLICATION_LAG_QUERY
sql_modify_timeout: 30s
disable_semi_sync_replication_on_maintenance: ${MYSYNC_DISABLE_REPLICATION_ON_MAINT:-false}
rpl_semi_sync_master_wait_for_slave_count: ${MYSYNC_WAIT_FOR_SLAVE_COUNT:-1}
critical_disk_usage: ${MYSYNC_CRITICAL_DISK_USAGE:-100}
keep_super_writable_on_critical_disk_usage: ${MYSYNC_KEEP_SUPER_WRITABLE_ON_CRITICAL_DISK_USAGE:-false}
test_disk_usage_file: /tmp/usedspace
db_set_ro_force_timeout: 40s
db_set_ro_timeout: ${MYSYNC_SET_RO_TIMEOUT:-30s}
offline_mode_enable_lag: ${OFFLINE_MODE_ENABLE_LAG:-10s}
offline_mode_disable_lag: 5s
priority_choice_max_lag: ${MYSYNC_PRIORITY_CHOICE_LAG:-60s}
@@ -67,7 +66,6 @@ manager_switchover: ${MANAGER_SWITCHOVER:-true}
replication_convergence_timeout_switchover: 300s
manager_election_delay_after_quorum_loss: ${MANAGER_ELECTION_DELAY_AFTER_QUORUM_LOSS:-15s}
resetup_host_lag: ${RESETUP_HOST_LAG:-30000s}
manager_lock_acquire_delay_after_quorum_loss: ${MANAGER_LOCK_ACQUIRE_DELAY_AFTER_QUORUM_LOSS:-30s}
optimization_config:
high_replication_mark: ${HIGH_REPLICATION_MARK:-60s}
low_replication_mark: ${LOW_REPLICATION_MARK:-60s}