概述
開啟redis集群模式後,各個節點之間會定期互相發送ping、pong消息來交換信息
當發送給某個節點的ping超時仍沒有收到pong時,會把該節點標記為pfail
當交換信息後發現認為該節點pfail的主節點達到多數派,則把它標識為fail,並發送廣播
滿足條件的從節點發起failover認證請求
主節點處理認證請求,發送認證響應
從節點收到的來自主節點的認證響應達到多數派後,提升自己為主節點,分配slots,並進行廣播
前置代碼
/* Excerpt of the server entry point: among the elided startup steps it
 * calls initServer(). */
int main(int argc, char **argv) {
...
initServer()
...
}
/* Excerpt of server initialization: registers serverCron as a time event on
 * the server event loop (server.el). */
void initServer(void) {
...
/* Failing to create the timer is treated as fatal: panic, then exit. */
if (aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL) == AE_ERR) {
serverPanic("Can't create event loop timers.");
exit(1);
}
...
}
/* Excerpt of the periodic server timer callback: when cluster mode is
 * enabled it drives clusterCron(). */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
...
/* NOTE(review): 100 is presumably a period in milliseconds -- confirm
 * against the run_with_period definition. */
run_with_period(100) {
if (server.cluster_enabled) clusterCron();
}
...
}
void clusterCron(void) {
...
每迭代10次,我們就隨機選一個節點發送ping,也就是我們通常每秒隨機給一個節點發送ping
if (!(iteration % 10)) {
int j;
隨機抽取幾個節點(下面的循環最多抽取5次),給其中收到pong消息到現在最久的節點發送ping
if (node->link == NULL) {
...
aeCreateFileEvent(server.el,link->fd,AE_READABLE,
clusterReadHandler,link);
...
}
for (j = 0; j < 5; j++) {
de = dictGetRandomKey(server.cluster->nodes);
clusterNode *this = dictGetVal(de);
不要挑選斷開連接的或者存在當前活躍ping的節點
if (this->link == NULL || this->ping_sent != 0) continue;
if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE))
continue;
if (min_pong_node == NULL || min_pong > this->pong_received) {
min_pong_node = this;
min_pong = this->pong_received;
}
}
if (min_pong_node) {
發送ping
clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING);
}
}
while((de = dictNext(di)) != NULL) {
如果收到pong 則 node->ping_sent為0
delay = now - node->ping_sent;
超時標記為pfail
if (delay > server.cluster_node_timeout) {
如果不是fail或者pfail狀態
if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) {
node->flags |= CLUSTER_NODE_PFAIL;
update_state = 1;
}
}
}
如果自己是從節點
if (nodeIsSlave(myself)) {
處理手動failover,滿足條件則設置標識
clusterHandleManualFailover();
if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER))
處理failover
clusterHandleSlaveFailover();
if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves)
clusterHandleSlaveMigration(max_slaves);
}
...
}
/* Check whether a pending manual failover is allowed to start, and if so
 * set server.cluster->mf_can_start.
 *
 * The flag is set only when the master offset recorded in
 * mf_master_offset equals this node's current replication offset, as
 * returned by replicationGetSlaveOffset(). */
void clusterHandleManualFailover(void) {
    /* NOTE(review): mf_end == 0 presumably means no manual failover is in
     * progress -- confirm against where mf_end is set. */
    if (server.cluster->mf_end == 0) return;

    /* Already flagged as ready to start: nothing more to decide. */
    if (server.cluster->mf_can_start) return;

    if (server.cluster->mf_master_offset == 0) return; /* Wait for offset... */

    if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) {
        server.cluster->mf_can_start = 1;
    }
}
gossip相關
void clusterSendPing(clusterLink *link, int type) {
...
把pfail狀態的節點添加到發送信息中
if (pfail_wanted) {
dictIterator *di;
dictEntry *de;
di = dictGetSafeIterator(server.cluster->nodes);
while((de = dictNext(di)) != NULL && pfail_wanted > 0) {
clusterNode *node = dictGetVal(de);
if (node->flags & CLUSTER_NODE_HANDSHAKE) continue;
if (node->flags & CLUSTER_NODE_NOADDR) continue;
if (!(node->flags & CLUSTER_NODE_PFAIL)) continue;
clusterSetGossipEntry(hdr,gossipcount,node);
}
...
clusterSendMessage(link,buf,totlen);
}
/* Excerpt of the read handler for a cluster link: once the number of
 * buffered bytes equals the total length announced in the message header,
 * the packet is handed to clusterProcessPacket(). */
void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
...
/* NOTE(review): the ">= 8" guard presumably ensures enough bytes are
 * buffered to read hdr->totlen -- confirm against the clusterMsg layout. */
if (rcvbuflen >= 8 && rcvbuflen == ntohl(hdr->totlen)) {
if (clusterProcessPacket(link)) {
...
}
/* Excerpt of the cluster message dispatcher: branches on the message type
 * and runs the matching handler. Only the branches relevant to failure
 * detection and failover are shown. */
int clusterProcessPacket(clusterLink *link) {
...
/* PING / PONG / MEET: when the sender is known, process the gossip section
 * carried by the message. */
if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG ||
type == CLUSTERMSG_TYPE_MEET)
{
...
if (sender) clusterProcessGossipSection(hdr,link);
...
}
...
/* A node is requesting a failover vote: clusterSendFailoverAuthIfNeeded()
 * decides whether to reply. */
} else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) {
if (!sender) return 1; /* We don't know that node. */
clusterSendFailoverAuthIfNeeded(sender,hdr);
...
/* A reply to a vote request previously sent by this node. */
} else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) {
if (!sender) return 1; /* We don't know that node. */
/* We consider this vote only if the sender is a master serving
 * a non zero number of slots, and its currentEpoch is greater or
 * equal to epoch where this node started the election. */
if (nodeIsMaster(sender) && sender->numslots > 0 &&
senderCurrentEpoch >= server.cluster->failover_auth_epoch)
{
server.cluster->failover_auth_count++;
/* Maybe we reached a quorum here, set a flag to make sure
 * we check ASAP. */
clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
}
/* FAIL broadcast: look up the node named in the message; the guarded body
 * (elided here) is skipped when that node is already flagged FAIL or is
 * this node itself. */
} else if (type == CLUSTERMSG_TYPE_FAIL) {
...
failing = clusterLookupNode(hdr->data.fail.about.nodename);
if (failing &&
!(failing->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_MYSELF)))
{
...
}
...
}
/* Merge the given flag bits into server.cluster->todo_before_sleep.
 * Bits that are already set stay set; nothing is cleared here.
 *
 * NOTE(review): judging by the name, the accumulated flags are presumably
 * consumed before the event loop sleeps -- the consumer is not shown here. */
void clusterDoBeforeSleep(int flags) {
    server.cluster->todo_before_sleep =
        server.cluster->todo_before_sleep | flags;
}
void clusterHandleSlaveFailover(void) {
...
條件
1)自身是從節點,並且有主節點
2)主節點是fail狀態,或者是手動failover
3)沒有配置no failover,或者是手動failover
4)主節點服務了slots
if (nodeIsMaster(myself) ||
myself->slaveof == NULL ||
(!nodeFailed(myself->slaveof) && !manual_failover) ||
(server.cluster_slave_no_failover && !manual_failover) ||
myself->slaveof->numslots == 0)
{
/* There are no reasons to failover, so we set the reason why we
* are returning without failing over to NONE. */
server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
return;
}
...
檢查我們的數據是否夠新
if (server.cluster_slave_validity_factor &&
data_age >
(((mstime_t)server.repl_ping_slave_period * 1000) +
(server.cluster_node_timeout * server.cluster_slave_validity_factor)))
{
if (!manual_failover) {
clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DATA_AGE);
return;
}
}
...
...
/* If the previous failover attempt timedout and the retry time has
* elapsed, we can setup a new one. */
if (auth_age > auth_retry_time) {
...
發送failover認證的請求
if (server.cluster->failover_auth_sent == 0) {
server.cluster->currentEpoch++;
server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
clusterRequestFailoverAuth();
server.cluster->failover_auth_sent = 1;
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
CLUSTER_TODO_UPDATE_STATE|
CLUSTER_TODO_FSYNC_CONFIG);
return; /* Wait for replies. */
}
檢查認證是否達到多數派
if (server.cluster->failover_auth_count >= needed_quorum) {
...
clusterFailoverReplaceYourMaster();
...
}
...
}
void clusterRequestFailoverAuth(void) {
unsigned char buf[sizeof(clusterMsg)];
clusterMsg *hdr = (clusterMsg*) buf;
uint32_t totlen;
clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST);
if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK;
totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
hdr->totlen = htonl(totlen);
clusterBroadcastMessage(buf,totlen);
}
void clusterFailoverReplaceYourMaster(void) {
...
把自己提升為master
clusterSetNodeAsMaster(myself);
replicationUnsetMaster();
...
發送廣播
clusterBroadcastPong(CLUSTER_BROADCAST_ALL);
}
void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
...
如果自身是從節點或者沒有服務slots則不處理
if (nodeIsSlave(myself) || myself->numslots == 0) return;
...
如果請求節點是主節點,或者請求節點沒有主節點,或者它的主節點不是fail狀態且不是強制ack,則不投票(分支內容此處已省略)
if (nodeIsMaster(node) || master == NULL ||
(!nodeFailed(master) && !force_ack))
{
}
...
發送failover認證響應
clusterSendFailoverAuth(node);
}
/* Send a FAILOVER_AUTH_ACK message to `node` over its link.
 *
 * node: the node to reply to. Nothing is sent when it has no link. */
void clusterSendFailoverAuth(clusterNode *node) {
    /* Declared as a clusterMsg array, not a raw byte array, so the storage
     * is correctly aligned for access through `hdr`. */
    clusterMsg buf[1];
    clusterMsg *hdr = buf;
    uint32_t totlen;

    if (!node->link) return;

    clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK);
    /* Header-only message: the length excludes the payload union. */
    totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
    hdr->totlen = htonl(totlen);
    clusterSendMessage(node->link,(unsigned char*)buf,totlen);
}
void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
node = clusterLookupNode(g->nodename);
if (node) {
如果發送者是主節點,並且gossip中提到的節點不是自身
if (sender && nodeIsMaster(sender) && node != myself) {
如果是fail或者pfail狀態
if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) {
檢查是否需要標識此節點為fail
markNodeAsFailingIfNeeded(node);
} else {
if (clusterNodeDelFailureReport(node,sender)) {
serverLog(LL_VERBOSE,
"Node %.40s reported node %.40s is back online.",
sender->name, node->name);
}
}
}
}
void markNodeAsFailingIfNeeded(clusterNode *node) {
檢查我們自身是否可達這個節點
if (!nodeTimedOut(node)) return;
如果這個節點已經是fail狀態則直接返回
if (nodeFailed(node)) return;
...
自身是master才會進行增加
if (nodeIsMaster(myself)) failures++;
無論自身是主節點還是從節點,都會判斷報告pfail或者fail的數量是否達到多數派
if (failures < needed_quorum) return;
...
如果自身是主節點,則發送此節點fail的廣播
if (nodeIsMaster(myself)) clusterSendFail(node->name);
...
}
void clusterSendFail(char *nodename) {
unsigned char buf[sizeof(clusterMsg)];
clusterMsg *hdr = (clusterMsg*) buf;
clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAIL);
memcpy(hdr->data.fail.about.nodename,nodename,CLUSTER_NAMELEN);
clusterBroadcastMessage(buf,ntohl(hdr->totlen));
}