redis cluster 源碼分析

概述

開啟redis集群模式後,各個節點之間會定期互相發送ping、pong來交換信息
當發送給某個節點的ping超時仍沒有收到pong時,會把該節點標記為pfail
當交換信息後發現該節點的pfail/fail報告達到多數派,則把它標記為fail,並發送廣播
滿足條件的從節點發起failover認證請求
主節點處理認證請求,發送認證響應
從節點只統計來自主節點的認證響應,票數達到多數派後,則提升自己為主節點,分配slots,並進行廣播

前置代碼

int main(int argc, char **argv) {
...
initServer()
...
}
void initServer(void) {
...
    if (aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL) == AE_ERR) {
        serverPanic("Can't create event loop timers.");
        exit(1);
    }
...
}
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
...    
run_with_period(100) {
        if (server.cluster_enabled) clusterCron();
    }
...
}

void clusterCron(void) {
...
每迭代10次,我們就隨機選一個節點發送ping,也就是我們通常每秒隨機給一個節點發送ping
if (!(iteration % 10)) {
        int j;
         隨機抽取5個節點,向其中距離上次收到pong時間最久的節點發送ping


        if (node->link == NULL) {
...
      aeCreateFileEvent(server.el,link->fd,AE_READABLE,
                    clusterReadHandler,link);
...
}
        for (j = 0; j < 5; j++) {
            de = dictGetRandomKey(server.cluster->nodes);
            clusterNode *this = dictGetVal(de);
            不要挑選斷開連接的或者存在當前活躍ping的節點
      
            if (this->link == NULL || this->ping_sent != 0) continue;
            if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE))
                continue;
            if (min_pong_node == NULL || min_pong > this->pong_received) {
                min_pong_node = this;
                min_pong = this->pong_received;
            }
        }
        if (min_pong_node) {
發送ping
            clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING);
        }
    }



    while((de = dictNext(di)) != NULL) {
如果收到pong 則 node->ping_sent為0
        delay = now - node->ping_sent;
超時標記為pfail
        if (delay > server.cluster_node_timeout) {
      如果不是fail或者pfail狀態
            if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) {
                node->flags |= CLUSTER_NODE_PFAIL;
                update_state = 1;
            }
        }
}
如果自己是從節點
   if (nodeIsSlave(myself)) {
        處理手動failover,滿足條件則設置標識
        clusterHandleManualFailover();
        if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER))
處理failover
            clusterHandleSlaveFailover();
     
        if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves)
            clusterHandleSlaveMigration(max_slaves);
    }
...
}

/* Decide whether a pending manual failover may start on this node.
 *
 * Sets server.cluster->mf_can_start to 1 once the local replication offset
 * returned by replicationGetSlaveOffset() equals the recorded
 * server.cluster->mf_master_offset. Returns without doing anything when:
 *   - mf_end is 0 (presumably: no manual failover in progress),
 *   - mf_can_start is already non-zero,
 *   - mf_master_offset is still 0 (offset not received yet). */
void clusterHandleManualFailover(void) {
    if (server.cluster->mf_end == 0) return;

    /* Already flagged as startable: nothing more to do here. */
    if (server.cluster->mf_can_start) return;

    if (server.cluster->mf_master_offset == 0) return; /* Wait for offset... */

    if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) {
        /* Local offset matches the recorded master offset: allow the
         * failover to start. */
        server.cluster->mf_can_start = 1;
    }
}

gossip相關

void clusterSendPing(clusterLink *link, int type) {
...
把pfail狀態的節點添加到發送信息中
 if (pfail_wanted) {
        dictIterator *di;
        dictEntry *de;

        di = dictGetSafeIterator(server.cluster->nodes);
        while((de = dictNext(di)) != NULL && pfail_wanted > 0) {
            clusterNode *node = dictGetVal(de);
            if (node->flags & CLUSTER_NODE_HANDSHAKE) continue;
            if (node->flags & CLUSTER_NODE_NOADDR) continue;
            if (!(node->flags & CLUSTER_NODE_PFAIL)) continue;
            clusterSetGossipEntry(hdr,gossipcount,node);

    }
...
    clusterSendMessage(link,buf,totlen);


}

void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
...
if (rcvbuflen >= 8 && rcvbuflen == ntohl(hdr->totlen)) {
            if (clusterProcessPacket(link)) {
...

}
int clusterProcessPacket(clusterLink *link) {

...
    if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG ||
        type == CLUSTERMSG_TYPE_MEET)
    {
...
        if (sender) clusterProcessGossipSection(hdr,link);

...
}
...
    } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) {
if (!sender) return 1;  /* We don't know that node. */
        clusterSendFailoverAuthIfNeeded(sender,hdr);
...
} else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) {
        if (!sender) return 1;  /* We don't know that node. */
        /* We consider this vote only if the sender is a master serving
         * a non zero number of slots, and its currentEpoch is greater or
         * equal to epoch where this node started the election. */
        if (nodeIsMaster(sender) && sender->numslots > 0 &&
            senderCurrentEpoch >= server.cluster->failover_auth_epoch)
        {
            server.cluster->failover_auth_count++;
            /* Maybe we reached a quorum here, set a flag to make sure
             * we check ASAP. */
            clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
        }
    } else if (type == CLUSTERMSG_TYPE_FAIL) {
...
       failing = clusterLookupNode(hdr->data.fail.about.nodename);
            if (failing &&
                !(failing->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_MYSELF)))
            {
...
}
...
}

/* Merge the bits in `flags` into server.cluster->todo_before_sleep.
 * Bits that are already set are left untouched. */
void clusterDoBeforeSleep(int flags) {
    server.cluster->todo_before_sleep =
        server.cluster->todo_before_sleep | flags;
}

void clusterHandleSlaveFailover(void) {

...
條件(必須全部滿足才會繼續failover,否則直接返回)
1) 自身是從節點,並且有主節點
2) 主節點是fail狀態,或者是手動failover
3) 沒有設置no failover配置,或者是手動failover
4) 主節點服務了slots
 if (nodeIsMaster(myself) ||
        myself->slaveof == NULL ||
        (!nodeFailed(myself->slaveof) && !manual_failover) ||
        (server.cluster_slave_no_failover && !manual_failover) ||
        myself->slaveof->numslots == 0)
    {
        /* There are no reasons to failover, so we set the reason why we
         * are returning without failing over to NONE. */
        server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
        return;
    }
...

  檢查我們的數據是否夠新
    if (server.cluster_slave_validity_factor &&
        data_age >
        (((mstime_t)server.repl_ping_slave_period * 1000) +
         (server.cluster_node_timeout * server.cluster_slave_validity_factor)))
    {
        if (!manual_failover) {
            clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DATA_AGE);
            return;
        }
    }

...

...
 /* If the previous failover attempt timedout and the retry time has
     * elapsed, we can setup a new one. */
    if (auth_age > auth_retry_time) {
...
發送failover認證的請求
if (server.cluster->failover_auth_sent == 0) {
        server.cluster->currentEpoch++;
        server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
  
        clusterRequestFailoverAuth();
        server.cluster->failover_auth_sent = 1;
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_UPDATE_STATE|
                             CLUSTER_TODO_FSYNC_CONFIG);
        return; /* Wait for replies. */
    }

檢查認證是否達到多數派
    if (server.cluster->failover_auth_count >= needed_quorum) {
...
        clusterFailoverReplaceYourMaster();
...
}
...
}

void clusterRequestFailoverAuth(void) {
    unsigned char buf[sizeof(clusterMsg)];
    clusterMsg *hdr = (clusterMsg*) buf;
    uint32_t totlen;

    clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST);
  
    if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK;
    totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
    hdr->totlen = htonl(totlen);
    clusterBroadcastMessage(buf,totlen);
}

void clusterFailoverReplaceYourMaster(void) {

...
把自己提升為master
    clusterSetNodeAsMaster(myself);
    replicationUnsetMaster();
...
發送廣播
    clusterBroadcastPong(CLUSTER_BROADCAST_ALL);

}

void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
...
如果自身是從節點或者沒有服務slots則不處理
    if (nodeIsSlave(myself) || myself->numslots == 0) return;
...
如果請求節點是主節點,或者它沒有主節點,或者它的主節點不是fail狀態並且請求沒有帶強制ack標識(force_ack)
if (nodeIsMaster(node) || master == NULL ||
        (!nodeFailed(master) && !force_ack))
    {
}
...
發送failover認證響應
    clusterSendFailoverAuth(node);

}

/* Send a CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK message to `node`.
 *
 * node: destination; nothing is sent when node->link is NULL.
 *
 * The packet is header-only: its length is sizeof(clusterMsg) minus the
 * size of the clusterMsgData union, stored in network byte order in
 * hdr->totlen. */
void clusterSendFailoverAuth(clusterNode *node) {
    /* Declared as a clusterMsg (not a raw byte array) so the storage is
     * suitably aligned for access through a clusterMsg pointer. */
    clusterMsg buf[1];
    clusterMsg *hdr = buf;
    uint32_t totlen;

    if (!node->link) return;
    clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK);
    totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
    hdr->totlen = htonl(totlen);
    clusterSendMessage(node->link,(unsigned char*)hdr,totlen);
}


void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
node = clusterLookupNode(g->nodename);
        if (node) {
            如果發送者(sender)是主節點,並且gossip中提到的節點不是自身
            if (sender && nodeIsMaster(sender) && node != myself) {
如果是fail或者pfail狀態
                if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) {
                   檢查是否需要標識此節點為fail
                    markNodeAsFailingIfNeeded(node);
                } else {
                    if (clusterNodeDelFailureReport(node,sender)) {
                        serverLog(LL_VERBOSE,
                            "Node %.40s reported node %.40s is back online.",
                            sender->name, node->name);
                    }
                }
            }
}

void markNodeAsFailingIfNeeded(clusterNode *node) {
如果我們自身仍然可達這個節點(沒有超時),則直接返回
    if (!nodeTimedOut(node)) return;
如果這個節點已經是fail狀態,則直接返回
    if (nodeFailed(node)) return;
...
自身是主節點時,才會把自己的一票計入failures
    if (nodeIsMaster(myself)) failures++;
無論自身是主節點還是從節點,都會判斷失敗報告數是否達到多數派,未達到則返回
    if (failures < needed_quorum) return; 
...
如果自身是主節點,則發送此節點fail的廣播
    if (nodeIsMaster(myself)) clusterSendFail(node->name);
...
}

void clusterSendFail(char *nodename) {
    unsigned char buf[sizeof(clusterMsg)];
    clusterMsg *hdr = (clusterMsg*) buf;

    clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAIL);
    memcpy(hdr->data.fail.about.nodename,nodename,CLUSTER_NAMELEN);
    clusterBroadcastMessage(buf,ntohl(hdr->totlen));
}
©著作權歸作者所有,轉載或內容合作請聯繫作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

推薦閱讀更多精彩內容