Overview
The following uses the v1.14.4 code as an example to explain how Cilium, when deployed in chaining mode, creates the related resources.
TL;DR
In chaining mode, Cilium is not responsible for the container networking operations themselves; it can be thought of as only handling the creation of the Endpoint, CiliumEndpoint, Identity, and Policy resources.
cmdAdd // plugins/cilium-cni/main.go
|-loadNetConf(args.StdinData) // plugins/cilium-cni/main.go
|-parsePrevResult(plugin) // plugins/cilium-cni/types/types.go
|-chainAction.Add
|-GenericVethChainer.Add // plugins/cilium-cni/chaining/generic-veth/generic-veth.go
|-ns.GetNS(pluginCtx.Args.Netns) // plugins/cilium-cni/chaining/generic-veth/generic-veth.go
|-ep := &models.EndpointChangeRequest // plugins/cilium-cni/chaining/generic-veth/generic-veth.go
|-EndpointCreate(ep) // plugins/cilium-cni/chaining/generic-veth/generic-veth.go
|-EndpointCreate // pkg/client/endpoint.go
|-PutEndpointID // api/v1/client/endpoint/endpoint_client.go
/\
||
\/
ServeHTTP // api/v1/server/restapi/endpoint/put_endpoint_id.go
|-Handler.Handle() // api/v1/server/restapi/endpoint/put_endpoint_id.go
|- Handle() // daemon/cmd/endpoint.go
|-createEndpoint // daemon/cmd/endpoint.go
|-NewEndpointFromChangeModel // pkg/endpoint/api.go
|-endpointmanager.AddEndpoint // daemon/cmd/endpoint.go
| |-Expose // pkg/endpointmanager/manager.go
| |-AllocateID // pkg/endpoint/manager.go
| |-RunK8sCiliumEndpointSync(e) // pkg/k8s/watchers/endpointsynchronizer.go
|-ep.UpdateLabels // pkg/endpoint/endpoint.go
| |-replaceInformationLabels // pkg/endpoint/endpoint.go
| |-ReplaceIdentityLabels // pkg/endpoint/endpoint.go
| |-RunIdentityResolver // pkg/endpoint/endpoint.go
| |-identityLabelsChanged // pkg/endpoint/endpoint.go
| |-AllocateIdentity // pkg/identity/cache/allocator.go
| |-forcePolicyComputation // pkg/endpoint/endpoint.go
| |-SetIdentity // pkg/identity/cache/allocator.go
| |-runIPIdentitySync // pkg/endpoint/policy.go
| |-UpsertIPToKVStore // pkg/ipcache/kvstore.go
|-Regenerate // pkg/endpoint/policy.go
|-regenerate // pkg/endpoint/policy.go
|-regenerateBPF // pkg/endpoint/bpf.go
|-runPreCompilationSteps
| |-regeneratePolicy
| |-writeHeaderfile
|-realizeBPFState
|-CompileAndLoad // pkg/datapath/loader/loader.go
|-compileAndLoad // pkg/datapath/loader/loader.go
|-compileDatapath // pkg/datapath/loader/loader.go
| |-compile // pkg/datapath/loader/compile.go
| |-compileAndLink // pkg/datapath/loader/compile.go
|-reloadDatapath // pkg/datapath/loader/loader.go
|-replaceDatapath // pkg/datapath/loader/netlink.go
loadNetConf()
The file it reads is shown below. Relying on CNI's chained invocation, the kubelet first calls contivk8s.bin and then cilium-cni; the subsequent flow for creating resources such as the Endpoint is similar to that of a non-chained CNI.
{
    "name": "generic-veth",
    "cniVersion": "0.1.0",
    "plugins": [
        {
            "type": "contivk8s.bin"
        },
        {
            "type": "cilium-cni"
        }
    ]
}
Inspect the interface information:
[root@ns-k8s-noah-staging001-node-s0092 ~]# ethtool -i vvport16894
driver: veth
version: 1.0
firmware-version:
expansion-rom-version:
bus-info:
supports-statistics: yes
supports-test: no
supports-eeprom-access: no
supports-register-dump: no
supports-priv-flags: no
Code implementation
A chaining plugin needs to implement three methods.
type ChainingPlugin interface {
	// Add is called on CNI ADD. It is given the plugin context from the
	// previous plugin. It must return a CNI result or an error.
	Add(ctx context.Context, pluginContext PluginContext, client *client.Client) (res *cniTypesVer.Result, err error)

	// Delete is called on CNI DELETE. It is given the plugin context from
	// the previous plugin.
	Delete(ctx context.Context, pluginContext PluginContext, delClient *lib.DeletionFallbackClient) (err error)

	// Check is called on CNI CHECK. The plugin should verify (to the best
	// of its ability) that everything is reasonably configured, else
	// return error.
	Check(ctx context.Context, pluginContext PluginContext, client *client.Client) error
}
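For reference, generic-veth registers itself with the chaining registry from an init function. A minimal skeleton of a custom chainer might look like the sketch below; the Register call mirrors what generic-veth does, but treat the exact import paths and signatures as assumptions against your Cilium version, and the plugin itself is hypothetical.

package mychainer

import (
	"context"

	cniTypesVer "github.com/containernetworking/cni/pkg/types/100"

	"github.com/cilium/cilium/pkg/client"
	chainingapi "github.com/cilium/cilium/plugins/cilium-cni/chaining/api"
	"github.com/cilium/cilium/plugins/cilium-cni/lib"
)

// MyChainer is a hypothetical chaining plugin skeleton.
type MyChainer struct{}

func (c *MyChainer) Add(ctx context.Context, pluginCtx chainingapi.PluginContext, cli *client.Client) (*cniTypesVer.Result, error) {
	// Inspect pluginCtx.NetConf.PrevResult, create the endpoint via cli,
	// then hand the previous result through unchanged.
	return cniTypesVer.NewResultFromResult(pluginCtx.NetConf.PrevResult)
}

func (c *MyChainer) Delete(ctx context.Context, pluginCtx chainingapi.PluginContext, delClient *lib.DeletionFallbackClient) error {
	return nil
}

func (c *MyChainer) Check(ctx context.Context, pluginCtx chainingapi.PluginContext, cli *client.Client) error {
	return nil
}

func init() {
	// generic-veth registers itself the same way, under the name "generic-veth".
	chainingapi.Register("mychainer", &MyChainer{})
}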
cmdAdd
func cmdAdd(args *skel.CmdArgs) (err error) {
	n, err := types.LoadNetConf(args.StdinData)
	if err != nil {
		return fmt.Errorf("unable to parse CNI configuration \"%s\": %s", args.StdinData, err)
	}

	if err = setupLogging(n); err != nil {
		return fmt.Errorf("unable to setup logging: %w", err)
	}

	logger := log.WithField("eventUUID", uuid.New())

	if n.EnableDebug {
		if err := gops.Listen(gops.Options{}); err != nil {
			log.WithError(err).Warn("Unable to start gops")
		} else {
			defer gops.Close()
		}
	}
	logger.Debugf("Processing CNI ADD request %#v", args)

	logger.Debugf("CNI NetConf: %#v", n)
	if n.PrevResult != nil {
		logger.Debugf("CNI Previous result: %#v", n.PrevResult)
	}

	cniArgs := types.ArgsSpec{}
	if err = cniTypes.LoadArgs(args.Args, &cniArgs); err != nil {
		return fmt.Errorf("unable to extract CNI arguments: %s", err)
	}
	logger.Debugf("CNI Args: %#v", cniArgs)

	c, err := client.NewDefaultClientWithTimeout(defaults.ClientConnectTimeout)
	if err != nil {
		return fmt.Errorf("unable to connect to Cilium daemon: %s", client.Hint(err))
	}

	// If CNI ADD gives us a PrevResult, we're a chained plugin and *must* detect a
	// valid chained mode. If no chained mode we understand is specified, error out.
	// Otherwise, continue with normal plugin execution.
	if len(n.NetConf.RawPrevResult) != 0 {
		if chainAction, err := getChainedAction(n, logger); chainAction != nil {
			var (
				res *cniTypesV1.Result
				ctx = chainingapi.PluginContext{
					Logger:  logger,
					Args:    args,
					CniArgs: cniArgs,
					NetConf: n,
				}
			)

			res, err = chainAction.Add(context.TODO(), ctx, c)
			if err != nil {
				logger.WithError(err).Warn("Chained ADD failed")
				return err
			}
			logger.Debugf("Returning result %#v", res)
			return cniTypes.PrintResult(res, n.CNIVersion)
		} else if err != nil {
			logger.WithError(err).Error("Invalid chaining mode")
			return err
		} else {
			// no chained action supplied; this is an error
			logger.Error("CNI PrevResult supplied, but not in chaining mode -- this is invalid, please set chaining-mode in CNI configuration")
			return fmt.Errorf("CNI PrevResult supplied, but not in chaining mode -- this is invalid, please set chaining-mode in CNI configuration")
		}
	}

	// Non-chained CNI plugin path; not our focus here.
	// ...
}
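getChainedAction resolves which chainer to run: it can be selected explicitly through a chaining-mode key in the cilium-cni configuration (which is what the error message above asks for), or implicitly when the network name matches a registered chainer, which is why the earlier conflist is named generic-veth. A hypothetical conflist selecting the mode explicitly might look like the following; the placement of the chaining-mode key is an assumption, so verify it against your Cilium version.

{
    "name": "contiv-chained",
    "cniVersion": "0.1.0",
    "plugins": [
        {
            "type": "contivk8s.bin"
        },
        {
            "type": "cilium-cni",
            "chaining-mode": "generic-veth"
        }
    ]
}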
GenericVethChainer.Add
func (f *GenericVethChainer) Add(ctx context.Context, pluginCtx chainingapi.PluginContext, cli *client.Client) (res *cniTypesVer.Result, err error) {
	err = cniVersion.ParsePrevResult(&pluginCtx.NetConf.NetConf)
	if err != nil {
		err = fmt.Errorf("unable to understand network config: %s", err)
		return
	}

	var prevRes *cniTypesVer.Result
	prevRes, err = cniTypesVer.NewResultFromResult(pluginCtx.NetConf.PrevResult)
	if err != nil {
		err = fmt.Errorf("unable to get previous network result: %s", err)
		return
	}

	defer func() {
		if err != nil {
			pluginCtx.Logger.WithError(err).
				WithFields(logrus.Fields{"cni-pre-result": pluginCtx.NetConf.PrevResult}).
				Errorf("Unable to create endpoint")
		}
	}()

	var (
		hostMac, vethHostName, vethLXCMac, vethIP, vethIPv6 string
		vethHostIdx, peerIndex                              int
		peer                                                netlink.Link
		netNs                                               ns.NetNS
	)

	netNs, err = ns.GetNS(pluginCtx.Args.Netns)
	if err != nil {
		err = fmt.Errorf("failed to open netns %q: %s", pluginCtx.Args.Netns, err)
		return
	}
	defer netNs.Close()

	if err = netNs.Do(func(_ ns.NetNS) error {
		links, err := netlink.LinkList()
		if err != nil {
			return err
		}

		for _, link := range links {
			pluginCtx.Logger.Debugf("Found interface in container %+v", link.Attrs())

			// Only veth interfaces are handled.
			if link.Type() != "veth" {
				continue
			}

			// All of this information was set up by the previous plugin
			// (cf. prevResult).
			vethLXCMac = link.Attrs().HardwareAddr.String()

			veth, ok := link.(*netlink.Veth)
			if !ok {
				return fmt.Errorf("link %s is not a veth interface", vethHostName)
			}

			peerIndex, err = netlink.VethPeerIndex(veth)
			if err != nil {
				return fmt.Errorf("unable to retrieve index of veth peer %s: %s", vethHostName, err)
			}

			addrs, err := netlink.AddrList(link, netlink.FAMILY_V4)
			if err == nil && len(addrs) > 0 {
				vethIP = addrs[0].IPNet.IP.String()
			} else if err != nil {
				pluginCtx.Logger.WithError(err).WithFields(logrus.Fields{
					logfields.Interface: link.Attrs().Name}).Warn("No valid IPv4 address found")
			}

			addrsv6, err := netlink.AddrList(link, netlink.FAMILY_V6)
			if err == nil && len(addrsv6) > 0 {
				vethIPv6 = addrsv6[0].IPNet.IP.String()
			} else if err != nil {
				pluginCtx.Logger.WithError(err).WithFields(logrus.Fields{
					logfields.Interface: link.Attrs().Name}).Warn("No valid IPv6 address found")
			}

			return nil
		}

		return fmt.Errorf("no link found inside container")
	}); err != nil {
		return
	}

	peer, err = netlink.LinkByIndex(peerIndex)
	if err != nil {
		err = fmt.Errorf("unable to lookup link %d: %s", peerIndex, err)
		return
	}

	hostMac = peer.Attrs().HardwareAddr.String()
	vethHostName = peer.Attrs().Name
	vethHostIdx = peer.Attrs().Index

	switch {
	case vethHostName == "":
		err = errors.New("unable to determine name of veth pair on the host side")
		return
	case vethLXCMac == "":
		err = errors.New("unable to determine MAC address of veth pair on the container side")
		return
	case vethIP == "" && vethIPv6 == "":
		err = errors.New("unable to determine IP address of the container")
		return
	case vethHostIdx == 0:
		err = errors.New("unable to determine index interface of veth pair on the host side")
		return
	}

	var disabled = false
	ep := &models.EndpointChangeRequest{
		Addressing: &models.AddressPair{
			IPV4: vethIP,
			IPV6: vethIPv6,
		},
		ContainerID:       pluginCtx.Args.ContainerID,
		State:             models.EndpointStateWaitingDashForDashIdentity.Pointer(),
		HostMac:           hostMac,
		InterfaceIndex:    int64(vethHostIdx),
		Mac:               vethLXCMac,
		InterfaceName:     vethHostName,
		K8sPodName:        string(pluginCtx.CniArgs.K8S_POD_NAME),
		K8sNamespace:      string(pluginCtx.CniArgs.K8S_POD_NAMESPACE),
		SyncBuildEndpoint: true,
		DatapathConfiguration: &models.EndpointDatapathConfiguration{
			// aws-cni requires ARP passthrough between Linux and the pod
			RequireArpPassthrough: true,

			// The route is pointing directly into the veth of the pod,
			// install a host-facing egress program to implement ingress
			// policy and to provide reverse NAT
			RequireEgressProg: true,

			// The IP is managed by the aws-cni plugin, no need for Cilium
			// to manage any aspect of addressing
			ExternalIpam: true,

			// All routing is performed by the Linux stack
			RequireRouting: &disabled,
		},
	}

	err = cli.EndpointCreate(ep)
	if err != nil {
		pluginCtx.Logger.WithError(err).WithFields(logrus.Fields{
			logfields.ContainerID: ep.ContainerID}).Warn("Unable to create endpoint")
		err = fmt.Errorf("unable to create endpoint: %s", err)
		return
	}

	pluginCtx.Logger.WithFields(logrus.Fields{
		logfields.ContainerID: ep.ContainerID}).Debug("Endpoint successfully created")

	res = prevRes

	return
}
About compile
cilium-1.14.4/pkg/datapath/loader/compile.go
Run the following command inside a cilium-agent container.
root@ns-k8s-noah-staging001-node-s0093:/home/cilium# clang --version
clang version 10.0.0 (https://github.com/llvm/llvm-project.git 0598a534371d5fd6debd129b1378b39b923b9787)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/local/bin
From the logs, the actual compile command Cilium runs looks like this.
clang -emit-llvm -g -O2 --target=bpf -std=gnu89 -nostdinc -D__NR_CPUS__=24 -Wall -Wextra -Werror -Wshadow -Wno-address-of-packed-member -Wno-unknown-warning-option -Wno-gnu-variable-sized-type-not-at-end -Wdeclaration-after-statement -Wimplicit-int-conversion -Wenum-conversion -I/var/run/cilium/state/globals -I/var/run/cilium/state/templates/46b04c5af0a1cda6e6efde8d345f9cfee515c727b888a49f38393b6cb0c84e2f -I/var/lib/cilium/bpf -I/var/lib/cilium/bpf/include -c /var/lib/cilium/bpf/bpf_host.c -o -
This command uses the clang compiler to build a C source file named bpf_host.c, located under /var/lib/cilium/bpf. It is compiling an eBPF program; such programs are typically used inside the Linux kernel for tasks like packet processing, monitoring, and security-policy enforcement. Each option and argument has a specific purpose:
- -emit-llvm: instructs clang to emit LLVM Intermediate Representation (IR) instead of machine code. LLVM IR is a low-level, richly-typed assembly-like language that carries the information the LLVM optimization and code-generation toolchain needs
- -g: include debug information in the output, which helps later debugging
- -O2: enable optimization level 2, the compiler's standard optimization level, which tries to improve performance without spending too much compile time
- --target=bpf: set the target architecture to BPF; this is required when compiling eBPF programs, since they execute inside the kernel
- -std=gnu89: compile the C code against the GNU89 standard, a variant of ANSI C (C89) that includes some GNU extensions
- -nostdinc: do not search the standard system directories for header files; typically used to ensure only the explicitly specified headers are used, rather than anything on the system path
- -D__NR_CPUS__=24: define the macro __NR_CPUS__ with the value 24, likely tied to the number of CPU cores the eBPF program has to handle
- -Wall -Wextra -Werror: enable all (-Wall) and extra (-Wextra) warnings, and treat every warning as an error (-Werror), which helps enforce code quality
- -Wshadow: warn when a local variable shadows another variable
- -Wno-address-of-packed-member: do not warn when taking the address of a member of a packed struct (one declared with __attribute__((packed)))
- -Wno-unknown-warning-option: do not warn about unknown warning options
- -Wno-gnu-variable-sized-type-not-at-end: do not warn about the GNU extension that allows a variable-sized type somewhere other than the end of a struct
- -Wdeclaration-after-statement: warn when a variable is declared after a statement (not allowed in C89)
- -Wimplicit-int-conversion: warn about implicit conversions between integer types
- -Wenum-conversion: warn about conversions from an enum to a different type
- -I: each -I adds an include directory; clang searches these paths for included headers
- -c: compile only, without linking
- /var/lib/cilium/bpf/bpf_host.c: the source file to compile
- -o -: write the output (here, LLVM IR) to standard output (stdout) instead of a file, typically so it can be piped into another command
In summary, this command compiles an eBPF program, producing output in LLVM IR form with debug information included and optimizations applied. The build enables many strict warning options to safeguard code quality, and it avoids the standard include paths in favor of Cilium-specific ones.
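Since -o - sends the IR to stdout, a second stage has to turn it into a loadable BPF ELF object; in this code path that is what compile/compileAndLink in compile.go orchestrate by piping clang's output into llc. A roughly equivalent manual pipeline would be the following, where the llc flags are illustrative assumptions rather than copied from compile.go:

clang -emit-llvm [options as above] -c /var/lib/cilium/bpf/bpf_host.c -o - | \
    llc -march=bpf -mcpu=probe -filetype=obj -o bpf_host.o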
Where the compilation information comes from
// epInfoCache describes the set of lxcmap entries necessary to describe an Endpoint
// in the BPF maps. It is generated while holding the Endpoint lock, then used
// after releasing that lock to push the entries into the datapath.
// Functions below implement the EndpointFrontend interface with this cached information.
type epInfoCache struct {
	// revision is used by the endpoint regeneration code to determine
	// whether this cache is out-of-date wrt the underlying endpoint.
	revision uint64

	// For datapath.loader.endpoint
	epdir  string
	id     uint64
	ifName string

	// For datapath.EndpointConfiguration
	identity                               identity.NumericIdentity
	mac                                    mac.MAC
	ipv4                                   netip.Addr
	ipv6                                   netip.Addr
	conntrackLocal                         bool
	requireARPPassthrough                  bool
	requireEgressProg                      bool
	requireRouting                         bool
	requireEndpointRoute                   bool
	policyVerdictLogFilter                 uint32
	cidr4PrefixLengths, cidr6PrefixLengths []int
	options                                *option.IntOptions
	lxcMAC                                 mac.MAC
	ifIndex                                int

	// endpoint is used to get the endpoint's logger.
	//
	// Do NOT use this for fetching endpoint data directly; this structure
	// is intended as a safe cache of endpoint data that is assembled while
	// holding the endpoint lock, for use beyond the holding of that lock.
	// Dereferencing fields in this endpoint is not guaranteed to be safe.
	endpoint *Endpoint
}
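The "functions below" mentioned in the comment are plain accessors over these cached fields. The fragment below is illustrative only (the method names are assumed from the field list, not copied from the source): the loader reads endpoint data through such methods instead of dereferencing the live Endpoint, so the endpoint lock does not have to be held during compilation.

// Hypothetical accessors inside pkg/endpoint, shown only to illustrate the
// pattern: each returns data captured while the endpoint lock was held.
func (ep *epInfoCache) GetID() uint64               { return ep.id }
func (ep *epInfoCache) InterfaceName() string       { return ep.ifName }
func (ep *epInfoCache) IPv4Address() netip.Addr     { return ep.ipv4 }
func (ep *epInfoCache) RequireARPPassthrough() bool { return ep.requireARPPassthrough }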
tc operations
pkg/datapath/loader/netlink.go
It contains the tc-related operations. Compilation and loading are split into two separate steps.
func (l *Loader) compileAndLoad(ctx context.Context, ep datapath.Endpoint, dirs *directoryInfo, stats *metrics.SpanStat) error {
	stats.BpfCompilation.Start()
	// Compile first...
	err := compileDatapath(ctx, dirs, ep.IsHost(), ep.Logger(Subsystem))
	stats.BpfCompilation.End(err == nil)
	if err != nil {
		return err
	}

	stats.BpfLoadProg.Start()
	// ...then load.
	err = l.reloadDatapath(ctx, ep, dirs)
	stats.BpfLoadProg.End(err == nil)
	return err
}
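On the load side, replaceDatapath ultimately attaches the compiled program to the interface through a clsact qdisc plus a tc BPF filter in direct-action mode. Below is a minimal, self-contained sketch of that attach pattern using the vishvananda/netlink and cilium/ebpf libraries; it is a simplification of what netlink.go does, and the helper name attachTCProgram is illustrative:

package main

import (
	"fmt"

	"github.com/cilium/ebpf"
	"github.com/vishvananda/netlink"
	"golang.org/x/sys/unix"
)

// attachTCProgram attaches an already-loaded BPF program to the ingress
// hook of the given interface, mirroring the clsact + direct-action
// pattern used by the loader.
func attachTCProgram(ifName string, prog *ebpf.Program, progName string) error {
	link, err := netlink.LinkByName(ifName)
	if err != nil {
		return err
	}

	// Ensure a clsact qdisc exists so the tc ingress/egress hooks are available.
	qdisc := &netlink.GenericQdisc{
		QdiscAttrs: netlink.QdiscAttrs{
			LinkIndex: link.Attrs().Index,
			Handle:    netlink.MakeHandle(0xffff, 0),
			Parent:    netlink.HANDLE_CLSACT,
		},
		QdiscType: "clsact",
	}
	if err := netlink.QdiscReplace(qdisc); err != nil {
		return fmt.Errorf("replacing clsact qdisc: %w", err)
	}

	// Attach the program as a direct-action BPF filter on ingress.
	filter := &netlink.BpfFilter{
		FilterAttrs: netlink.FilterAttrs{
			LinkIndex: link.Attrs().Index,
			Parent:    netlink.HANDLE_MIN_INGRESS,
			Handle:    1,
			Protocol:  unix.ETH_P_ALL,
			Priority:  1,
		},
		Fd:           prog.FD(),
		Name:         progName,
		DirectAction: true,
	}
	if err := netlink.FilterReplace(filter); err != nil {
		return fmt.Errorf("replacing tc filter: %w", err)
	}
	return nil
}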
Cleanup
When the release is deleted with helm delete, the network interfaces and similar artifacts are removed. An uninstall script is also hooked in, so that components from a previous installation do not leave residue behind.
preStop:
  exec:
    command:
      - /cni-uninstall.sh
Looking at the script's contents, it essentially just cleans up configuration files.
#!/bin/bash

set -e

HOST_PREFIX=${HOST_PREFIX:-/host}

BIN_NAME=cilium-cni
CNI_DIR=${CNI_DIR:-${HOST_PREFIX}/opt/cni}
CNI_CONF_DIR=${CNI_CONF_DIR:-${HOST_PREFIX}/etc/cni/net.d}
CILIUM_CUSTOM_CNI_CONF=${CILIUM_CUSTOM_CNI_CONF:-false}

if [[ "$(cat /tmp/cilium/config-map/cni-uninstall 2>/dev/null || true)" != "true" ]]; then
	echo "cni-uninstall disabled, not removing CNI configuration"
	exit
fi

# Do not interact with the host's CNI directory when the user specified they
# are managing CNI configs externally.
if [ "${CILIUM_CUSTOM_CNI_CONF}" != "true" ]; then
	# .conf/.conflist/.json (undocumented) are read by kubelet/dockershim's CNI implementation.
	# Remove any active Cilium CNI configurations to prevent scheduling Pods during agent
	# downtime. Configs belonging to other CNI implementations have already been renamed
	# to *.cilium_bak during agent startup.
	echo "Removing active Cilium CNI configurations from ${CNI_CONF_DIR}..."
	find "${CNI_CONF_DIR}" -maxdepth 1 -type f \
		-name '*cilium*' -and \( \
		-name '*.conf' -or \
		-name '*.conflist' \
		\) -delete
fi
RESTful API
The cilium API server handles Endpoint requests; by default it communicates over a UNIX domain socket. Likewise, the cilium client tooling talks to this HTTP service by reading that unix socket, to fetch live data from the cilium-agent.
By default only unix-socket communication is supported; if you want to expose an HTTP port, you would have to modify the source to do so (not recommended).
curl --unix-socket /var/run/cilium/cilium.sock http://localhost/v1/cluster/nodes
curl --unix-socket /var/run/cilium/cilium.sock http://localhost/v1/config
curl --unix-socket /var/run/cilium/cilium.sock http://localhost/v1/healthz
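The same interaction can be done programmatically from Go with a unix-socket dialer; this is a minimal standard-library sketch equivalent to the curl calls above:

package main

import (
	"context"
	"fmt"
	"io"
	"net"
	"net/http"
)

func main() {
	// HTTP client that dials the cilium-agent unix socket instead of TCP.
	client := &http.Client{
		Transport: &http.Transport{
			DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
				return (&net.Dialer{}).DialContext(ctx, "unix", "/var/run/cilium/cilium.sock")
			},
		},
	}

	// The host part of the URL is ignored; the dialer above picks the socket.
	resp, err := client.Get("http://localhost/v1/healthz")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body))
}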
Cilium Operator
The operator is not even responsible for creating CEPs (CiliumEndpoints), but it does assist with garbage collection of Cilium resources.