Analyzing the Kubernetes EndpointController Source Code Through a Real-World Scenario
The Scenario
I recently ran into a problem. I had built a GlusterFS cluster on several of the Kubernetes nodes and followed the official tutorial step by step to create Volumes and PVs backed by GlusterFS. However, I only created an Endpoints object for each Volume and never created the corresponding Service (the docs mention that creating a Service makes the Endpoints persistent, which I ignored at the time). Then, after a cluster restart, the Endpoints objects did not come back. This was puzzling: other Kubernetes objects such as Pods, Deployments and Services were all restored, but the Endpoints were not. With this question in mind I went through the official issues and found no useful answer; see the issue "Endpoints are not persistented" for reference.
1. Exploring the Source Code
1.1 Source Version
Based on Kubernetes release-1.13.
1.2 Source Directory Structure
Since our focus is the Endpoint controller, we only look at the Endpoint-related source, i.e. the endpoint controller package under pkg/controller/endpoint.
1.3 EndpointController Initialization
File: endpoints_controller.go
```go
// NewEndpointController returns a new *EndpointController.
// Note that three informers are wired in at initialization time:
// podInformer, serviceInformer and endpointsInformer.
func NewEndpointController(podInformer coreinformers.PodInformer, serviceInformer coreinformers.ServiceInformer,
    endpointsInformer coreinformers.EndpointsInformer, client clientset.Interface) *EndpointController {
    broadcaster := record.NewBroadcaster()
    broadcaster.StartLogging(klog.Infof)
    broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: client.CoreV1().Events("")})
    recorder := broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "endpoint-controller"})

    if client != nil && client.CoreV1().RESTClient().GetRateLimiter() != nil {
        metrics.RegisterMetricAndTrackRateLimiterUsage("endpoint_controller", client.CoreV1().RESTClient().GetRateLimiter())
    }
    e := &EndpointController{
        client:           client,
        queue:            workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "endpoint"),
        workerLoopPeriod: time.Second,
    }

    // Watch services and register the corresponding add/update/delete handlers.
    serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
        // add: enqueue the added service, keyed by namespace/name
        AddFunc: e.enqueueService,
        // update: enqueue the updated service, keyed by namespace/name
        UpdateFunc: func(old, cur interface{}) {
            e.enqueueService(cur)
        },
        // delete: enqueue the deleted service, keyed by namespace/name
        DeleteFunc: e.enqueueService,
    })
    e.serviceLister = serviceInformer.Lister()
    e.servicesSynced = serviceInformer.Informer().HasSynced

    // Watch pods and register the corresponding add/update/delete handlers.
    podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
        AddFunc:    e.addPod,
        UpdateFunc: e.updatePod,
        DeleteFunc: e.deletePod,
    })
    e.podLister = podInformer.Lister()
    e.podsSynced = podInformer.Informer().HasSynced

    e.endpointsLister = endpointsInformer.Lister()
    e.endpointsSynced = endpointsInformer.Informer().HasSynced

    e.triggerTimeTracker = NewTriggerTimeTracker()
    e.eventBroadcaster = broadcaster
    e.eventRecorder = recorder

    return e
}
```
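All three service handlers funnel into e.enqueueService, which turns the Service into its namespace/name key before adding it to the work queue. The standalone sketch below (the service name is made up) shows what that key looks like; controller.KeyFunc is cache.DeletionHandlingMetaNamespaceKeyFunc, which for a live object behaves like the cache.MetaNamespaceKeyFunc used here.

```go
package main

import (
    "fmt"

    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/tools/cache"
)

func main() {
    // A hypothetical service, used only to illustrate the queue key format.
    svc := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "my-service"}}

    // The same namespace/name key that enqueueService puts on the work queue.
    key, err := cache.MetaNamespaceKeyFunc(svc)
    if err != nil {
        panic(err)
    }
    fmt.Println(key) // prints "default/my-service"
}
```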
Next, let's look at the functions registered as the pod handlers.
1.3.1 e.addPod
```go
func (e *EndpointController) addPod(obj interface{}) {
    // Cast the event object to a Pod
    pod := obj.(*v1.Pod)
    services, err := e.getPodServiceMemberships(pod)
    if err != nil {
        utilruntime.HandleError(fmt.Errorf("Unable to get pod %s/%s's service memberships: %v", pod.Namespace, pod.Name, err))
        return
    }
    // Add each matching service to the queue, keyed by namespace/name
    for key := range services {
        e.queue.Add(key)
    }
}

func (e *EndpointController) getPodServiceMemberships(pod *v1.Pod) (sets.String, error) {
    set := sets.String{}
    // Look up the services this pod belongs to
    services, err := e.serviceLister.GetPodServices(pod)
    if err != nil {
        // don't log this error because this function makes pointless
        // errors when no services match.
        return set, nil
    }
    // GetPodServices compares each service's selector with the pod's labels;
    // if the selector is a subset of the pod's labels, the pod belongs to that service.
    for i := range services {
        key, err := controller.KeyFunc(services[i])
        if err != nil {
            return nil, err
        }
        set.Insert(key)
    }
    return set, nil
}
```
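To make the "selector is a subset of the pod's labels" rule concrete, here is a small standalone sketch using the same labels package; the selector and pod labels are made up for illustration.

```go
package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/labels"
)

func main() {
    // Hypothetical service selector and pod labels.
    serviceSelector := map[string]string{"app": "glusterfs"}
    podLabels := map[string]string{"app": "glusterfs", "pod-template-hash": "abc123"}

    // GetPodServices effectively asks: does the service selector match the pod's labels?
    // A selector matches when it is a subset of the labels.
    selector := labels.Set(serviceSelector).AsSelectorPreValidated()
    fmt.Println(selector.Matches(labels.Set(podLabels))) // true: the pod belongs to the service
}
```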
1.3.2 e.updatePod
```go
func (e *EndpointController) updatePod(old, cur interface{}) {
    newPod := cur.(*v1.Pod)
    oldPod := old.(*v1.Pod)
    // Compare the resource versions of the old and new pod; if they are equal,
    // nothing changed, so return without enqueueing anything.
    if newPod.ResourceVersion == oldPod.ResourceVersion {
        // Periodic resync will send update events for all known pods.
        // Two different versions of the same pod will always have different RVs.
        return
    }
    // Check whether pod information relevant to endpoints has changed
    podChangedFlag := podChanged(oldPod, newPod)

    // Check if the pod labels have changed, indicating a possible
    // change in the service membership
    labelsChanged := false
    // Labels differ, or the hostname/subdomain changed
    if !reflect.DeepEqual(newPod.Labels, oldPod.Labels) ||
        !hostNameAndDomainAreEqual(newPod, oldPod) {
        labelsChanged = true
    }

    // If both the pod and labels are unchanged, no update is needed
    if !podChangedFlag && !labelsChanged {
        return
    }

    // Look up the services the updated pod belongs to
    services, err := e.getPodServiceMemberships(newPod)
    if err != nil {
        utilruntime.HandleError(fmt.Errorf("Unable to get pod %v/%v's service memberships: %v", newPod.Namespace, newPod.Name, err))
        return
    }

    if labelsChanged {
        oldServices, err := e.getPodServiceMemberships(oldPod)
        if err != nil {
            utilruntime.HandleError(fmt.Errorf("Unable to get pod %v/%v's service memberships: %v", oldPod.Namespace, oldPod.Name, err))
            return
        }
        services = determineNeededServiceUpdates(oldServices, services, podChangedFlag)
    }

    for key := range services {
        e.queue.Add(key)
    }
}

// podChanged returns true if the pod changed in a way that affects endpoints:
// its DeletionTimestamp changed, its readiness changed, or its EndpointAddress
// (IP, node name, namespace, UID, ...) differs; otherwise it returns false.
// The result is assigned to podChangedFlag above.
func podChanged(oldPod, newPod *v1.Pod) bool {
    // If the pod's deletion timestamp is set, remove endpoint from ready address.
    if newPod.DeletionTimestamp != oldPod.DeletionTimestamp {
        return true
    }
    // If the pod's readiness has changed, the associated endpoint address
    // will move from the unready endpoints set to the ready endpoints.
    // So for the purposes of an endpoint, a readiness change on a pod
    // means we have a changed pod.
    if podutil.IsPodReady(oldPod) != podutil.IsPodReady(newPod) {
        return true
    }
    // Convert the pod to an EndpointAddress, clear inert fields,
    // and see if they are the same.
    newEndpointAddress := podToEndpointAddress(newPod)
    oldEndpointAddress := podToEndpointAddress(oldPod)
    // Ignore the ResourceVersion because it changes
    // with every pod update. This allows the comparison to
    // show equality if all other relevant fields match.
    newEndpointAddress.TargetRef.ResourceVersion = ""
    oldEndpointAddress.TargetRef.ResourceVersion = ""
    if reflect.DeepEqual(newEndpointAddress, oldEndpointAddress) {
        // The pod has not changed in any way that impacts the endpoints
        return false
    }
    return true
}
```
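determineNeededServiceUpdates (not shown above) decides which services actually need a resync. Roughly: if the pod itself changed, every service it belonged to before or after the update is enqueued; if only its labels changed, only the services it joined or left (the symmetric difference) are enqueued. A standalone sketch of that decision, with made-up service keys:

```go
package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/util/sets"
)

// neededServiceUpdates mirrors, in simplified form, what determineNeededServiceUpdates computes.
func neededServiceUpdates(oldServices, newServices sets.String, podChanged bool) sets.String {
    if podChanged {
        // The pod changed: resync every service it belonged to before or after.
        return newServices.Union(oldServices)
    }
    // Only the labels changed: resync just the services the pod joined or left.
    return newServices.Difference(oldServices).Union(oldServices.Difference(newServices))
}

func main() {
    oldSvcs := sets.NewString("default/a", "default/b") // hypothetical keys
    newSvcs := sets.NewString("default/b", "default/c")
    fmt.Println(neededServiceUpdates(oldSvcs, newSvcs, false).List()) // [default/a default/c]
}
```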
1.4 EndpointController Core Logic
```go
// Run will not return until stopCh is closed. workers determines how many
// endpoints will be handled in parallel.
func (e *EndpointController) Run(workers int, stopCh <-chan struct{}) {
    defer utilruntime.HandleCrash()
    defer e.queue.ShutDown()

    klog.Infof("Starting endpoint controller")
    defer klog.Infof("Shutting down endpoint controller")

    // Wait for the pod, service and endpoints caches to sync
    if !controller.WaitForCacheSync("endpoint", stopCh, e.podsSynced, e.servicesSynced, e.endpointsSynced) {
        return
    }

    // workers is set by the kube-controller-manager flag --concurrent-endpoint-syncs
    // (default 5); workerLoopPeriod is one second
    for i := 0; i < workers; i++ {
        // Each worker loops forever, processing keys from the queue
        go wait.Until(e.worker, e.workerLoopPeriod, stopCh)
    }

    go func() {
        defer utilruntime.HandleCrash()
        e.checkLeftoverEndpoints()
    }()

    <-stopCh
}

func (e *EndpointController) worker() {
    for e.processNextWorkItem() {
    }
}

func (e *EndpointController) processNextWorkItem() bool {
    eKey, quit := e.queue.Get()
    if quit {
        return false
    }
    defer e.queue.Done(eKey)

    err := e.syncService(eKey.(string))
    e.handleErr(err, eKey)

    return true
}
```
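processNextWorkItem hands any sync error to e.handleErr, which is not shown above. Its job is the usual workqueue pattern: forget the key on success, otherwise requeue it with rate limiting until a retry limit is hit. A simplified sketch (it lives in the same file; maxRetries is a small package-level constant there, and the logging is omitted here):

```go
func (e *EndpointController) handleErr(err error, key interface{}) {
    if err == nil {
        // The sync succeeded: clear the key's retry history.
        e.queue.Forget(key)
        return
    }
    if e.queue.NumRequeues(key) < maxRetries {
        // Retry later; the rate limiter applies exponential backoff.
        e.queue.AddRateLimited(key)
        return
    }
    // Give up on this key and surface the error.
    e.queue.Forget(key)
    utilruntime.HandleError(err)
}
```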
The main logic of the EndpointController lives in the syncService function:
```go
func (e *EndpointController) syncService(key string) error {
    startTime := time.Now()
    defer func() {
        klog.V(4).Infof("Finished syncing service %q endpoints. (%v)", key, time.Since(startTime))
    }()

    // Split the key into the service's namespace and name
    namespace, name, err := cache.SplitMetaNamespaceKey(key)
    if err != nil {
        return err
    }
    service, err := e.serviceLister.Services(namespace).Get(name)
    if err != nil {
        // If the service has been deleted, the corresponding endpoints object is deleted as well
        // Delete the corresponding endpoint, as the service has been deleted.
        // TODO: Please note that this will delete an endpoint when a
        // service is deleted. However, if we're down at the time when
        // the service is deleted, we will miss that deletion, so this
        // doesn't completely solve the problem. See #6877.
        err = e.client.CoreV1().Endpoints(namespace).Delete(name, nil)
        if err != nil && !errors.IsNotFound(err) {
            return err
        }
        e.triggerTimeTracker.DeleteEndpoints(namespace, name)
        return nil
    }

    // If the service's .spec.selector is empty, return immediately; the EndpointController does not handle this case
    if service.Spec.Selector == nil {
        // services without a selector receive no endpoints from this controller;
        // these services will receive the endpoints that are created out-of-band via the REST API.
        return nil
    }

    klog.V(5).Infof("About to update endpoints for service %q", key)
    pods, err := e.podLister.Pods(service.Namespace).List(labels.Set(service.Spec.Selector).AsSelectorPreValidated())
    if err != nil {
        // Since we're getting stuff from a local cache, it is
        // basically impossible to get this error.
        return err
    }

    // If the user specified the older (deprecated) annotation, we have to respect it.
    tolerateUnreadyEndpoints := service.Spec.PublishNotReadyAddresses
    // If the service carries the service.alpha.kubernetes.io/tolerate-unready-endpoints annotation,
    // parse it as a bool; tolerateUnreadyEndpoints defaults to false
    if v, ok := service.Annotations[TolerateUnreadyEndpointsAnnotation]; ok {
        b, err := strconv.ParseBool(v)
        if err == nil {
            tolerateUnreadyEndpoints = b
        } else {
            utilruntime.HandleError(fmt.Errorf("Failed to parse annotation %v: %v", TolerateUnreadyEndpointsAnnotation, err))
        }
    }

    // We call ComputeEndpointsLastChangeTriggerTime here to make sure that the state of the trigger
    // time tracker gets updated even if the sync turns out to be no-op and we don't update the
    // endpoints object.
    endpointsLastChangeTriggerTime := e.triggerTimeTracker.
        ComputeEndpointsLastChangeTriggerTime(namespace, name, service, pods)

    subsets := []v1.EndpointSubset{}
    var totalReadyEps int
    var totalNotReadyEps int

    // Iterate over the matching pods
    for _, pod := range pods {
        // Skip pods that have no pod IP yet
        if len(pod.Status.PodIP) == 0 {
            klog.V(5).Infof("Failed to find an IP for pod %s/%s", pod.Namespace, pod.Name)
            continue
        }
        // Skip pods that are being deleted
        if !tolerateUnreadyEndpoints && pod.DeletionTimestamp != nil {
            klog.V(5).Infof("Pod is being deleted %s/%s", pod.Namespace, pod.Name)
            continue
        }

        // Convert the pod into an EndpointAddress
        epa := *podToEndpointAddress(pod)

        hostname := pod.Spec.Hostname
        // If the pod has a hostname, its FQDN will be hostname.subdomain.namespace.svc.cluster.local
        if len(hostname) > 0 && pod.Spec.Subdomain == service.Name && service.Namespace == pod.Namespace {
            epa.Hostname = hostname
        }

        // Allow headless service not to have ports.
        if len(service.Spec.Ports) == 0 {
            if service.Spec.ClusterIP == api.ClusterIPNone {
                // addEndpointSubset:
                // 1. If tolerateUnreadyEndpoints is true, unready pods are also added to Addresses;
                //    if it is false, only pods that are ready are added to Addresses;
                // 2. Otherwise the pod's restart policy is checked: if it is Never and the pod's phase
                //    is neither Failed nor Succeeded, or if it is OnFailure and the phase is not Succeeded,
                //    the pod is added to NotReadyAddresses; the remaining cases are also added to NotReadyAddresses.
                subsets, totalReadyEps, totalNotReadyEps = addEndpointSubset(subsets, pod, epa, nil, tolerateUnreadyEndpoints)
                // No need to repack subsets for headless service without ports.
            }
        } else {
            // Iterate over the service's ports
            for i := range service.Spec.Ports {
                servicePort := &service.Spec.Ports[i]

                portName := servicePort.Name
                portProto := servicePort.Protocol
                portNum, err := podutil.FindPort(pod, servicePort)
                // If the service port cannot be found on the pod, skip it
                if err != nil {
                    klog.V(4).Infof("Failed to find port for service %s/%s: %v", service.Namespace, service.Name, err)
                    continue
                }

                var readyEps, notReadyEps int
                epp := &v1.EndpointPort{Name: portName, Port: int32(portNum), Protocol: portProto}
                subsets, readyEps, notReadyEps = addEndpointSubset(subsets, pod, epa, epp, tolerateUnreadyEndpoints)
                totalReadyEps = totalReadyEps + readyEps
                totalNotReadyEps = totalNotReadyEps + notReadyEps
            }
        }
    }
    // Repack the subsets
    subsets = endpoints.RepackSubsets(subsets)

    // If the endpoints object does not exist (typically because the service was just created), build a new one;
    // for any other error, return it
    // See if there's actually an update here.
    currentEndpoints, err := e.endpointsLister.Endpoints(service.Namespace).Get(service.Name)
    if err != nil {
        if errors.IsNotFound(err) {
            currentEndpoints = &v1.Endpoints{
                ObjectMeta: metav1.ObjectMeta{
                    Name:   service.Name,
                    Labels: service.Labels,
                },
            }
        } else {
            return err
        }
    }

    // An empty ResourceVersion on currentEndpoints means the endpoints object must be created
    createEndpoints := len(currentEndpoints.ResourceVersion) == 0

    // If the current subsets equal the repacked subsets and the labels match the service's labels, skip the update
    if !createEndpoints &&
        apiequality.Semantic.DeepEqual(currentEndpoints.Subsets, subsets) &&
        apiequality.Semantic.DeepEqual(currentEndpoints.Labels, service.Labels) {
        klog.V(5).Infof("endpoints are equal for %s/%s, skipping update", service.Namespace, service.Name)
        return nil
    }
    newEndpoints := currentEndpoints.DeepCopy()
    newEndpoints.Subsets = subsets
    newEndpoints.Labels = service.Labels
    if newEndpoints.Annotations == nil {
        newEndpoints.Annotations = make(map[string]string)
    }

    if !endpointsLastChangeTriggerTime.IsZero() {
        newEndpoints.Annotations[v1.EndpointsLastChangeTriggerTime] =
            endpointsLastChangeTriggerTime.Format(time.RFC3339Nano)
    } else {
        // No new trigger time, clear the annotation.
        delete(newEndpoints.Annotations, v1.EndpointsLastChangeTriggerTime)
    }

    klog.V(4).Infof("Update endpoints for %v/%v, ready: %d not ready: %d", service.Namespace, service.Name, totalReadyEps, totalNotReadyEps)
    if createEndpoints {
        // No endpoints object with the service's namespace/name exists yet, so create one
        // No previous endpoints, create them
        _, err = e.client.CoreV1().Endpoints(service.Namespace).Create(newEndpoints)
    } else {
        // Pre-existing
        // An endpoints object with the service's namespace/name already exists, so update it
        _, err = e.client.CoreV1().Endpoints(service.Namespace).Update(newEndpoints)
    }
    if err != nil {
        if createEndpoints && errors.IsForbidden(err) {
            // A request is forbidden primarily for two reasons:
            // 1. namespace is terminating, endpoint creation is not allowed by default.
            // 2. policy is misconfigured, in which case no service would function anywhere.
            // Given the frequency of 1, we log at a lower level.
            klog.V(5).Infof("Forbidden from creating endpoints: %v", err)
        }

        if createEndpoints {
            e.eventRecorder.Eventf(newEndpoints, v1.EventTypeWarning, "FailedToCreateEndpoint", "Failed to create endpoint for service %v/%v: %v", service.Namespace, service.Name, err)
        } else {
            e.eventRecorder.Eventf(newEndpoints, v1.EventTypeWarning, "FailedToUpdateEndpoint", "Failed to update endpoint %v/%v: %v", service.Namespace, service.Name, err)
        }

        return err
    }
    return nil
}
```
1.5 Detecting Leftover Endpoints
The sections above describe what happens when Services and Pods change while an Endpoints object is bound to a Service. Now back to our original problem: if an Endpoints object exists on its own, how does Kubernetes detect it and delete it?
Let's revisit this part of the Run function:
```go
go func() {
    defer utilruntime.HandleCrash()
    e.checkLeftoverEndpoints()
}()
```
When Run executes, it starts a goroutine that inspects all existing Endpoints objects:
```go
// checkLeftoverEndpoints lists all currently existing endpoints and adds their
// service to the queue. This will detect endpoints that exist with no
// corresponding service; these endpoints need to be deleted. We only need to
// do this once on startup, because in steady-state these are detected (but
// some stragglers could have been left behind if the endpoint controller
// reboots).
func (e *EndpointController) checkLeftoverEndpoints() {
    // List all current endpoints objects
    list, err := e.endpointsLister.List(labels.Everything())
    if err != nil {
        utilruntime.HandleError(fmt.Errorf("Unable to list endpoints (%v); orphaned endpoints will not be cleaned up. (They're pretty harmless, but you can restart this component if you want another attempt made.)", err))
        return
    }
    // Iterate over every endpoints object
    for _, ep := range list {
        if _, ok := ep.Annotations[resourcelock.LeaderElectionRecordAnnotationKey]; ok {
            // when there are multiple controller-manager instances,
            // we observe that it will delete leader-election endpoints after 5min
            // and cause re-election
            // so skip the delete here
            // as leader-election only have endpoints without service
            continue
        }
        key, err := controller.KeyFunc(ep)
        if err != nil {
            utilruntime.HandleError(fmt.Errorf("Unable to get key for endpoint %#v", ep))
            continue
        }
        // Even if the endpoints object has no matching service, its namespace/name key is
        // added to the queue; syncService will then fail to find the service and delete
        // the endpoints object, as we saw above.
        e.queue.Add(key)
    }
}
```
2. Summary
This explains the behavior from the original scenario: our GlusterFS Endpoints objects had no matching Service, so when the controller manager restarted, checkLeftoverEndpoints enqueued them and syncService deleted them as orphans. In short, there are two ways to avoid the problem:
- Create the Service with a selector, so that the matching Endpoints object is created and maintained automatically.
- If you create the Endpoints object by hand, also create a selector-less Service with the same name, so that the Endpoints object is persisted (a sketch of this follows below).
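A minimal client-go sketch of the second approach, assuming a client-go version matching this release (newer versions add a context argument to Create); the object names, namespace, kubeconfig path and node IPs are all hypothetical:

```go
package main

import (
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Hypothetical kubeconfig path.
    config, err := clientcmd.BuildConfigFromFlags("", "/path/to/kubeconfig")
    if err != nil {
        panic(err)
    }
    client := kubernetes.NewForConfigOrDie(config)

    // Selector-less Service with the same name as the Endpoints object.
    svc := &corev1.Service{
        ObjectMeta: metav1.ObjectMeta{Name: "glusterfs-cluster", Namespace: "default"},
        Spec: corev1.ServiceSpec{
            Ports: []corev1.ServicePort{{Port: 1}},
        },
    }

    // Manually maintained Endpoints pointing at the GlusterFS nodes.
    ep := &corev1.Endpoints{
        ObjectMeta: metav1.ObjectMeta{Name: "glusterfs-cluster", Namespace: "default"},
        Subsets: []corev1.EndpointSubset{{
            Addresses: []corev1.EndpointAddress{{IP: "192.168.0.10"}, {IP: "192.168.0.11"}},
            Ports:     []corev1.EndpointPort{{Port: 1}},
        }},
    }

    if _, err := client.CoreV1().Services("default").Create(svc); err != nil {
        panic(err)
    }
    if _, err := client.CoreV1().Endpoints("default").Create(ep); err != nil {
        panic(err)
    }
}
```

With the Service in place, checkLeftoverEndpoints still enqueues the key on restart, but syncService now finds the Service and, because its selector is nil, returns early and leaves the manually created Endpoints untouched.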