Source code for airflow.providers.cncf.kubernetes.executors.kubernetes_executor

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
KubernetesExecutor.

.. seealso::
    For more information on how the KubernetesExecutor works, take a look at the guide:
    :doc:`/kubernetes_executor`
"""

from __future__ import annotations

import contextlib
import json
import logging
import multiprocessing
import time
from collections import Counter, defaultdict
from contextlib import suppress
from dataclasses import dataclass
from datetime import datetime, timedelta
from queue import Empty, Queue
from typing import TYPE_CHECKING, Any

from deprecated import deprecated
from kubernetes.dynamic import DynamicClient
from sqlalchemy import select

from airflow.exceptions import AirflowProviderDeprecationWarning
from airflow.executors.base_executor import BaseExecutor
from airflow.providers.cncf.kubernetes.exceptions import PodMutationHookException, PodReconciliationError
from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_types import (
    ADOPTED,
    POD_EXECUTOR_DONE_KEY,
    FailureDetails,
    KubernetesJob,
    KubernetesResults,
)
from airflow.providers.cncf.kubernetes.kube_config import KubeConfig
from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import annotations_to_key
from airflow.providers.cncf.kubernetes.pod_generator import PodGenerator
from airflow.providers.cncf.kubernetes.version_compat import AIRFLOW_V_3_0_PLUS
from airflow.providers.common.compat.sdk import Stats, conf
from airflow.utils.helpers import prune_dict
from airflow.utils.log.logging_mixin import remove_escape_codes
from airflow.utils.session import NEW_SESSION, provide_session
from airflow.utils.state import TaskInstanceState

if TYPE_CHECKING:
    from collections.abc import Sequence
    from multiprocessing.managers import SyncManager

    from kubernetes import client
    from kubernetes.client import models as k8s
    from sqlalchemy.orm import Session

    from airflow.cli.cli_config import GroupCommand
    from airflow.executors import workloads
    from airflow.models.taskinstance import TaskInstance
    from airflow.models.taskinstancekey import TaskInstanceKey
    from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_utils import (
        AirflowKubernetesScheduler,
    )


@dataclass
class _PodLaunchAttempt:
    """
    Executor-side requeue state for a task whose worker pod may fail before the task process starts.

    ``requeued_for_pod`` records the pod a requeue was last issued for, so the duplicate
    ``Failed`` events Kubernetes can emit for a single pod don't each trigger another requeue.
    """

    # Job originally submitted for the task; it is put back on the task queue unchanged on requeue.
    job: KubernetesJob
    # Number of requeues issued so far for this task key (incremented once per requeue).
    attempts: int = 0
    # Name of the pod the most recent requeue was issued for; ``None`` until the first requeue.
    requeued_for_pod: str | None = None



[docs]
class KubernetesExecutor(BaseExecutor):
    """Executor for Kubernetes."""

    # Class-level default; ``__init__`` shadows it per instance with the
    # ``[kubernetes_executor] running_pod_log_lines`` option, using this value as the fallback.
    RUNNING_POD_LOG_LINES = 100

    # Capability flags.
    # NOTE(review): presumably read by Airflow core to decide which features this executor
    # supports -- confirm against BaseExecutor.
    supports_ad_hoc_ti_run: bool = True

    supports_multi_team: bool = True

    if TYPE_CHECKING and AIRFLOW_V_3_0_PLUS:
        # In the v3 path, we store workloads, not commands as strings.
        # TODO: TaskSDK: move this type change into BaseExecutor
        queued_tasks: dict[TaskInstanceKey, workloads.All]  # type: ignore[assignment]


    def __init__(self, *args, **kwargs):
        """
        Set up executor configuration and in-memory bookkeeping.

        No Kubernetes client, watcher or multiprocessing manager is created here; those are
        created in :meth:`start`. All positional and keyword arguments are forwarded to the
        base executor.

        :raises ValueError: if ``[kubernetes_executor] running_pod_log_lines`` is not greater than 0.
        """
        super().__init__(*args, **kwargs)

        # Check if self has the ExecutorConf set on the self.conf attribute with all required methods.
        # In older Airflow versions, ExecutorConf exists but lacks methods like getint, getboolean, etc.
        # In such cases, fall back to the global configuration object.
        # This allows the changes to be backwards compatible with older versions of Airflow.
        # Can be removed when minimum supported provider version is equal to the version of core airflow
        # which introduces multi-team configuration (3.2+).
        if not hasattr(self, "conf") or not hasattr(self.conf, "getint"):
            self.conf = conf

        self.kube_config = KubeConfig(executor_conf=self.conf)

        # Override parallelism with team-aware config value
        self.parallelism = self.kube_config.parallelism

        # The multiprocessing.Manager() (and the queues it backs) is only needed once the
        # scheduler actually runs the executor, so it is created lazily in start(). Constructing
        # the executor without starting it -- as the API server does to call get_task_log() for a
        # RUNNING task -- must not spawn a Manager process, otherwise that serve_forever child is
        # orphaned and leaks (one per API-server worker).
        self._manager: SyncManager | None = None
        self.task_queue: Queue[KubernetesJob] | None = None
        self.result_queue: Queue[KubernetesResults] | None = None
        self.kube_scheduler: AirflowKubernetesScheduler | None = None
        self.kube_client: client.CoreV1Api | None = None
        self.scheduler_job_id: str | None = None
        # time.monotonic() timestamp of the last completed-pod adoption pass run by sync().
        self._last_completed_pod_adoption = 0.0
        self.kubernetes_queue: str | None = None

        # Per-key count of pod-creation retries issued by sync() after a Kubernetes API error.
        self.task_publish_retries: Counter[TaskInstanceKey] = Counter()
        # sync() treats -1 as "retry without limit"; the fallback of 0 disables publish retries.
        self.task_publish_max_retries = self.conf.getint(
            "kubernetes_executor", "task_publish_max_retries", fallback=0
        )

        # _change_state() treats 0 as "never requeue" and -1 as "requeue without limit".
        self.pod_launch_failure_max_retries = self.conf.getint(
            "kubernetes_executor", "pod_launch_failure_retries", fallback=1
        )

        # Comma-separated container reasons; surrounding whitespace and empty items are dropped.
        excluded_reasons = self.conf.get(
            "kubernetes_executor", "pod_launch_failure_excluded_container_reasons", fallback="Error"
        )
        self.pod_launch_failure_excluded_container_reasons = frozenset(
            reason.strip() for reason in excluded_reasons.split(",") if reason.strip()
        )

        # Per-key state for requeuing pods that fail before the task process starts (job spec,
        # requeue count, and the pod a requeue was last issued for), so the failure is never
        # observed by the scheduler and no task-level retry is consumed.
        # Intentionally in-memory and not persisted (like task_publish_retries): if this scheduler
        # dies the state is lost, and adoption by another scheduler is a safe no-op for it -- an
        # adopted pod has no entry here, so a pre-execution failure falls through to a normal fail
        # instead of requeuing. The orphaned task instance itself is still recovered by the
        # scheduler's adopt_or_reset_orphaned_tasks(), which re-queues it with a fresh attempt.
        self.pod_launch_attempts: dict[TaskInstanceKey, _PodLaunchAttempt] = {}

        # Instance-level override of the class default, validated immediately.
        self.RUNNING_POD_LOG_LINES = self.conf.getint(
            "kubernetes_executor", "running_pod_log_lines", fallback=KubernetesExecutor.RUNNING_POD_LOG_LINES
        )
        if self.RUNNING_POD_LOG_LINES <= 0:
            raise ValueError(
                "The [kubernetes_executor] running_pod_log_lines configuration must be greater than 0, "
                f"got {self.RUNNING_POD_LOG_LINES}."
            )

        # Results of adopted, already-completed pods; sync() replays them through _change_state()
        # and keeps only the entries whose state change raised.
        self.completed: dict[tuple[str, str], KubernetesResults] = {}

        # While set to a future time, sync() skips pod creation (set after an HTTP 429 response).
        self.create_pods_after: datetime | None = None

        # Maintain compatibility with older Airflow releases that do not define team_name.
        if not hasattr(self, "team_name"):
            self.team_name = None

    def _list_pods(self, query_kwargs):
        """
        List pods, as partial object metadata, from every namespace this executor is configured for.

        *query_kwargs* is modified in place: its ``header_params`` entry is replaced so the API
        returns a ``PartialObjectMetadataList``. All entries are then forwarded to the dynamic
        client's ``get`` call, once per namespace.

        :param query_kwargs: keyword arguments for the list request (e.g. ``label_selector``).
        :return: the ``items`` of every per-namespace response, concatenated in namespace order.
        """
        query_kwargs["header_params"] = {
            "Accept": "application/json;as=PartialObjectMetadataList;v=v1;g=meta.k8s.io"
        }
        api = DynamicClient(self.kube_client.api_client)
        pod_api = api.resources.get(api_version="v1", kind="Pod")

        config = self.kube_config
        if not config.multi_namespace_mode:
            target_namespaces = [config.kube_namespace]
        elif config.multi_namespace_mode_namespace_list:
            target_namespaces = config.multi_namespace_mode_namespace_list
        else:
            # Multi-namespace mode without an explicit list: a single request with namespace=None.
            # NOTE(review): presumably this lists pods across all namespaces -- confirm.
            target_namespaces = [None]

        found = []
        for target in target_namespaces:
            listing = api.get(resource=pod_api, namespace=target, **query_kwargs)
            found.extend(listing.items)
        return found

    def _make_safe_label_value(self, input_value: str | datetime) -> str:
        """
        Convert *input_value* into a string usable as a Kubernetes label value.

        ``datetime`` values go through ``pod_generator.datetime_to_label_safe_datestring``;
        anything else goes through ``pod_generator.make_safe_label_value`` (see that function
        for the exact normalization rules).
        """
        # airflow.providers.cncf.kubernetes is an expensive import, so it is imported locally to
        # keep loading the kubernetes_executor module fast.
        from airflow.providers.cncf.kubernetes import pod_generator

        to_safe_label = (
            pod_generator.datetime_to_label_safe_datestring
            if isinstance(input_value, datetime)
            else pod_generator.make_safe_label_value
        )
        return to_safe_label(input_value)


[docs]
    def get_pod_combined_search_str_to_pod_map(self) -> dict[str, k8s.V1Pod]:
        """
        List the worker pods labelled with this executor's job id and index them by a search string.

        Pods are selected with the label ``airflow-worker=<label-safe job_id>``. Each pod that has
        ``dag_id``, ``task_id`` and ``run_id`` annotations contributes one entry keyed by
        ``dag_id={dag_id},task_id={task_id}[,map_index={map_index}],run_id={run_id}``
        (the ``map_index`` part is present only when the pod has that annotation). Pods missing
        any of the three required annotations are left out.
        """
        # airflow worker label selector batch call
        worker_label = self._make_safe_label_value(str(self.job_id))
        list_kwargs = {"label_selector": f"airflow-worker={worker_label}"}
        if self.kube_config.kube_client_request_args:
            list_kwargs.update(self.kube_config.kube_client_request_args)

        search_str_to_pod = {}
        for pod in self._list_pods(list_kwargs):
            annotations = pod.metadata.annotations
            dag_id = annotations.get("dag_id", None)
            task_id = annotations.get("task_id", None)
            map_index = annotations.get("map_index", None)
            run_id = annotations.get("run_id", None)
            if dag_id is None or task_id is None or run_id is None:
                continue

            parts = [f"dag_id={dag_id}", f"task_id={task_id}"]
            if map_index is not None:
                parts.append(f"map_index={map_index}")
            parts.append(f"run_id={run_id}")
            search_str_to_pod[",".join(parts)] = pod
        return search_str_to_pod



[docs]
    def start(self) -> None:
        """
        Start the executor.

        Creates the multiprocessing manager and the task/result queues it backs, records the
        scheduler job id, builds the Kubernetes client and constructs the
        ``AirflowKubernetesScheduler`` that shares the result queue with this executor.
        """
        self.log.info("Start Kubernetes executor")
        # The manager is created here rather than in __init__ so that merely constructing the
        # executor does not spawn a manager process.
        self._manager = multiprocessing.Manager()
        self.task_queue = self._manager.JoinableQueue()
        self.result_queue = self._manager.JoinableQueue()
        self.scheduler_job_id = str(self.job_id)
        self.log.debug("Start with scheduler_job_id: %s", self.scheduler_job_id)
        # Imported locally, not at module level.
        from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_utils import (
            AirflowKubernetesScheduler,
        )
        from airflow.providers.cncf.kubernetes.kube_client import get_kube_client

        self.kube_client = get_kube_client()
        self.kube_scheduler = AirflowKubernetesScheduler(
            kube_config=self.kube_config,
            result_queue=self.result_queue,
            kube_client=self.kube_client,
            scheduler_job_id=self.scheduler_job_id,
            team_name=self.team_name,
        )


    def _coordinator_extra(self, queue: str | None) -> dict[str, Any] | None:
        """
        Return the ``extra`` mapping a coordinator declares for *queue*, if any.

        Read from the coordinator's declarative ``[sdk]`` config without importing
        or instantiating the coordinator. The coordinator manager only exists on
        Airflow 3.3+; on older Task SDKs the import fails and we fall back to no
        extra. A malformed ``[sdk] coordinators`` / ``queue_to_coordinator`` config
        must not crash the scheduler on this first lookup either, so an invalid
        config also falls back to no extra. The exception types are imported from
        ``airflow.sdk`` so they match whatever Task SDK actually raised them.
        """
        if not queue:
            return None
        try:
            from airflow.sdk.exceptions import AirflowConfigException
            from airflow.sdk.execution_time.coordinator import get_coordinator_manager
        except ImportError:
            return None
        try:
            return get_coordinator_manager().extra_for_queue(queue)
        except (AirflowConfigException, ValueError):
            self.log.warning(
                "Ignoring coordinator config for queue %s: invalid [sdk] coordinator config",
                queue,
                exc_info=True,
            )
            return None

    def _coordinator_pod_template_file(self, extra: dict[str, Any]) -> str | None:
        """
        Return the pod template declared in a coordinator's *extra* mapping, if any.

        Lets a queue routed to a non-Python coordinator (via ``[sdk]
        queue_to_coordinator``) launch its worker pod from a coordinator-specific
        template — for example an image carrying the JVM for a Java coordinator.
        """
        return extra.get("pod_template_file")

    def _coordinator_kube_image(self, extra: dict[str, Any]) -> str | None:
        """
        Return the worker base image declared in a coordinator's *extra* mapping, if any.

        The base container image is never taken from a pod template; it comes
        from ``kube_image`` (``worker_container_repository:worker_container_tag``)
        or a per-task ``pod_override``. A coordinator may declare its own
        ``worker_container_repository`` and ``worker_container_tag`` in ``extra``
        (e.g. a JRE-bearing image for a Java coordinator); both are required to
        compose an override, otherwise the executor default applies.
        """
        if (repo := extra.get("worker_container_repository")) and (tag := extra.get("worker_container_tag")):
            return f"{repo}:{tag}"
        return None


[docs]
    def execute_async(
        self,
        key: TaskInstanceKey,
        command: Any,
        queue: str | None = None,
        executor_config: Any | None = None,
    ) -> None:
        """
        Queue a task for pod creation.

        The task is marked ``QUEUED`` in the event buffer, a fresh pod-launch-attempt record is
        stored for *key*, and a ``KubernetesJob`` is put on the task queue for ``sync`` to pick
        up. If *executor_config* cannot be converted by ``PodGenerator.from_obj``, the task is
        failed immediately and nothing is queued.

        :param key: task instance key.
        :param command: payload stored on the job as-is.
        :param queue: queue name, used to look up coordinator-declared overrides.
        :param executor_config: per-task executor configuration; may carry ``pod_template_file``.
        """
        if TYPE_CHECKING:
            assert self.task_queue

        # executor_config is only included in the message at DEBUG level.
        if not self.log.isEnabledFor(logging.DEBUG):
            self.log.info("Add task %s with command %s", key, command)
        else:
            self.log.debug("Add task %s with command %s, executor_config %s", key, command, executor_config)

        try:
            kube_executor_config = PodGenerator.from_obj(executor_config)
        except Exception:
            self.log.error("Invalid executor_config for %s. Executor_config: %s", key, executor_config)
            self.fail(key=key, info="Invalid executor_config passed")
            return

        pod_template_file = executor_config.get("pod_template_file", None) if executor_config else None

        coordinator_kube_image: str | None = None
        coordinator_extra = self._coordinator_extra(queue)
        if coordinator_extra is not None:
            # A coordinator-level pod template takes precedence over the per-task one
            # (e.g. a JVM image for JavaCoordinator).
            template_override = self._coordinator_pod_template_file(coordinator_extra)
            if template_override is not None:
                self.log.debug(
                    "Using coordinator-declared pod template %s for task %s in queue %s",
                    template_override,
                    key,
                    queue,
                )
                pod_template_file = template_override

            # A pod template does not carry the base image, so the coordinator declares its
            # worker base image separately (e.g. a JRE image for a Java queue).
            coordinator_kube_image = self._coordinator_kube_image(coordinator_extra)
            if coordinator_kube_image is not None:
                self.log.debug(
                    "Using coordinator-declared base image %s for task %s in queue %s",
                    coordinator_kube_image,
                    key,
                    queue,
                )

        self.event_buffer[key] = (TaskInstanceState.QUEUED, self.scheduler_job_id)
        job = KubernetesJob(key, command, kube_executor_config, pod_template_file, coordinator_kube_image)
        self.pod_launch_attempts[key] = _PodLaunchAttempt(job=job)
        self.task_queue.put(job)



[docs]
    def queue_workload(self, workload: workloads.All, session: Session | None) -> None:
        """
        Store an ``ExecuteTask`` workload in ``queued_tasks`` under its task instance key.

        :param workload: the workload to queue.
        :param session: not used by this implementation.
        :raises RuntimeError: if *workload* is not an ``ExecuteTask``.
        """
        from airflow.executors import workloads

        if isinstance(workload, workloads.ExecuteTask):
            self.queued_tasks[workload.ti.key] = workload
            return
        raise RuntimeError(f"{type(self)} cannot handle workloads of type {type(workload)}")


    def _process_workloads(self, workloads: Sequence[workloads.All]) -> None:
        """
        Hand each queued ``ExecuteTask`` workload to ``execute_async`` (Airflow 3 path).

        For every workload, in order: its key is removed from ``queued_tasks``, the workload is
        submitted wrapped in a one-element list as the command, and the key is added to
        ``running``. Workloads preceding an unsupported one have already been submitted when the
        error is raised.

        :raises RuntimeError: on the first workload that is not an ``ExecuteTask``.
        """
        from airflow.executors.workloads import ExecuteTask

        for workload in workloads:
            if not isinstance(workload, ExecuteTask):
                raise RuntimeError(f"{type(self)} cannot handle workloads of type {type(workload)}")

            # TODO: AIP-72 handle populating tokens once https://github.com/apache/airflow/issues/45107 is handled.
            ti = workload.ti
            # Read everything needed from the task instance before touching queued_tasks.
            task_key, task_queue, task_config = ti.key, ti.queue, ti.executor_config or {}

            del self.queued_tasks[task_key]
            self.execute_async(
                key=task_key, command=[workload], queue=task_queue, executor_config=task_config
            )
            self.running.add(task_key)


[docs]
    def sync(self) -> None:
        """
        Synchronize task state.

        One pass does the following, in order:

        1. At most once per ``[scheduler] orphaned_tasks_check_interval`` seconds, adopt
           completed pods.
        2. Let the ``AirflowKubernetesScheduler`` run its own ``sync``.
        3. Drain the result queue, applying each result through ``_change_state``. A result
           whose state change raises is put back on the queue.
        4. Replay the results of adopted completed pods, keeping only those that raised.
        5. Update the shared ``ResourceVersion`` per namespace.
        6. Unless a rate limit is still in effect, create up to
           ``worker_pods_creation_batch_size`` pods from the task queue. Retryable Kubernetes
           API errors requeue the task (bounded by ``task_publish_max_retries``, ``-1`` meaning
           unlimited); an HTTP 429 additionally stops pod creation for this pass and until the
           ``Retry-After`` delay has elapsed. Any other handled error fails the task.
        """
        if TYPE_CHECKING:
            assert self.scheduler_job_id
            assert self.kube_scheduler
            assert self.kube_config
            assert self.result_queue
            assert self.task_queue
            assert self.kube_client

        adoption_interval = conf.getfloat("scheduler", "orphaned_tasks_check_interval", fallback=300.0)
        now = time.monotonic()
        if now - self._last_completed_pod_adoption >= adoption_interval:
            self._last_completed_pod_adoption = now
            self._adopt_completed_pods(self.kube_client)

        if self.running:
            self.log.debug("self.running: %s", self.running)
        if self.queued_tasks:
            self.log.debug("self.queued: %s", self.queued_tasks)
        self.kube_scheduler.sync()

        # Resource version of the last result seen per namespace during this pass ("0" if none).
        last_resource_version: dict[str, str] = defaultdict(lambda: "0")
        # get_nowait() raising Empty is what ends the drain loop.
        with contextlib.suppress(Empty):
            while True:
                results = self.result_queue.get_nowait()
                try:
                    last_resource_version[results.namespace] = results.resource_version
                    self.log.info("Changing state of %s to %s", results, results.state)
                    try:
                        self._change_state(results)
                    except Exception as e:
                        self.log.exception(
                            "Exception: %s when attempting to change state of %s to %s, re-queueing.",
                            e,
                            results,
                            results.state,
                        )
                        self.result_queue.put(results)
                finally:
                    self.result_queue.task_done()

        if self.completed:
            still_pending: dict[tuple[str, str], KubernetesResults] = {}
            for pod_key, result in self.completed.items():
                try:
                    self._change_state(result)
                except Exception:
                    self.log.exception(
                        "Exception when attempting to change state of adopted completed pod %s, will retry.",
                        result,
                    )
                    still_pending[pod_key] = result
            self.completed = still_pending

        from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_utils import ResourceVersion

        resource_instance = ResourceVersion()
        for ns in resource_instance.resource_version:
            resource_instance.resource_version[ns] = (
                last_resource_version[ns] or resource_instance.resource_version[ns]
            )

        from kubernetes.client.rest import ApiException

        if self.create_pods_after and self.create_pods_after > datetime.now():
            self.log.warning("Skipping pod creation due to kubernetes rate limit")
            return

        self.create_pods_after = None

        with contextlib.suppress(Empty):
            for _ in range(self.kube_config.worker_pods_creation_batch_size):
                task = self.task_queue.get_nowait()
                # Bound before the try block so every handler below can rely on it.
                key = task.key

                try:
                    self.kube_scheduler.run_next(task)
                    self.task_publish_retries.pop(key, None)
                except PodReconciliationError as e:
                    self.log.exception(
                        "Pod reconciliation failed, likely due to kubernetes library upgrade. "
                        "Try clearing the task to re-run.",
                    )
                    self.fail(key, e)
                except ApiException as e:
                    try:
                        if e.body:
                            body = json.loads(e.body)
                        else:
                            # If no body content, use reason as the message
                            body = {"message": e.reason}
                    except (json.JSONDecodeError, ValueError, TypeError):
                        # If the body is a string (e.g., in a 429 error), it can't be parsed as JSON.
                        # Use the body directly as the message instead.
                        body = {"message": e.body}
                    if not isinstance(body, dict):
                        # Valid JSON that is not an object (a bare string, number or list) has no
                        # "message" field to read; use the raw body as the message.
                        body = {"message": e.body}

                    headers = e.headers or {}
                    retries = self.task_publish_retries[key]
                    # In case of exceeded quota or conflict errors, requeue the task as per the task_publish_max_retries
                    # In case of a rate limit, wait and do not create new pods for "Retry-After" seconds
                    can_retry_publish = (
                        self.task_publish_max_retries == -1 or retries < self.task_publish_max_retries
                    )
                    # The substring checks below need a str; the message may be absent or null.
                    message = body.get("message") or ""
                    if not isinstance(message, str):
                        message = str(message)
                    if (
                        (str(e.status) == "403" and "exceeded quota" in message)
                        or (str(e.status) == "409" and "object has been modified" in message)
                        or (str(e.status) == "410" and "too old resource version" in message)
                        or str(e.status) == "500"
                        or str(e.status) == "429"
                    ) and can_retry_publish:
                        self.log.warning(
                            "[Try %s of %s] Kube ApiException for Task: (%s). Reason: %r. Message: %s",
                            self.task_publish_retries[key] + 1,
                            self.task_publish_max_retries,
                            key,
                            e.reason,
                            message,
                        )

                        self.task_queue.put(task)
                        self.task_publish_retries[key] = retries + 1

                        if str(e.status) == "429":
                            try:
                                retry_after_seconds = int(headers.get("Retry-After", "0"))
                            except (TypeError, ValueError):
                                # Retry-After may also be an HTTP-date, which is not interpreted
                                # here; fall back to no delay rather than aborting the whole sync.
                                retry_after_seconds = 0
                            self.create_pods_after = datetime.now() + timedelta(seconds=retry_after_seconds)
                            self.log.warning(
                                "Got rate limit from k8s api, skipping pod creation until %s",
                                self.create_pods_after,
                            )
                            # stop pod creation to stop api requests
                            break
                    else:
                        self.log.error("Pod creation failed with reason %r. Failing task", e.reason)
                        self.fail(key, e)
                        self.task_publish_retries.pop(key, None)
                except PodMutationHookException as e:
                    self.log.error(
                        "Pod Mutation Hook failed for the task %s. Failing task. Details: %s",
                        key,
                        e.__cause__,
                    )
                    self.fail(key, e)
                finally:
                    # Runs on every path, including the rate-limit ``break`` above.
                    self.task_queue.task_done()


    @provide_session
    def _change_state(
        self,
        results: KubernetesResults,
        *,
        session: Session = NEW_SESSION,
    ) -> None:
        """
        Change state of the task based on KubernetesResults.

        Handling by ``results.state``:

        - ``FAILED``: the failure is logged (with the watcher's pre-collected details when
          present), then processing continues with the common path below.
        - ``ADOPTED``: the key is dropped from ``pod_launch_attempts`` and ``running``; nothing
          is written to the event buffer.
        - ``RUNNING``: the state is written to the event buffer and nothing else happens.
        - anything else (including ``None``): common path below.

        Common path: the pod is deleted or patched as done according to the kube config; a
        failed pod that qualifies as a pre-execution failure may be requeued instead of being
        reported; otherwise the key leaves ``running`` and the final state is written to the
        event buffer.

        :param results: result emitted for a worker pod.
        :param session: database session, used to look up the task instance state.
        """
        if TYPE_CHECKING:
            assert self.kube_scheduler

        key = results.key
        state = results.state
        pod_name = results.pod_name
        namespace = results.namespace
        failure_details = results.failure_details

        # Stays None unless the pod failed and failure details are available.
        termination_reason: str | None = None

        if state == TaskInstanceState.FAILED:
            # Use pre-collected failure details from the watcher to avoid additional API calls
            if failure_details:
                pod_status = failure_details.get("pod_status")
                pod_reason = failure_details.get("pod_reason")
                pod_message = failure_details.get("pod_message")
                container_state = failure_details.get("container_state")
                container_reason = failure_details.get("container_reason")
                container_message = failure_details.get("container_message")
                exit_code = failure_details.get("exit_code")
                container_type = failure_details.get("container_type")
                container_name = failure_details.get("container_name")

                termination_reason = f"Pod failed because of {pod_reason}"

                task_key_str = f"{key.dag_id}.{key.task_id}.{key.try_number}"
                self.log.warning(
                    "Task %s failed in pod %s/%s. Pod phase: %s, reason: %s, message: %s, "
                    "container_type: %s, container_name: %s, container_state: %s, container_reason: %s, "
                    "container_message: %s, exit_code: %s",
                    task_key_str,
                    namespace,
                    pod_name,
                    pod_status,
                    pod_reason,
                    pod_message,
                    container_type,
                    container_name,
                    container_state,
                    container_reason,
                    container_message,
                    exit_code,
                )
            else:
                task_key_str = f"{key.dag_id}.{key.task_id}.{key.try_number}"
                self.log.warning(
                    "Task %s failed in pod %s/%s (no details available)", task_key_str, namespace, pod_name
                )

        if state == ADOPTED:
            # When the task pod is adopted by another executor,
            # then remove the task from the current executor running queue.
            self.pod_launch_attempts.pop(key, None)
            try:
                self.running.remove(key)
            except KeyError:
                self.log.debug("TI key not in running: %s", key)
            return

        if state == TaskInstanceState.RUNNING:
            self.event_buffer[key] = state, None
            return

        # Pod cleanup. With delete_worker_pods enabled, a FAILED pod is only deleted when
        # delete_worker_pods_on_failure is also enabled; otherwise it is left untouched (neither
        # deleted nor patched). With delete_worker_pods disabled, the pod is patched as done.
        if self.kube_config.delete_worker_pods:
            if state != TaskInstanceState.FAILED or self.kube_config.delete_worker_pods_on_failure:
                self.kube_scheduler.delete_pod(pod_name=pod_name, namespace=namespace)
                self.log.info(
                    "Deleted pod associated with the TI %s. Pod name: %s. Namespace: %s",
                    key,
                    pod_name,
                    namespace,
                )
        else:
            self.kube_scheduler.patch_pod_executor_done(pod_name=pod_name, namespace=namespace)
            # NOTE(review): the message says "pod %s" but the TI key is passed, not pod_name.
            self.log.info("Patched pod %s in namespace %s to mark it as done", key, namespace)

        # Only pods this executor launched and is still tracking can be requeued; checking the
        # in-memory attempt first avoids a metadata-db lookup for adopted or already-finalized pods.
        attempt = self.pod_launch_attempts.get(key)
        if (
            attempt is not None
            and state == TaskInstanceState.FAILED
            and self.pod_launch_failure_max_retries != 0
            and self._is_pre_execution_failure(
                state,
                self._get_task_instance_state(key, session=session),
                failure_details,
                self.pod_launch_failure_excluded_container_reasons,
            )
        ):
            if attempt.requeued_for_pod == pod_name:
                # Kubernetes can emit several Failed events for one pod; we already requeued
                # for this one, so ignore the duplicates instead of requeuing again.
                self.log.debug(
                    "Ignoring duplicate pre-execution failure for already-requeued pod %s/%s",
                    namespace,
                    pod_name,
                )
                return
            # -1 means unlimited requeues; otherwise requeue only while under the limit. When the
            # limit is exhausted, control falls through to the normal failure handling below.
            if (
                self.pod_launch_failure_max_retries == -1
                or attempt.attempts < self.pod_launch_failure_max_retries
            ):
                attempt.attempts += 1
                attempt.requeued_for_pod = pod_name
                self.log.warning(
                    "[Try %s of %s] Pod %s/%s for task %s failed before the task process started "
                    "(container_reason: %s). Requeuing without consuming a task retry.",
                    attempt.attempts,
                    self.pod_launch_failure_max_retries,
                    namespace,
                    pod_name,
                    key,
                    failure_details.get("container_reason") if failure_details else None,
                )
                # Leave the key in self.running and do not write to event_buffer: the scheduler
                # never observes this failure, so no task-level retry is consumed.
                if TYPE_CHECKING:
                    assert self.task_queue
                self.task_queue.put(attempt.job)
                return

        # The task is being finalized, so its requeue bookkeeping is no longer needed.
        self.pod_launch_attempts.pop(key, None)

        try:
            self.running.remove(key)
        except KeyError:
            self.log.debug("TI key not in running, not adding to event_buffer: %s", key)
            return

        # If we don't have a TI state, look it up from the db. event_buffer expects the TI state
        if state is None:
            state = self._get_task_instance_state(key, session=session)

        self.event_buffer[key] = state, termination_reason

    def _get_task_instance_state(self, key: TaskInstanceKey, *, session: Session) -> TaskInstanceState | None:
        """
        Fetch the state currently stored in the metadata database for the task instance *key*.

        :return: the stored state as a ``TaskInstanceState``, or ``None`` when no filter can be
            built for the key or the query yields no (or an empty) state.
        """
        from airflow.models.taskinstance import TaskInstance

        ti_filter = TaskInstance.filter_for_tis([key])
        if ti_filter is None:
            return None

        stored_state = session.scalar(select(TaskInstance.state).where(ti_filter))
        if not stored_state:
            return None
        return TaskInstanceState(stored_state)

    @staticmethod
    def _is_pre_execution_failure(
        state: TaskInstanceState | str | None,
        ti_state: TaskInstanceState | None,
        failure_details: FailureDetails | None,
        excluded_container_reasons: frozenset[str],
    ) -> bool:
        """
        Tell whether a failed pod died before its task process ever started.

        A pod counts as a pre-execution failure only when both of these hold:

        - ``state`` is ``FAILED``, i.e. the pod really terminated.
        - ``ti_state`` is ``QUEUED``, i.e. the task instance never moved to ``running`` and
          therefore no task code ran. This is the authoritative signal, independent of the
          concrete container failure reason (node drain, autoscaler scale-down, transient
          image pull error, deferrable resume pod killed before ``execute_complete``
          started, etc.).

        Even then, a pod whose ``container_reason`` is listed in
        ``excluded_container_reasons`` is *not* a pre-execution failure. The default
        exclusion of ``Error`` covers a container that began executing but whose worker
        process exited before writing ``running`` to the database, which most likely
        indicates an Airflow-specific startup error.

        :param state: state reported for the pod
        :param ti_state: state of the task instance as known to the database
        :param failure_details: optional failure details; only ``container_reason`` is read
        :param excluded_container_reasons: container reasons that disqualify the pod
        :return: ``True`` when the failure happened before the task process started
        """
        if not (state == TaskInstanceState.FAILED and ti_state == TaskInstanceState.QUEUED):
            return False

        # Missing or empty details simply mean there is no reason to exclude on.
        reason = failure_details.get("container_reason") if failure_details else None
        is_excluded = bool(reason) and reason in excluded_container_reasons
        return not is_excluded

    def _get_pod_namespace(self, ti: TaskInstance):
        pod_override = (ti.executor_config or {}).get("pod_override")
        namespace = None
        with suppress(Exception):
            if pod_override is not None:
                namespace = pod_override.metadata.namespace
        return namespace or self.conf.get("kubernetes_executor", "namespace")


[docs]
    def get_task_log(self, ti: TaskInstance, try_number: int) -> tuple[list[str], list[str]]:
        """
        Read the tail of a task's log straight from its worker pod through the kube API.

        Any failure (client creation, pod lookup, log read, decoding) is reported as a
        message instead of being raised, so this method never propagates an exception.

        :param ti: task instance whose pod log should be fetched
        :param try_number: try number used to select the matching pod
        :return: a ``(messages, logs)`` tuple; ``messages`` describes what was attempted and
            ``logs`` is a single-element list holding the joined log lines (an empty
            string when nothing could be read)
        """
        messages: list[str] = []
        log: list[str] = []
        try:
            from airflow.providers.cncf.kubernetes.kube_client import get_kube_client
            from airflow.providers.cncf.kubernetes.pod_generator import PodGenerator

            client = get_kube_client()

            messages.append(f"Attempting to fetch logs from pod {ti.hostname} through kube API")
            selector = PodGenerator.build_selector_for_k8s_executor_pod(
                dag_id=ti.dag_id,
                task_id=ti.task_id,
                try_number=try_number,
                map_index=ti.map_index,
                run_id=ti.run_id,
                airflow_worker=ti.queued_by_job_id,
            )
            namespace = self._get_pod_namespace(ti)
            pod_list = client.list_namespaced_pod(
                namespace=namespace,
                label_selector=selector,
            ).items
            # Exceptions do not interpolate logging-style ``%s`` arguments, so the text is
            # formatted up front; otherwise the message below would show a raw tuple.
            if not pod_list:
                raise RuntimeError(f"Cannot find pod for ti {ti}")
            if len(pod_list) > 1:
                raise RuntimeError(f"Found multiple pods for ti {ti}: {pod_list}")
            res = client.read_namespaced_pod_log(
                name=pod_list[0].metadata.name,
                namespace=namespace,
                container="base",
                follow=False,
                tail_lines=self.RUNNING_POD_LOG_LINES,
                _preload_content=False,
            )
            # Append line by line so that lines read before a mid-stream failure are kept.
            for line in res:
                log.append(remove_escape_codes(line.decode()))
            if log:
                messages.append("Found logs through kube API")
        except Exception as e:
            messages.append(f"Reading from k8s pod logs failed: {e}")
        return messages, ["\n".join(log)]



[docs]
    def try_adopt_task_instances(self, tis: Sequence[TaskInstance]) -> Sequence[TaskInstance]:
        """
        Try to take over the pods of the given task instances and report the ones left over.

        Pods labelled with the ``queued_by_job_id`` of the given task instances are
        relabelled for this executor via ``adopt_launched_task``; completed pods are
        handled by ``_adopt_completed_pods``. The whole operation is timed under
        ``kubernetes_executor.adopt_task_instances.duration``.

        :param tis: task instances that are candidates for adoption
        :return: the task instances that were not adopted (including every one without a
            ``queued_by_job_id``)
        """
        with Stats.timer(
            "kubernetes_executor.adopt_task_instances.duration",
            tags=prune_dict({"team_name": self.team_name}),
        ):
            # Single pass: TIs without queued_by_job_id are always flushed, the others are
            # adoption candidates indexed by key.
            tis_to_flush = []
            tis_to_flush_by_key = {}
            scheduler_job_ids = set()
            for ti in tis:
                scheduler_job_ids.add(ti.queued_by_job_id)
                if ti.queued_by_job_id:
                    tis_to_flush_by_key[ti.key] = ti
                else:
                    tis_to_flush.append(ti)

            kube_client: client.CoreV1Api = self.kube_client
            for scheduler_job_id in scheduler_job_ids:
                safe_label = self._make_safe_label_value(str(scheduler_job_id))
                # Look for any pod owned by the no-longer-running scheduler, excluding only
                # successful pods: their TIs already have a terminal state and are not up
                # for adoption. Failed workers are fine to adopt because their TI is still
                # queued.
                label_selector = (
                    f"kubernetes_executor=True,airflow-worker={safe_label},{POD_EXECUTOR_DONE_KEY}!=True"
                )
                query_kwargs = {
                    "field_selector": "status.phase!=Succeeded",
                    "label_selector": label_selector,
                }
                for pod in self._list_pods(query_kwargs):
                    self.adopt_launched_task(kube_client, pod, tis_to_flush_by_key)
            self._adopt_completed_pods(kube_client)

            # This method can be retried within a short time frame (it is wrapped in a
            # run_with_db_retries of scheduler_job_runner and may be retried after an
            # OperationalError, for example). On the second attempt adopt_launched_task may
            # never be called because every pod was adopted the first time, leaving
            # already-adopted TIs in tis_to_flush_by_key. Those must not be flushed.
            for key, ti in tis_to_flush_by_key.items():
                if key in self.running:
                    self.log.info("%s is already adopted, no need to flush.", ti)
                    continue
                tis_to_flush.append(ti)
            return tis_to_flush


    @deprecated(
        reason="Replaced by function `revoke_task`. Upgrade airflow core to make this go away.",
        category=AirflowProviderDeprecationWarning,
    )

[docs]
    def cleanup_stuck_queued_tasks(self, tis: list[TaskInstance]) -> list[str]:
        """
        Clean up after task instances that were failed for being stuck in queued.

        Tasks can get stuck in queued. If such a task is detected, it will be marked
        as `UP_FOR_RETRY` if the task instance has remaining retries or marked as `FAILED`
        if it doesn't.

        Each given task instance is revoked and then reported as failed to this executor.

        :param tis: List of Task Instances to clean up
        :return: List of readable task instances for a warning message
        """
        readable_tis = []
        for stuck_ti in tis:
            # Capture the representation first, before the task instance is revoked.
            description = repr(stuck_ti)
            readable_tis.append(description)
            self.revoke_task(ti=stuck_ti)
            self.fail(stuck_ti.key)
        return readable_tis



[docs]
    def revoke_task(self, *, ti: TaskInstance):
        """
        Revoke a task that may be running.

        The task is dropped from this executor's ``running`` and ``queued_tasks``
        bookkeeping; if a matching pod is found it is marked as revoked and then deleted.
        When no pod matches, a warning is logged and nothing else happens.

        :param ti: task instance to revoke
        """
        if TYPE_CHECKING:
            assert self.kube_client
            assert self.kube_scheduler

        self.running.discard(ti.key)
        self.queued_tasks.pop(ti.key, None)
        pods_by_search_str = self.get_pod_combined_search_str_to_pod_map()

        # Assemble the lookup key piece by piece: dag, task, optional map index, run.
        selector_parts = [f"dag_id={ti.dag_id}", f"task_id={ti.task_id}"]
        if ti.map_index >= 0:
            # Old tasks _couldn't_ be mapped, so we don't have to worry about compat
            selector_parts.append(f"map_index={ti.map_index}")
        selector_parts.append(f"run_id={ti.run_id}")
        search_str = ",".join(selector_parts)

        pod = pods_by_search_str.get(search_str, None)
        if not pod:
            self.log.warning("Cannot find pod for ti %s", ti)
            return

        pod_name = pod.metadata.name
        pod_namespace = pod.metadata.namespace
        self.kube_scheduler.patch_pod_revoked(pod_name=pod_name, namespace=pod_namespace)
        self.kube_scheduler.delete_pod(pod_name=pod_name, namespace=pod_namespace)



[docs]
    def adopt_launched_task(
        self,
        kube_client: client.CoreV1Api,
        pod: k8s.V1Pod,
        tis_to_flush_by_key: dict[TaskInstanceKey, TaskInstance],
    ) -> None:
        """
        Patch existing pod so that the current KubernetesJobWatcher can monitor it via label selectors.

        On success the pod's task instance key is removed from ``tis_to_flush_by_key``
        (mutating the caller's dict) and added to ``self.running``. Nothing is changed
        when the pod's key is not in ``tis_to_flush_by_key`` or when the patch request
        fails with an ``ApiException``.

        :param kube_client: kubernetes client for speaking to kube API
        :param pod: V1Pod spec that we will patch with new label
        :param tis_to_flush_by_key: task instances, indexed by key, that will be flushed
            if they aren't adopted
        """
        if TYPE_CHECKING:
            assert self.scheduler_job_id

        self.log.info("attempting to adopt pod %s", pod.metadata.name)
        ti_key = annotations_to_key(pod.metadata.annotations)
        if ti_key not in tis_to_flush_by_key:
            self.log.error("attempting to adopt taskinstance which was not specified by database: %s", ti_key)
            return

        new_worker_id_label = self._make_safe_label_value(self.scheduler_job_id)
        from kubernetes.client.rest import ApiException

        try:
            # Relabel the pod with this scheduler's job id.
            kube_client.patch_namespaced_pod(
                name=pod.metadata.name,
                namespace=pod.metadata.namespace,
                body={"metadata": {"labels": {"airflow-worker": new_worker_id_label}}},
            )
        except ApiException as e:
            # Leave the TI in tis_to_flush_by_key so the caller still flushes it.
            self.log.info("Failed to adopt pod %s. Reason: %s", pod.metadata.name, e)
            return

        del tis_to_flush_by_key[ti_key]
        self.running.add(ti_key)


    def _alive_other_scheduler_job_ids(self) -> set[int]:
        """
        Return job IDs of every SchedulerJob that is currently alive — excluding self.

        "Alive" means ``Job.state == RUNNING`` AND its ``latest_heartbeat`` is
        within ``[scheduler] scheduler_health_check_threshold``.

        Used by ``_adopt_completed_pods`` to scope cross-scheduler pod
        adoption to pods owned by no-longer-alive schedulers (#66396).
        With a single scheduler the returned set is always empty — the
        original "exclude self only" behavior is preserved. With multiple
        schedulers each one only adopts pods whose owning scheduler is gone,
        eliminating the relabel-thrash that PR #61839 introduced.

        Returns an empty set on any DB error so the caller falls back to
        the pre-#61839 "exclude self only" selector — a transient DB issue
        must not break completed-pod cleanup.
        """
        if TYPE_CHECKING:
            assert self.scheduler_job_id

        try:
            self_id = int(self.scheduler_job_id)
        except (TypeError, ValueError):
            # Tests sometimes set scheduler_job_id to a non-numeric string.
            # In production it's always Job.id (int), but be defensive.
            return set()

        try:
            from datetime import timedelta

            from sqlalchemy import select

            from airflow.jobs.job import Job
            from airflow.utils import timezone
            from airflow.utils.session import create_session
            from airflow.utils.state import JobState

            timeout = conf.getint("scheduler", "scheduler_health_check_threshold")
            cutoff = timezone.utcnow() - timedelta(seconds=timeout)
            # Must be an *independent* (non-scoped) session. try_adopt_task_instances runs
            # inside the scheduler's own transaction (adopt_or_reset_orphaned_tasks); a scoped
            # session here would resolve to that same thread-local session, and the context
            # manager's commit()/close() on exit would commit the scheduler's in-flight work
            # early (releasing its FOR UPDATE SKIP LOCKED row locks) and detach the orphaned
            # TaskInstances it still holds, crashing the reset path (#67813).
            with create_session(scoped=False) as session:
                # Iterate the scalar cursor straight into the set so we never
                # materialize an intermediate list — keeps the memory
                # footprint flat regardless of how many sibling schedulers
                # are alive
                return {
                    jid
                    for jid in session.scalars(
                        select(Job.id).where(
                            Job.job_type == "SchedulerJob",
                            Job.state == JobState.RUNNING,
                            Job.latest_heartbeat >= cutoff,
                            Job.id != self_id,
                        )
                    )
                }
        except Exception as exc:
            self.log.warning(
                "Could not query alive SchedulerJobs for completed-pod adoption "
                "scoping: %s. Falling back to exclude-self-only.",
                exc,
            )
            return set()

    def _adopt_completed_pods(self, kube_client: client.CoreV1Api) -> None:
        """
        Patch completed pods owned by no-longer-alive schedulers so this scheduler's watcher can delete them.

        Historically every Succeeded pod that did not carry THIS scheduler's
        ``airflow-worker`` label was patched. In multi-scheduler deployments that caused
        thrashing: each scheduler relabeled every other scheduler's completed pods on each
        interval tick, fighting over ownership and burning kube-API and watcher cycles
        (see #66396).

        The selector therefore also excludes pods owned by every currently-alive sibling
        scheduler. With one scheduler nothing changes (no siblings, so the original
        "exclude self only" selector is used). With multiple schedulers, each one only
        adopts pods whose owning scheduler is gone, keeping the original goal of #61839
        (cleanup after a scheduler restart) without the multi-scheduler regression.

        Every successfully patched pod is recorded in ``self.completed``.

        :param kube_client: kubernetes client for speaking to kube API
        """
        if TYPE_CHECKING:
            assert self.scheduler_job_id

        own_label = self._make_safe_label_value(self.scheduler_job_id)
        sibling_labels = {
            self._make_safe_label_value(str(job_id)) for job_id in self._alive_other_scheduler_job_ids()
        }
        excluded_labels = sorted(sibling_labels | {own_label})

        if len(excluded_labels) > 1:
            # Set-based requirement: K8s parses `notin (a,b,c)` as "label value is none
            # of these". It is combined with the surrounding equality-based requirements
            # through the comma separator.
            worker_filter = f"airflow-worker notin ({','.join(excluded_labels)})"
        else:
            # Equality-based selector: identical to the pre-fix label_selector when no
            # sibling scheduler is alive, so single-scheduler deployments see no change.
            worker_filter = f"airflow-worker!={excluded_labels[0]}"

        query_kwargs = {
            "field_selector": "status.phase=Succeeded",
            "label_selector": f"kubernetes_executor=True,{worker_filter},{POD_EXECUTOR_DONE_KEY}!=True",
        }
        completed_pods = self._list_pods(query_kwargs)

        from kubernetes.client.rest import ApiException

        for pod in completed_pods:
            pod_name = pod.metadata.name
            namespace = pod.metadata.namespace
            self.log.info("Attempting to adopt pod %s", pod_name)
            try:
                kube_client.patch_namespaced_pod(
                    name=pod_name,
                    namespace=namespace,
                    body={"metadata": {"labels": {"airflow-worker": own_label}}},
                )
            except ApiException as e:
                # A pod that cannot be relabeled is skipped; the remaining pods are still tried.
                self.log.info("Failed to adopt pod %s. Reason: %s", pod_name, e)
                continue

            self.completed[(namespace, pod_name)] = KubernetesResults(
                key=annotations_to_key(pod.metadata.annotations),
                state="completed",
                pod_name=pod_name,
                namespace=namespace,
                resource_version=pod.metadata.resource_version,
                failure_details=None,
            )

    def _flush_task_queue(self) -> None:
        if TYPE_CHECKING:
            assert self.task_queue

        self.log.debug("Executor shutting down, task_queue approximate size=%d", self.task_queue.qsize())
        with contextlib.suppress(Empty):
            while True:
                task = self.task_queue.get_nowait()
                # This is a new task to run thus ok to ignore.
                self.log.warning("Executor shutting down, will NOT run task=%s", task)
                self.task_queue.task_done()

    def _flush_result_queue(self) -> None:
        if TYPE_CHECKING:
            assert self.result_queue

        self.log.debug("Executor shutting down, result_queue approximate size=%d", self.result_queue.qsize())
        with contextlib.suppress(Empty):
            while True:
                results = self.result_queue.get_nowait()
                self.log.warning("Executor shutting down, flushing results=%s", results)
                try:
                    self.log.info(
                        "Changing state of %s to %s : resource_version=%s",
                        results,
                        results.state,
                        results.resource_version,
                    )
                    try:
                        self._change_state(results)
                    except Exception as e:
                        self.log.exception(
                            "Ignoring exception: %s when attempting to change state of %s to %s.",
                            e,
                            results,
                            results.state,
                        )
                finally:
                    self.result_queue.task_done()


[docs]
    def end(self) -> None:
        """
        Shut down the executor.

        Flushes the task and result queues, terminates the kube scheduler, shuts down the
        Manager and resets the executor to its unstarted state. Errors raised while
        flushing or while terminating the kube scheduler are logged and do not prevent
        the Manager from being shut down. Calling this when the executor was never
        started (or was already ended) is a no-op.
        """
        if self._manager is None:
            # start() was never called (e.g. the executor was only constructed to read task
            # logs), so there is no Manager process or queues to shut down.
            return
        if TYPE_CHECKING:
            assert self.task_queue
            assert self.result_queue
            assert self.kube_scheduler
            assert self._manager

        self.log.info("Shutting down Kubernetes executor")
        try:
            self.log.debug("Flushing task_queue...")
            self._flush_task_queue()
            self.log.debug("Flushing result_queue...")
            self._flush_result_queue()
            # Both queues should be empty...
            self.task_queue.join()
            self.result_queue.join()
        except ConnectionResetError:
            self.log.exception("Connection Reset error while flushing task_queue and result_queue.")
        except Exception:
            self.log.exception("Unknown error while flushing task queue and result queue.")
        if self.kube_scheduler:
            try:
                self.kube_scheduler.terminate()
            except Exception:
                # Distinct message so a failed scheduler termination is not mistaken for a
                # queue flushing problem when reading the logs.
                self.log.exception("Unknown error while terminating kube_scheduler.")
        self._manager.shutdown()
        # Return to the unstarted state so a second end() is a no-op (the guard above) and the
        # Manager/queues are recreated cleanly if start() is ever called again.
        self._manager = None
        self.task_queue = None
        self.result_queue = None



[docs]
    def terminate(self):
        """Terminate the executor; this implementation intentionally does nothing."""


    @staticmethod

[docs]
    def get_cli_commands() -> list[GroupCommand]:
        """Return the CLI commands built by ``get_kubernetes_cli_commands``."""
        # Imported at call time, as in the original, rather than at module import.
        from airflow.providers.cncf.kubernetes.cli.definition import (
            get_kubernetes_cli_commands as build_cli_commands,
        )

        return build_cli_commands()