Source code for airflow.providers.cncf.kubernetes.utils.pod_manager

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Launches PODs."""

from __future__ import annotations

import asyncio
import enum
import json
import logging
import math
import re
import time
from collections.abc import Callable, Generator, Iterable
from contextlib import closing
from dataclasses import dataclass
from datetime import timedelta
from typing import TYPE_CHECKING, Literal, cast

import pendulum
from kubernetes import client, watch
from kubernetes.client.rest import ApiException
from kubernetes.stream import stream as kubernetes_stream
from pendulum import DateTime
from pendulum.parsing.exceptions import ParserError
from urllib3.exceptions import HTTPError, TimeoutError

from airflow.providers.cncf.kubernetes.callbacks import ExecutionMode, KubernetesPodOperatorCallback
from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import (
    KubernetesApiException,
    PodLaunchFailedException,
    generic_api_retry,
)
from airflow.providers.cncf.kubernetes.utils.container import (
    container_is_completed,
    container_is_running,
    container_is_terminated,
    container_is_wait,
    get_container_status,
)
from airflow.providers.cncf.kubernetes.utils.xcom_sidecar import PodDefaults
from airflow.providers.common.compat.sdk import AirflowException, timezone
from airflow.utils.log.logging_mixin import LoggingMixin

if TYPE_CHECKING:
    from kubernetes.client.models.core_v1_event import CoreV1Event
    from kubernetes.client.models.core_v1_event_list import CoreV1EventList
    from kubernetes.client.models.v1_container_state import V1ContainerState
    from kubernetes.client.models.v1_container_state_waiting import V1ContainerStateWaiting
    from kubernetes.client.models.v1_object_reference import V1ObjectReference
    from kubernetes.client.models.v1_pod import V1Pod
    from kubernetes.client.models.v1_pod_condition import V1PodCondition
    from urllib3.response import HTTPResponse

    from airflow.providers.cncf.kubernetes.hooks.kubernetes import AsyncKubernetesHook



[docs]
# Marker string standing in for an absent XCom result (described by the docstring below).
EMPTY_XCOM_RESULT = "__airflow_xcom_result_empty__"

"""
Sentinel for no xcom result.

:meta private:
"""

# Matches an optional ``[``, a level keyword, an optional ``]`` and an optional ``:``/``-``
# separator at the start of a line. The ``\b`` after the keyword requires the keyword to be a
# whole word, so ordinary words that merely start with a level name ("Debugging", "Errors",
# "Information") are not mistaken for a level prefix.
_POD_LOG_LEVEL_PATTERN = re.compile(
    r"^\s*(?:\[)?(DEBUG|INFO|WARNING|WARN|ERROR|CRITICAL|FATAL)\b(?:\])?\s*[:\-]?\s*",
    re.IGNORECASE,
)
# Upper-cased level keyword -> ``logging`` level constant.
_POD_LOG_LEVEL_MAP: dict[str, int] = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "WARN": logging.WARNING,
    "ERROR": logging.ERROR,
    "CRITICAL": logging.CRITICAL,
    "FATAL": logging.CRITICAL,
}


def _parse_log_level(message: str) -> int:
    """
    Detect the Python logging level from a pod log line's prefix.

    Recognises common formats: ``ERROR:``, ``[ERROR]``, ``WARNING -``, etc. The keyword is
    matched case-insensitively and must be a complete word.

    :param message: A single pod log message (without the Kubernetes timestamp).
    :return: The matching ``logging`` level, or ``logging.INFO`` when no known prefix is
        found (backwards-compatible).
    """
    match = _POD_LOG_LEVEL_PATTERN.match(message)
    if match is None:
        return logging.INFO
    return _POD_LOG_LEVEL_MAP.get(match.group(1).upper(), logging.INFO)



[docs]
class XComRetrievalError(AirflowException):
    """Signal that it was not possible to get the XCom result."""




[docs]
class PodPhase:
    """
    Possible pod phases.

    See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase.
    """

    # Phase strings, spelled exactly as they are compared against ``pod.status.phase``.
    PENDING = "Pending"
    RUNNING = "Running"
    FAILED = "Failed"
    SUCCEEDED = "Succeeded"
    UNKNOWN = "Unknown"

    # Phases grouped as terminal: ``Failed`` and ``Succeeded``.
    terminal_states = {FAILED, SUCCEEDED}





[docs]
def check_exception_is_kubernetes_api_unauthorized(exc: BaseException) -> bool:
    """
    Tell whether an exception is a Kubernetes API ``401`` error.

    :param exc: The exception to inspect.
    :return: ``True`` only when ``exc`` is an ``ApiException`` whose ``status`` renders as
        ``"401"``; ``False`` otherwise (including when the status is missing).
    """
    # ``str()`` lets the comparison work for both integer and string statuses. A missing status
    # (``None`` / ``0``) stringifies to something other than "401", so the result is always a
    # real ``bool`` rather than the falsy status value itself.
    return isinstance(exc, ApiException) and str(exc.status) == "401"




[docs]
def log_pod_event(
    pod_manager: PodManager | AsyncPodManager, event: CoreV1Event, seen_events: set[str]
) -> None:
    """
    Write a pod event to the pod manager's log the first time it is encountered.

    Events are de-duplicated by ``event.metadata.uid``; the UID is recorded in
    ``seen_events`` (mutated in place) before the event is logged.

    :param pod_manager: The pod manager instance whose ``log`` is used for output
    :param event: Kubernetes event
    :param seen_events: Set of event UIDs already logged to avoid duplicates
    """
    uid = event.metadata.uid
    if uid in seen_events:
        # Already reported in an earlier pass.
        return

    seen_events.add(uid)
    origin: V1ObjectReference = event.involved_object
    pod_manager.log.info("The Pod has an Event: %s from %s", event.message, origin.field_path)




[docs]
async def await_pod_start(
    pod_manager: PodManager | AsyncPodManager,
    pod: V1Pod,
    schedule_timeout: int = 120,
    startup_timeout: int = 120,
    check_interval: float = 1,
):
    """
    Monitor the startup phase of a Kubernetes pod, waiting for it to leave the ``Pending`` state.

    This function is shared by both PodManager and AsyncPodManager to provide consistent pod startup tracking.
    Whether ``read_pod`` has to be awaited is derived from the type of ``pod_manager``.

    On every exit path handled here (pod started, timeout, early failure) the
    ``::group::`` log section opened at the start is closed and
    ``pod_manager.stop_watching_events`` is set to ``True``.

    :param pod_manager: The pod manager instance (PodManager or AsyncPodManager).
    :param pod: The pod object to monitor.
    :param schedule_timeout: Maximum time (in seconds) to wait for the pod to be scheduled.
    :param startup_timeout: Maximum time (in seconds) to wait for the pod to start running after being scheduled.
    :param check_interval: Interval (in seconds) between status checks.
    :raises PodLaunchTimeoutException: If the pod is not scheduled within ``schedule_timeout``
        or, once scheduled, still pending after ``startup_timeout``.
    :raises PodLaunchFailedException: If ``detect_pod_terminate_early_issues`` reports a problem.
    """
    pod_manager.log.info("::group::Waiting up to %ss to get the POD scheduled...", schedule_timeout)
    pod_was_scheduled = False
    start_check_time = time.time()
    is_async = isinstance(pod_manager, AsyncPodManager)
    while True:
        if is_async:
            remote_pod = await pod_manager.read_pod(pod)
        else:
            remote_pod = pod_manager.read_pod(pod)
        pod_status = remote_pod.status
        if pod_status.phase not in (PodPhase.PENDING, PodPhase.UNKNOWN):
            pod_manager.stop_watching_events = True
            pod_manager.log.info("::endgroup::")
            break

        # Check for timeout
        pod_conditions: list[V1PodCondition] = pod_status.conditions
        if pod_conditions and any(
            (condition.type == "PodScheduled" and condition.status == "True") for condition in pod_conditions
        ):
            if not pod_was_scheduled:
                # POD was initially scheduled update timeout for getting POD launched
                pod_was_scheduled = True
                start_check_time = time.time()
                pod_manager.log.info("Waiting %ss to get the POD running...", startup_timeout)

            if time.time() - start_check_time >= startup_timeout:
                pod_manager.stop_watching_events = True
                pod_manager.log.info("::endgroup::")
                raise PodLaunchTimeoutException(
                    f"Pod took too long to start. More than {startup_timeout}s. Check the pod events in kubernetes."
                )
        else:
            if time.time() - start_check_time >= schedule_timeout:
                pod_manager.stop_watching_events = True
                pod_manager.log.info("::endgroup::")
                raise PodLaunchTimeoutException(
                    f"Pod took too long to be scheduled on the cluster, giving up. More than {schedule_timeout}s. Check the pod events in kubernetes."
                )

        # Check for general problems to terminate early
        error_message = detect_pod_terminate_early_issues(remote_pod)
        if error_message:
            # Stop the event watcher here too, exactly as the timeout paths above do.
            pod_manager.stop_watching_events = True
            pod_manager.log.info("::endgroup::")
            raise PodLaunchFailedException(error_message)

        await asyncio.sleep(check_interval)




[docs]
def detect_pod_terminate_early_issues(pod: V1Pod) -> str | None:
    """
    Look for container start-up problems that make further waiting pointless.

    Every container status whose state is ``waiting`` is inspected in order:

    * ``InvalidImageName`` and ``ErrImageNeverPull`` are always reported.
    * ``ErrImagePull`` and ``ImagePullBackOff`` are reported unless the waiting message
      contains one of the transient markers below (e.g. rate limits), in which case that
      container is left for Kubernetes to retry.

    :param pod: The pod object to check.
    :return: An error message for the first offending container; ``None`` if there is none
        (including when the pod has no container statuses yet).
    """
    # Lower-case fragments of a waiting message that point at a transient problem.
    transient_markers = (
        "pull qps exceeded",
        "rate limit",
        "too many requests",
        "quota exceeded",
        "temporarily unavailable",
        "timeout",
        "account limit",
        # Upstream registry/auth outages (e.g. Docker Hub auth returning 5xx).
        # kubelet retries these automatically and the caller's startup_timeout still
        # bounds the total wait. The fragments match both the underlying ErrImagePull
        # message and the ImagePullBackOff message on kubelet >= 1.32, which appends
        # the previous pull error to "Back-off pulling image ...".
        # Older kubelets emit a bare ImagePullBackOff message without detail, so the
        # fail-fast path below still applies there; matching "back-off pulling"
        # unconditionally would turn a genuinely missing image into a 120s wait.
        "bad gateway",
        "service unavailable",
        "gateway timeout",
    )
    always_fatal = ("InvalidImageName", "ErrImageNeverPull")
    fatal_unless_transient = ("ErrImagePull", "ImagePullBackOff")
    report_template = "Image cannot be pulled, unable to start: {reason}\n{message}"

    for status in pod.status.container_statuses or ():
        waiting: V1ContainerStateWaiting | None = status.state.waiting
        if not waiting:
            continue

        reason = waiting.reason
        if reason in always_fatal:
            give_up = True
        elif reason in fatal_unless_transient:
            lowered = (waiting.message or "").lower()
            give_up = not any(marker in lowered for marker in transient_markers)
        else:
            give_up = False

        if give_up:
            return report_template.format(reason=reason, message=waiting.message or "")
    return None




[docs]
class PodLaunchTimeoutException(AirflowException):
    """
    When pod does not leave the ``Pending`` phase within specified timeout.

    Raised by the module-level ``await_pod_start`` when either the scheduling timeout or the
    startup timeout elapses.
    """




[docs]
class PodNotFoundException(AirflowException):
    """Signal that the expected pod does not exist in kube-api."""




[docs]
class PodCommandException(AirflowException):
    """Signal that the execution of a command in a pod failed."""




[docs]
class PodLogsConsumer:
    """
    Responsible for pulling pod logs from a stream with checking a container status before reading data.

    This class is a workaround for the issue https://github.com/apache/airflow/issues/23497.

    :param response: HTTP response with logs
    :param pod: Pod instance from Kubernetes client
    :param pod_manager: Pod manager instance
    :param container_name: Name of the container that we're reading logs from
    :param post_termination_timeout: (Optional) The period of time in seconds representing for how long time
        logs are available after the container termination.
    :param read_pod_cache_timeout: (Optional) The container's status cache lifetime.
        The container status is cached to reduce API calls.

    :meta private:
    """

    def __init__(
        self,
        response: HTTPResponse,
        pod: V1Pod,
        pod_manager: PodManager,
        container_name: str,
        post_termination_timeout: int = 120,
        read_pod_cache_timeout: int = 120,
    ):
        self.response = response
        self.pod = pod
        self.pod_manager = pod_manager
        self.container_name = container_name
        self.post_termination_timeout = post_termination_timeout
        # Time at which ``read_pod_cache`` was last refreshed; ``None`` until the first read.
        self.last_read_pod_at = None
        # Pod object returned by the most recent ``pod_manager.read_pod`` call.
        self.read_pod_cache = None
        self.read_pod_cache_timeout = read_pod_cache_timeout

    def __iter__(self) -> Generator[bytes, None, None]:
        r"""
        Yield log items divided by the '\n' symbol.

        Complete items are yielded with their trailing ``b"\n"``. Data that arrives without a
        newline is buffered and prepended to the next complete item; whatever is still
        buffered when reading stops is yielded last, without a newline. Reading stops when
        the stream ends or ``logs_available()`` turns false.
        """
        incomplete_log_item: list[bytes] = []
        if self.logs_available():
            for data_chunk in self.response.stream(amt=None, decode_content=True):
                if b"\n" in data_chunk:
                    log_items = data_chunk.split(b"\n")
                    yield from self._extract_log_items(incomplete_log_item, log_items)
                    incomplete_log_item = self._save_incomplete_log_item(log_items[-1])
                else:
                    incomplete_log_item.append(data_chunk)
                if not self.logs_available():
                    break
        if incomplete_log_item:
            yield b"".join(incomplete_log_item)

    @staticmethod
    def _extract_log_items(incomplete_log_item: list[bytes], log_items: list[bytes]):
        """Yield every complete item of a split chunk, completing the buffered partial item first."""
        yield b"".join(incomplete_log_item) + log_items[0] + b"\n"
        for x in log_items[1:-1]:
            yield x + b"\n"

    @staticmethod
    def _save_incomplete_log_item(sub_chunk: bytes):
        """
        Return the buffer to carry over to the next chunk.

        An empty ``sub_chunk`` means the chunk ended exactly on a newline, so nothing is
        pending and an empty buffer is returned; this keeps ``__iter__`` from yielding a
        spurious empty item at the end of the stream.
        """
        return [sub_chunk] if sub_chunk else []

    def logs_available(self):
        """
        Tell whether logs can still be read for the container.

        :return: ``True`` while the container is running, or while less than
            ``post_termination_timeout`` seconds have passed since its recorded termination
            time; ``False`` otherwise.
        """
        remote_pod = self.read_pod()
        if container_is_running(pod=remote_pod, container_name=self.container_name):
            return True
        container_status = get_container_status(pod=remote_pod, container_name=self.container_name)
        state = container_status.state if container_status else None
        terminated = state.terminated if state else None
        if terminated:
            termination_time = terminated.finished_at
            if termination_time:
                return termination_time + timedelta(seconds=self.post_termination_timeout) > timezone.utcnow()
        return False

    def read_pod(self):
        """
        Return the pod, re-reading it through the pod manager only when the cache is stale.

        The cached pod is refreshed when nothing has been cached yet or when more than
        ``read_pod_cache_timeout`` seconds have passed since the last refresh.
        """
        _now = timezone.utcnow()
        if (
            self.read_pod_cache is None
            or self.last_read_pod_at + timedelta(seconds=self.read_pod_cache_timeout) < _now
        ):
            self.read_pod_cache = self.pod_manager.read_pod(self.pod)
            self.last_read_pod_at = _now
        return self.read_pod_cache




@dataclass
class PodLoggingStatus:
    """Return the status of the pod and last log time when exiting from `fetch_container_logs`."""

    # Whether the container was still reported as running when `fetch_container_logs` returned.
    running: bool
    # Last log time reported by `fetch_container_logs`; ``None`` when it has none to report.
    last_log_time: DateTime | None





[docs]
class PodManager(LoggingMixin):
    """Create, monitor, and otherwise interact with Kubernetes pods for use with the KubernetesPodOperator."""

    def __init__(
        self,
        kube_client: client.CoreV1Api,
        callbacks: list[type[KubernetesPodOperatorCallback]] | None = None,
    ):
        """
        Create the launcher.

        :param kube_client: kubernetes client
        :param callbacks: Callback classes whose ``progress_callback`` is invoked for each
            complete log message read by ``fetch_container_logs``. ``None`` means no callbacks.
        """
        super().__init__()
        self._client = kube_client
        self._watch = watch.Watch()
        self._callbacks = callbacks or []
        # Flipped to ``True`` by ``await_pod_start`` so that ``watch_pod_events`` leaves its loop.
        self.stop_watching_events = False
        # Last log timestamp seen per ``(namespace, pod name, container name)``; used as the
        # resume point the next time logs are fetched for that container.
        self.container_log_times: dict[tuple[str, str, str], DateTime] = {}



[docs]
    def run_pod_async(self, pod: V1Pod, **kwargs) -> V1Pod:
        """
        Submit the pod to the Kubernetes API and return the API response.

        :param pod: Pod definition to create; ``pod.metadata.namespace`` selects the namespace.
        :param kwargs: Extra keyword arguments forwarded to ``create_namespaced_pod``.
        :return: The response of ``create_namespaced_pod``.
        :raises Exception: Any error raised by the client is logged together with the request
            body and then re-raised unchanged.
        """
        sanitized_pod = self._client.api_client.sanitize_for_serialization(pod)
        json_pod = json.dumps(sanitized_pod, indent=2)

        self.log.debug("Pod Creation Request: \n%s", json_pod)
        try:
            resp = self._client.create_namespaced_pod(
                body=sanitized_pod, namespace=pod.metadata.namespace, **kwargs
            )
        except Exception:
            # Flatten the pretty-printed JSON so the failed request stays on a single log line.
            self.log.exception(
                "Exception when attempting to create Namespaced Pod: %s", json_pod.replace("\n", " ")
            )
            raise
        self.log.debug("Pod Creation Response: %s", resp)
        return resp


    @generic_api_retry

[docs]
    def delete_pod(self, pod: V1Pod) -> None:
        """
        Remove the pod from the cluster, tolerating a pod that no longer exists.

        :param pod: Pod to delete, identified by ``metadata.name`` and ``metadata.namespace``.
        :raises ApiException: For any API failure whose status is not ``404``.
        """
        metadata = pod.metadata
        try:
            self._client.delete_namespaced_pod(
                metadata.name, metadata.namespace, body=client.V1DeleteOptions()
            )
        except ApiException as api_error:
            not_found = str(api_error.status) == "404"
            if not_found:
                # The pod is already deleted, so there is nothing left to do.
                return
            raise


    @generic_api_retry

[docs]
    def create_pod(self, pod: V1Pod) -> V1Pod:
        """
        Launch the pod asynchronously.

        Delegates to :meth:`run_pod_async` without any extra keyword arguments.

        :param pod: Pod definition to create.
        :return: Whatever :meth:`run_pod_async` returns for the pod.
        """
        return self.run_pod_async(pod)



[docs]
    async def watch_pod_events(self, pod: V1Pod, check_interval: float = 10) -> None:
        """
        Poll the pod's events and log each new one until ``stop_watching_events`` is set.

        :param pod: Pod whose events are read.
        :param check_interval: Seconds to sleep between two polls.
        """
        latest_version = None
        logged_uids: set[str] = set()
        while not self.stop_watching_events:
            event_list = self.read_pod_events(pod, latest_version)
            list_meta = event_list.metadata
            if list_meta and list_meta.resource_version:
                latest_version = list_meta.resource_version
            for pod_event in event_list.items:
                log_pod_event(self, pod_event, logged_uids)
                # The version passed to the next poll is the one of the last event handled.
                latest_version = pod_event.metadata.resource_version
            await asyncio.sleep(check_interval)



[docs]
    async def await_pod_start(
        self, pod: V1Pod, schedule_timeout: int = 120, startup_timeout: int = 120, check_interval: float = 1
    ) -> None:
        """
        Wait for the pod to reach phase other than ``Pending``.

        Delegates to the module-level ``await_pod_start`` function, passing this manager as
        ``pod_manager``; any exception raised there propagates to the caller.

        :param pod: The pod to monitor
        :param schedule_timeout: Timeout (in seconds) for pod stay in schedule state
            (if pod is taking to long in schedule state, fails task)
        :param startup_timeout: Timeout (in seconds) for startup of the pod
            (if pod is pending for too long after being scheduled, fails task)
        :param check_interval: Interval (in seconds) between checks
        :return: None
        """
        # Inside a method the bare name resolves to the module-level function, not to this method.
        await await_pod_start(
            pod_manager=self,
            pod=pod,
            schedule_timeout=schedule_timeout,
            startup_timeout=startup_timeout,
            check_interval=check_interval,
        )


    def _log_message(
        self,
        message: str,
        container_name: str,
        container_name_log_prefix_enabled: bool,
        log_formatter: Callable[[str, str], str] | None,
    ) -> None:
        """
        Emit one container log message.

        Log group markers are printed verbatim. Every other message is logged at the level
        detected from its prefix, rendered by ``log_formatter`` when one is given, otherwise
        prefixed with ``[container_name]`` when the prefix is enabled, otherwise unchanged.
        """
        if is_log_group_marker(message):
            print(message)
            return

        level = _parse_log_level(message)
        if log_formatter:
            rendered = log_formatter(container_name, message)
        elif container_name_log_prefix_enabled:
            rendered = f"[{container_name}] {message}"
        else:
            rendered = message
        self.log.log(level, rendered)


[docs]
    def fetch_container_logs(
        self,
        pod: V1Pod,
        container_name: str,
        *,
        follow=False,
        since_time: DateTime | None = None,
        post_termination_timeout: int = 120,
        container_name_log_prefix_enabled: bool = True,
        log_formatter: Callable[[str, str], str] | None = None,
    ) -> PodLoggingStatus:
        """
        Follow the logs of container and stream to airflow logging.

        Returns when container exits.

        Between when the pod starts and logs being available, there might be a delay due to CSR not approved
        and signed yet. In such situation, ApiException is thrown. This is why we are retrying on this
        specific exception.

        :param pod: Pod that owns the container.
        :param container_name: Name of the container whose logs are read.
        :param follow: Forwarded to ``read_pod_logs``. When false, the method returns after a
            single read pass even if the container is still running.
        :param since_time: Only logs newer than this time are requested (converted to a number
            of seconds relative to now).
        :param post_termination_timeout: Forwarded to ``read_pod_logs``.
        :param container_name_log_prefix_enabled: Whether messages are prefixed with
            ``[container_name]`` when no ``log_formatter`` is given.
        :param log_formatter: Optional callable receiving ``(container_name, message)`` and
            returning the text to log.
        :return: Whether the container was still running when reading stopped, together with
            the last log time.

        :meta private:
        """

        def remember_log_time(timestamp: DateTime) -> None:
            """Record the resume marker for this pod/container pair."""
            self.container_log_times[(pod.metadata.namespace, pod.metadata.name, container_name)] = timestamp

        def emit(message: str, timestamp: DateTime | None) -> None:
            """Pass one complete log message to the callbacks and then to the task log."""
            for callback in self._callbacks:
                callback.progress_callback(
                    line=message,
                    client=self._client,
                    mode=ExecutionMode.SYNC,
                    container_name=container_name,
                    timestamp=timestamp,
                    pod=pod,
                )
            self._log_message(message, container_name, container_name_log_prefix_enabled, log_formatter)

        def consume_logs(*, since_time: DateTime | None = None) -> tuple[DateTime | None, Exception | None]:
            """
            Try to follow container logs until container completes.

            For a long-running container, sometimes the log read may be interrupted
            Such errors of this kind are suppressed.

            Returns the last timestamp observed in logs, and the suppressed exception (if any).
            """
            exception = None
            last_captured_timestamp = None
            # We timeout connections after 30 minutes because otherwise they can get
            # stuck forever. The 30 is somewhat arbitrary.
            # As a consequence, a TimeoutError will be raised no more than 30 minutes
            # after starting read.
            connection_timeout = 60 * 30
            # We set a shorter read timeout because that helps reduce *connection* timeouts
            # (since the connection will be restarted periodically). And with read timeout,
            # we don't need to worry about either duplicate messages or losing messages; we
            # can safely resume from a few seconds later
            read_timeout = 60 * 5
            try:
                since_seconds = None
                if since_time:
                    try:
                        if isinstance(
                            since_time, str
                        ):  # against interface spec but accept string as safeguard
                            since_time = pendulum.parse(since_time.replace("Z", "+00:00"))
                        since_seconds = math.ceil((pendulum.now() - since_time).total_seconds())
                    except (TypeError, ValueError):
                        self.log.warning(
                            "Error calculating since_seconds with since_time %s. Using None instead.",
                            since_time,
                        )
                logs = self.read_pod_logs(
                    pod=pod,
                    container_name=container_name,
                    timestamps=True,
                    since_seconds=since_seconds,
                    follow=follow,
                    post_termination_timeout=post_termination_timeout,
                    _request_timeout=(connection_timeout, read_timeout),
                )
                message_to_log = None
                message_timestamp = None
                try:
                    for raw_line in logs:
                        line = raw_line.decode("utf-8", errors="backslashreplace")
                        line_timestamp, message = parse_log_line(line)
                        if line_timestamp:  # detect new log line
                            if not message:
                                # Empty container write: advance the resume
                                # marker but do not emit a noisy ``[base] ``
                                # row or break the previous buffered message
                                # with a stray continuation (#36571).
                                remember_log_time(line_timestamp)
                                continue
                            if message_to_log is not None:  # previous log line is complete
                                emit(message_to_log, message_timestamp)
                                last_captured_timestamp = message_timestamp
                                if last_captured_timestamp is not None:
                                    remember_log_time(last_captured_timestamp)
                            # Start buffering the new line (this is also the first-line case).
                            message_to_log = message
                            message_timestamp = line_timestamp
                        elif message_to_log is None:
                            # Continuation with nothing buffered yet: start the buffer with it
                            # instead of gluing it onto the text "None".
                            message_to_log = message
                        else:  # continuation of the previous log line
                            message_to_log = f"{message_to_log}\n{message}"
                finally:
                    # log the last line and update the last_captured_timestamp
                    if message_to_log is not None:
                        emit(message_to_log, message_timestamp)
                    last_captured_timestamp = message_timestamp
                    if last_captured_timestamp:
                        remember_log_time(last_captured_timestamp)
            except TimeoutError as e:
                # Remember the timeout even when there is no timestamp to resume from, so the
                # caller still recognises it as a timeout rather than an unexplained interruption.
                exception = e
                # in case of timeout, increment return time by 2 seconds to avoid
                # duplicate log entries
                if val := (last_captured_timestamp or since_time):
                    return val.add(seconds=2), e
            except HTTPError as e:
                exception = e
                self._http_error_timestamps = getattr(self, "_http_error_timestamps", [])
                self._http_error_timestamps = [
                    t for t in self._http_error_timestamps if t > timezone.utcnow() - timedelta(seconds=60)
                ]
                self._http_error_timestamps.append(timezone.utcnow())
                # Log only if more than 2 errors occurred in the last 60 seconds
                if len(self._http_error_timestamps) > 2:
                    self.log.exception(
                        "Reading of logs interrupted for container %r; will retry.",
                        container_name,
                    )
            return last_captured_timestamp or since_time, exception

        # note: `read_pod_logs` follows the logs, so we shouldn't necessarily *need* to
        # loop as we do here. But in a long-running process we might temporarily lose connectivity.
        # So the looping logic is there to let us resume following the logs.
        last_log_time = since_time
        while True:
            last_log_time, exc = consume_logs(since_time=last_log_time)
            if not self.container_is_running(pod, container_name=container_name):
                return PodLoggingStatus(running=False, last_log_time=last_log_time)
            if not follow:
                return PodLoggingStatus(running=True, last_log_time=last_log_time)
            # a timeout is a normal thing and we ignore it and resume following logs
            if not isinstance(exc, TimeoutError):
                self.log.warning(
                    "Pod %s log read interrupted but container %s still running. Logs generated in the last one second might get duplicated.",
                    pod.metadata.name,
                    container_name,
                )
            time.sleep(1)


    def _reconcile_requested_log_containers(
        self, requested: Iterable[str] | str | bool | None, actual: list[str], pod_name
    ) -> list[str]:
        """
        Resolve a log request against the containers that exist in the pod.

        :param requested: A single container name, ``True`` for all containers, or an
            iterable of container names.
        :param actual: Names of the containers present in the pod.
        :param pod_name: Pod name, used only in error messages.
        :return: The requested names that exist in ``actual`` (in request order; all of
            ``actual`` for ``True``). Anything that cannot be honoured is logged as an error
            and left out.
        """
        if not actual:
            self.log.error("Could not retrieve containers for the pod: %s", pod_name)
            return []

        if isinstance(requested, str):
            # fetch logs only for requested container if only one container is provided
            if requested in actual:
                return [requested]
            self.log.error(
                "container %s whose logs were requested not found in the pod %s",
                requested,
                pod_name,
            )
            return []

        if isinstance(requested, bool):
            # if True is provided, get logs for all the containers
            if requested is True:
                return list(actual)
            self.log.error("False is not a valid value for container_logs")
            return []

        if not isinstance(requested, Iterable):
            self.log.error("Invalid type %s specified for container names input parameter", type(requested))
            return []

        # if a sequence of containers are provided, iterate for every container in the pod
        matched: list[str] = []
        for name in requested:
            if name in actual:
                matched.append(name)
            else:
                self.log.error(
                    "Container %s whose logs were requests not found in the pod %s",
                    name,
                    pod_name,
                )
        return matched


[docs]
    def fetch_requested_init_container_logs(
        self,
        pod: V1Pod,
        init_containers: Iterable[str] | str | Literal[True] | None,
        follow_logs=False,
        container_name_log_prefix_enabled: bool = True,
        log_formatter: Callable[[str, str], str] | None = None,
    ) -> list[PodLoggingStatus]:
        """
        Stream the logs of the requested init containers of the pod to airflow logging.

        Containers are handled one after another, in the order reported by
        ``get_init_container_names``; each one is awaited with
        ``_await_init_container_start`` before its logs are fetched.

        :return: One logging status per init container that was processed.

        :meta private:
        """
        init_names = self.get_init_container_names(pod)
        selected = self._reconcile_requested_log_containers(
            requested=init_containers,
            actual=init_names,
            pod_name=pod.metadata.name,
        )

        def position_in_spec(name: str) -> int:
            # Init containers run sequentially, so logs are read in that same order.
            return init_names.index(name)

        statuses: list[PodLoggingStatus] = []
        for name in sorted(selected, key=position_in_spec):
            self._await_init_container_start(pod=pod, container_name=name)
            resume_from = self.container_log_times.get((pod.metadata.namespace, pod.metadata.name, name))
            statuses.append(
                self.fetch_container_logs(
                    pod=pod,
                    container_name=name,
                    follow=follow_logs,
                    since_time=resume_from,
                    container_name_log_prefix_enabled=container_name_log_prefix_enabled,
                    log_formatter=log_formatter,
                )
            )
        return statuses



[docs]
    def fetch_requested_container_logs(
        self,
        pod: V1Pod,
        containers: Iterable[str] | str | Literal[True],
        follow_logs=False,
        container_name_log_prefix_enabled: bool = True,
        log_formatter: Callable[[str, str], str] | None = None,
    ) -> list[PodLoggingStatus]:
        """
        Fetch the logs of the requested containers of a pod, one container after another.

        The request is reconciled against the names returned by ``get_container_names`` and
        each selected container is handed to ``fetch_container_logs``, resuming from the last
        log time recorded for that container (if any).

        :param pod: pod whose containers are read
        :param containers: requested containers, passed through to the reconciliation step
        :param follow_logs: forwarded as ``follow`` to ``fetch_container_logs``
        :param container_name_log_prefix_enabled: forwarded to ``fetch_container_logs``
        :param log_formatter: forwarded to ``fetch_container_logs``
        :return: one logging status per processed container, in processing order

        :meta private:
        """
        metadata = pod.metadata
        selected = self._reconcile_requested_log_containers(
            requested=containers,
            actual=self.get_container_names(pod),
            pod_name=metadata.name,
        )
        return [
            self.fetch_container_logs(
                pod=pod,
                container_name=name,
                follow=follow_logs,
                since_time=self.container_log_times.get((metadata.namespace, metadata.name, name)),
                container_name_log_prefix_enabled=container_name_log_prefix_enabled,
                log_formatter=log_formatter,
            )
            for name in selected
        ]



[docs]
    def await_container_completion(self, pod: V1Pod, container_name: str, polling_time: float = 1) -> None:
        """
        Block until the named container of the pod is reported as completed.

        The pod is re-read on every iteration; between two checks an info message is logged
        and the call sleeps for ``polling_time`` seconds.

        :param pod: pod spec that will be monitored
        :param container_name: name of the container within the pod to monitor
        :param polling_time: seconds to sleep between two container status checks.
            Defaults to 1s.
        """
        while not container_is_completed(self.read_pod(pod), container_name):
            self.log.info("Waiting for container '%s' state to be completed", container_name)
            time.sleep(polling_time)



[docs]
    def await_pod_completion(
        self,
        pod: V1Pod,
        istio_enabled: bool = False,
        container_name: str = "base",
        do_xcom_push: bool = False,
    ) -> V1Pod:
        """
        Poll a pod every two seconds until it no longer needs to be waited for.

        Polling stops, and the most recently read pod is returned, as soon as one of these holds:

        * the pod phase is one of ``PodPhase.terminal_states``;
        * ``istio_enabled`` or ``do_xcom_push`` is set and the named container has completed;
        * ``detect_pod_terminate_early_issues`` reports an issue for the pod.

        :param pod: pod spec that will be monitored
        :param istio_enabled: whether istio is enabled in the namespace
        :param container_name: name of the container within the pod
        :param do_xcom_push: whether to push XComs
        :return: the pod as last read from the API
        """
        while True:
            remote_pod = self.read_pod(pod)
            if remote_pod.status.phase in PodPhase.terminal_states:
                return remote_pod
            if istio_enabled or do_xcom_push:
                # With a sidecar present the pod may stay active after the main container is done.
                if container_is_completed(remote_pod, container_name):
                    self.log.info(
                        "Container '%s' completed but pod %s still has phase %s "
                        "(likely due to a sidecar container). Skipping waiting for pod completion.",
                        container_name,
                        pod.metadata.name,
                        remote_pod.status.phase,
                    )
                    return remote_pod
            # Stop waiting when one of the known early-termination issues is detected.
            if detect_pod_terminate_early_issues(remote_pod):
                return remote_pod
            self.log.info("Pod %s has phase %s", pod.metadata.name, remote_pod.status.phase)
            time.sleep(2)



[docs]
    def container_is_running(self, pod: V1Pod, container_name: str) -> bool:
        """
        Re-read the pod and report whether the named container is running.

        :param pod: pod to read
        :param container_name: name of the container to check
        :return: result of the module-level ``container_is_running`` helper for the freshly read pod
        """
        return container_is_running(pod=self.read_pod(pod), container_name=container_name)



[docs]
    def container_is_terminated(self, pod: V1Pod, container_name: str) -> bool:
        """
        Re-read the pod and report whether the named container is terminated.

        :param pod: pod to read
        :param container_name: name of the container to check
        :return: result of the module-level ``container_is_terminated`` helper for the freshly read pod
        """
        return container_is_terminated(pod=self.read_pod(pod), container_name=container_name)


    @generic_api_retry

[docs]
    def read_pod_logs(
        self,
        pod: V1Pod,
        container_name: str,
        tail_lines: int | None = None,
        timestamps: bool = False,
        since_seconds: int | None = None,
        follow=True,
        post_termination_timeout: int = 120,
        **kwargs,
    ) -> PodLogsConsumer:
        """
        Open the log stream of one container and wrap it in a ``PodLogsConsumer``.

        :param pod: pod to read logs from
        :param container_name: container whose logs are requested
        :param tail_lines: forwarded to the API only when truthy
        :param timestamps: forwarded to the API as-is
        :param since_seconds: forwarded to the API only when truthy
        :param follow: forwarded to the API as-is
        :param post_termination_timeout: passed to the returned ``PodLogsConsumer``
        :param kwargs: extra arguments for the API call; they override ``since_seconds`` and
            ``tail_lines`` when the same key is given
        :return: consumer wrapping the (not preloaded) API response
        :raises HTTPError: re-raised after being logged when the API call fails with it
        """
        optional_args = {
            key: value
            for key, value in (("since_seconds", since_seconds), ("tail_lines", tail_lines))
            if value
        }
        request_args = {**optional_args, **kwargs}

        metadata = pod.metadata
        try:
            response = self._client.read_namespaced_pod_log(
                name=metadata.name,
                namespace=metadata.namespace,
                container=container_name,
                follow=follow,
                timestamps=timestamps,
                _preload_content=False,
                **request_args,
            )
        except HTTPError:
            self.log.exception("There was an error reading the kubernetes API.")
            raise

        return PodLogsConsumer(
            response=response,
            pod=pod,
            pod_manager=self,
            container_name=container_name,
            post_termination_timeout=post_termination_timeout,
        )



[docs]
    def get_init_container_names(self, pod: V1Pod) -> list[str]:
        """
        Return the names of the init containers declared on the pod, in declaration order.

        No container is filtered out. A pod without init containers
        (``pod.spec.init_containers`` is ``None`` or empty) yields an empty list.

        :param pod: pod whose ``spec.init_containers`` is inspected
        :return: init container names in the order they appear in the spec

        :meta private:
        """
        # ``init_containers`` is optional on the pod spec; iterating ``None`` would raise TypeError.
        return [container_spec.name for container_spec in pod.spec.init_containers or []]



[docs]
    def get_container_names(self, pod: V1Pod) -> list[str]:
        """
        Return the container names of the freshly read pod, leaving out the xcom sidecar.

        :param pod: pod to read
        :return: names of ``spec.containers`` in spec order, without
            ``PodDefaults.SIDECAR_CONTAINER_NAME``

        :meta private:
        """
        names = []
        for container_spec in self.read_pod(pod).spec.containers:
            if container_spec.name == PodDefaults.SIDECAR_CONTAINER_NAME:
                continue
            names.append(container_spec.name)
        return names


    @generic_api_retry

[docs]
    def read_pod_events(self, pod: V1Pod, resource_version: str | None = None) -> CoreV1EventList:
        """
        Read the events that involve the given pod.

        The events are listed in the pod's namespace with a field selector on the pod name.
        When ``resource_version`` is given it is sent together with
        ``resource_version_match="NotOlderThan"``. If the client rejects the call with a
        ``TypeError``, the request is repeated without ``resource_version_match``.

        :param pod: The pod to get events for
        :param resource_version: Only return events not older than this resource version
        :return: the event list returned by the API
        :raises KubernetesApiException: when either the first or the fallback request fails
            with an ``HTTPError``
        """
        request_kwargs = {
            "namespace": pod.metadata.namespace,
            "field_selector": f"involvedObject.name={pod.metadata.name}",
            "resource_version": resource_version,
        }
        # The outer ``try`` also covers the fallback request: an exception raised inside an
        # ``except`` clause is not seen by the sibling clauses of the same ``try``.
        try:
            try:
                return self._client.list_namespaced_event(
                    **request_kwargs,
                    resource_version_match="NotOlderThan" if resource_version else None,
                )
            except TypeError:
                # NOTE(review): presumably raised by client versions that do not accept
                # ``resource_version_match`` -- confirm against the supported client range.
                return self._client.list_namespaced_event(**request_kwargs)
        except HTTPError as e:
            raise KubernetesApiException(f"There was an error reading the kubernetes API: {e}") from e


    @generic_api_retry

[docs]
    def read_pod(self, pod: V1Pod) -> V1Pod:
        """
        Read the current state of the pod from the Kubernetes API.

        :param pod: pod whose name and namespace identify the object to read
        :return: the pod as returned by the API
        :raises KubernetesApiException: when the request fails with an ``HTTPError``; the
            original error is attached as the cause
        """
        try:
            return self._client.read_namespaced_pod(pod.metadata.name, pod.metadata.namespace)
        except HTTPError as e:
            # Chain explicitly so the traceback shows the HTTP failure as the direct cause.
            raise KubernetesApiException(f"There was an error reading the kubernetes API: {e}") from e



[docs]
    def await_xcom_sidecar_container_start(
        self, pod: V1Pod, timeout: int = 900, log_interval: int = 30
    ) -> None:
        """
        Wait until the xcom sidecar container is running, polling once per second.

        :param pod: pod that carries the sidecar container
        :param timeout: seconds after which waiting is given up
        :param log_interval: minimum number of seconds between two "still waiting" warnings
        :raises AirflowException: when the sidecar container is found terminated, or when it
            is still not running after more than ``timeout`` seconds
        """
        self.log.info("Checking if xcom sidecar container is started.")
        started_at = time.time()
        last_notice_at = started_at
        sidecar = PodDefaults.SIDECAR_CONTAINER_NAME

        while True:
            # Measured before the status checks of this round.
            waited = time.time() - started_at
            if self.container_is_running(pod, sidecar):
                self.log.info("The xcom sidecar container has started.")
                return
            if self.container_is_terminated(pod, sidecar):
                raise AirflowException(
                    "Xcom sidecar container is already terminated! Not possible to read xcom output of task."
                )
            if log_interval <= (time.time() - last_notice_at):
                self.log.warning(
                    "Still waiting for the xcom sidecar container to start. Elapsed time: %d seconds.",
                    int(waited),
                )
                last_notice_at = time.time()
            if waited > timeout:
                raise AirflowException(
                    f"Xcom sidecar container did not start within {timeout // 60} minutes."
                )
            time.sleep(1)



[docs]
    def extract_xcom(self, pod: V1Pod) -> str:
        """
        Read the XCom value from the sidecar container and then kill the sidecar.

        The kill step runs whether or not reading the value succeeded.

        :param pod: pod that carries the xcom sidecar container
        :return: the raw XCom string returned by ``extract_xcom_json``
        :raises XComRetrievalError: when the sidecar container is not running
        """
        # Nothing can be read once the sidecar container has stopped.
        sidecar_running = self.container_is_running(pod, PodDefaults.SIDECAR_CONTAINER_NAME)
        if not sidecar_running:
            raise XComRetrievalError(
                f"{PodDefaults.SIDECAR_CONTAINER_NAME} container is not running! Not possible to read xcom from pod: {pod.metadata.name}"
            )

        try:
            return self.extract_xcom_json(pod)
        finally:
            self.extract_xcom_kill(pod)


    @generic_api_retry

[docs]
    def extract_xcom_json(self, pod: V1Pod) -> str:
        """
        Read the XCom result file from the sidecar container and check that it is valid JSON.

        A shell command is executed in the sidecar container that prints ``return.json`` from
        the XCom mount path when the file is non-empty and ``EMPTY_XCOM_RESULT`` otherwise.
        Anything written to stderr is logged as an error. Unless the output is empty or equals
        ``EMPTY_XCOM_RESULT``, it is parsed with ``json.loads`` purely as a validity check;
        parse errors propagate to the caller.

        :param pod: pod that carries the xcom sidecar container
        :return: the raw output string (the caller is responsible for decoding it)
        :raises XComRetrievalError: when no output could be read at all
        """
        xcom_file = f"{PodDefaults.XCOM_MOUNT_PATH}/return.json"
        command = f"if [ -s {xcom_file} ]; then cat {xcom_file}; else echo {EMPTY_XCOM_RESULT}; fi"
        result = None
        exec_stream = kubernetes_stream(
            self._client.connect_get_namespaced_pod_exec,
            pod.metadata.name,
            pod.metadata.namespace,
            container=PodDefaults.SIDECAR_CONTAINER_NAME,
            command=["/bin/sh", "-c", command],
            stdin=False,
            stdout=True,
            stderr=True,
            tty=False,
            _preload_content=False,
        )
        with closing(exec_stream) as ws_client:
            self.log.info("Running command... %s", command)
            ws_client.run_forever()
            if ws_client.peek_stderr():
                self.log.error("stderr from command: %s", ws_client.read_stderr())
            result = ws_client.read_all()
            has_payload = result and result.rstrip() != EMPTY_XCOM_RESULT
            if has_payload:
                # Parsed only to validate; the string itself is returned and decoded by the caller.
                json.loads(result)

        if result is None:
            raise XComRetrievalError(f"Failed to extract xcom from pod: {pod.metadata.name}")
        return result


    @generic_api_retry

[docs]
    def extract_xcom_kill(self, pod: V1Pod):
        """
        Stop the xcom sidecar container by sending SIGINT to its ``sh`` processes.

        A shell is opened in the sidecar container and a ``pgrep``-based kill command is
        written to it. When that command fails with a ``PodCommandException``, a fallback that
        walks ``/proc`` is tried instead.

        :param pod: pod that carries the xcom sidecar container
        """
        primary_kill = "kill -2 $(pgrep -u $(id -u) -f 'sh')"
        # Fallback for containers that don't support pgrep -u.
        fallback_kill = (
            "for f in /proc/[0-9]*/comm; do "
            '[ -O $f ] && read c < $f && [ "$c" = "sh" ] && pid=${f%/comm} && kill -2 ${pid##*/}; '
            "done"
        )
        shell_stream = kubernetes_stream(
            self._client.connect_get_namespaced_pod_exec,
            pod.metadata.name,
            pod.metadata.namespace,
            container=PodDefaults.SIDECAR_CONTAINER_NAME,
            command=["/bin/sh"],
            stdin=True,
            stdout=True,
            stderr=True,
            tty=False,
            _preload_content=False,
        )
        with closing(shell_stream) as resp:
            try:
                self._exec_pod_command(resp, primary_kill)
            except PodCommandException:
                self.log.info("Primary kill command failed, trying fallback command")
                self._exec_pod_command(resp, fallback_kill)


    def _exec_pod_command(self, resp, command: str) -> str | None:
        """
        Write a command to an open exec stream and return the stdout it produces.

        The stream is polled with a one second timeout for as long as it stays open. The
        stdout collected so far is returned after the first polling round that leaves it
        non-empty.

        :param resp: exec stream to write the command to
        :param command: shell command; a trailing newline is appended before writing
        :return: collected stdout, or ``None`` when the stream is not open or closes before
            any stdout was read
        :raises PodCommandException: when a polling round yields any stderr output
        """
        if not resp.is_open():
            return None
        self.log.info("Running command... %s", command)
        resp.write_stdin(f"{command}\n")
        collected_stdout = ""
        while resp.is_open():
            resp.update(timeout=1)
            while resp.peek_stdout():
                collected_stdout += resp.read_stdout()
            # stderr is gathered per polling round, not accumulated across rounds.
            stderr_chunks = []
            while resp.peek_stderr():
                stderr_chunks.append(resp.read_stderr())
            stderr_text = "".join(stderr_chunks)
            if stderr_text:
                self.log.warning("stderr from command: %s", stderr_text)
                raise PodCommandException(f"Command failed with stderr: {stderr_text}")
            if collected_stdout:
                return collected_stdout
        return None

    def _await_init_container_start(self, pod: V1Pod, container_name: str):
        """
        Poll the pod once per second until the named init container has left the waiting state.

        The wait ends when the pod has a status, its phase is not ``PodPhase.PENDING``, a
        status entry exists for the container and that container is not waiting.

        :param pod: pod to poll
        :param container_name: name of the init container to wait for
        """
        while True:
            remote_pod = self.read_pod(pod)
            pod_status = remote_pod.status
            if pod_status is not None and pod_status.phase != PodPhase.PENDING:
                status_known = get_container_status(remote_pod, container_name) is not None
                if status_known and not container_is_wait(remote_pod, container_name):
                    return

            time.sleep(1)




[docs]
class OnFinishAction(str, enum.Enum):
    """
    Action to take when the pod finishes.

    The enum also derives from ``str``, so each member compares equal to its string value.
    """


[docs]
    # NOTE(review): presumably the pod is left in place -- confirm against the code consuming this enum.
    KEEP_POD = "keep_pod"


[docs]
    # NOTE(review): presumably the pod is always deleted -- confirm against the code consuming this enum.
    DELETE_POD = "delete_pod"


[docs]
    # NOTE(review): the exact condition behind "active" is not visible here -- confirm before relying on it.
    DELETE_ACTIVE_POD = "delete_active_pod"


[docs]
    # NOTE(review): presumably the pod is deleted only after it succeeded -- confirm.
    DELETE_SUCCEEDED_POD = "delete_succeeded_pod"





[docs]
class OnKillAction(str, enum.Enum):
    """
    Action to take when the task is killed by the user.

    The enum also derives from ``str``, so each member compares equal to its string value.
    """


[docs]
    # NOTE(review): presumably the pod is deleted on kill -- confirm against the code consuming this enum.
    DELETE_POD = "delete_pod"


[docs]
    # NOTE(review): presumably the pod is left in place on kill -- confirm against the code consuming this enum.
    KEEP_POD = "keep_pod"





[docs]
def is_log_group_marker(line: str) -> bool:
    """
    Tell whether a line opens or closes a log group.

    :param line: log line to inspect
    :return: True when the line starts with ``::group::`` or ``::endgroup::``
    """
    group_prefixes = ("::group::", "::endgroup::")
    return line.startswith(group_prefixes)




[docs]
def parse_log_line(line: str) -> tuple[DateTime | None, str]:
    """
    Split a K8s log line into its leading timestamp and the message that follows.

    :param line: k8s log line
    :return: ``(timestamp, message)`` when the text before the first space parses as a
        timestamp; ``(None, line)`` with the unmodified input line otherwise
    """
    # Only the trailing newline is removed. An empty container write arrives as
    # "<rfc3339-ts> \n" under ``timestamps=True``; keeping the separator space lets it be
    # recognised as a real log line instead of a continuation of the previous one (#36571).
    # For "<ts>\n" without the space, ``partition`` puts the whole text into the first
    # element and leaves the message empty, which the parse below handles as well.
    candidate, _sep, message = line.rstrip("\n").partition(" ")
    try:
        parsed = pendulum.parse(candidate)
    except (ParserError, ValueError):
        return None, line
    return cast("DateTime", parsed), message




[docs]
class AsyncPodManager(LoggingMixin):
    """Create, monitor, and otherwise interact with Kubernetes pods for use with the KubernetesPodTriggerer."""

    def __init__(
        self,
        async_hook: AsyncKubernetesHook,
        callbacks: list[type[KubernetesPodOperatorCallback]] | None = None,
    ):
        """
        Create the launcher.

        :param async_hook: asynchronous Kubernetes hook that this manager stores and uses for
            its pod, event and log requests
        :param callbacks: optional list of callback classes; an empty list is stored when
            nothing is given
        """
        super().__init__()
        self._hook = async_hook
        self._watch = watch.Watch()
        self._callbacks = callbacks or []

[docs]
        # Checked by ``watch_pod_events`` before each watch round; the loop exits once it is truthy.
        self.stop_watching_events = False



[docs]
    async def read_pod(self, pod: V1Pod) -> V1Pod:
        """
        Read the current state of the pod through the async hook.

        :param pod: pod whose name and namespace identify the object to read
        :return: the result of the hook's ``get_pod`` call
        """
        metadata = pod.metadata
        return await self._hook.get_pod(metadata.name, metadata.namespace)



[docs]
    async def read_pod_events(self, pod: V1Pod, resource_version: str | None = None) -> CoreV1EventList:
        """
        Read the pod's events through the async hook.

        :param pod: pod whose name and namespace identify the events to read
        :param resource_version: forwarded unchanged to the hook
        :return: the result of the hook's ``get_pod_events`` call
        """
        metadata = pod.metadata
        events = await self._hook.get_pod_events(
            metadata.name, metadata.namespace, resource_version=resource_version
        )
        return events



[docs]
    async def watch_pod_events(self, pod: V1Pod, startup_check_interval: float = 30) -> None:
        """
        Watch the pod's events and write them to the log until ``stop_watching_events`` is set.

        Each watch round is opened through the hook with ``startup_check_interval`` as its
        timeout and resumes from the resource version of the last event that was logged.

        :param pod: pod whose events are watched
        :param startup_check_interval: timeout in seconds handed to every watch round
        """
        logged_events: set[str] = set()
        last_resource_version = None
        while not self.stop_watching_events:
            event_stream = self._hook.watch_pod_events(
                name=pod.metadata.name,
                namespace=pod.metadata.namespace,
                resource_version=last_resource_version,
                timeout_seconds=startup_check_interval,
            )
            async for event in event_stream:
                if not event:
                    continue
                log_pod_event(self, event, logged_events)
                last_resource_version = event.metadata.resource_version



[docs]
    async def await_pod_start(
        self, pod: V1Pod, schedule_timeout: int = 120, startup_timeout: int = 120, check_interval: float = 1
    ) -> None:
        """
        Wait for the pod to reach phase other than ``Pending``.

        This delegates to the module-level ``await_pod_start`` coroutine with this manager
        passed as ``pod_manager``.

        :param pod: pod to wait for
        :param schedule_timeout: Timeout (in seconds) for the pod to stay in schedule state
            (if the pod is taking too long in schedule state, fails task)
        :param startup_timeout: Timeout (in seconds) for startup of the pod
            (if the pod is pending for too long after being scheduled, fails task)
        :param check_interval: Interval (in seconds) between checks
        """
        wait_options = {
            "schedule_timeout": schedule_timeout,
            "startup_timeout": startup_timeout,
            "check_interval": check_interval,
        }
        await await_pod_start(pod_manager=self, pod=pod, **wait_options)



[docs]
    async def fetch_container_logs_before_current_sec(
        self, pod: V1Pod, container_name: str, since_time: DateTime | None = None
    ) -> DateTime | None:
        """
        Read a container's logs and forward them, holding back lines from the current second.

        The current time is captured first and the logs are requested through the hook. When
        ``since_time`` is given, the request covers the seconds elapsed since then (rounded
        up). Lines are parsed with ``parse_log_line``:

        * processing stops at the first line whose timestamp falls into the same second as
          the captured time, because the API only accepts ``since_seconds`` and such lines
          will be returned again by the next read;
        * timestamped lines with an empty message are dropped;
        * lines without a timestamp are appended to the message currently being buffered.

        Completed messages are printed when they are log group markers and otherwise logged
        with the level derived from the message. The buffered message is emitted even when
        iteration is interrupted by an exception, which then propagates to the caller.

        :param pod: The pod specification to monitor.
        :param container_name: The name of the container within the pod.
        :param since_time: The timestamp from which to start reading logs.
        :return: The time captured at the start of this call, to be used as ``since_time``
            for the next read.
        """

        def emit(text: str) -> None:
            # Group markers are printed verbatim; everything else goes through the logger.
            if is_log_group_marker(text):
                print(text)
                return
            level = _parse_log_level(text)
            self.log.log(level, "[%s] %s", container_name, text)

        now = pendulum.now()
        logs = await self._hook.read_logs(
            name=pod.metadata.name,
            namespace=pod.metadata.namespace,
            container_name=container_name,
            since_seconds=(math.ceil((now - since_time).total_seconds()) if since_time else None),
        )
        pending = None
        try:
            current_second = now.replace(microsecond=0)
            for raw_line in logs:
                stamp, text = parse_log_line(raw_line)
                if not stamp:
                    # No timestamp: continuation of the message being buffered, if there is one.
                    if pending:
                        pending = f"{pending}\n{text}"
                    continue
                # Skip log lines from the current second to prevent duplicate entries on the next read.
                # The API only allows specifying 'since_seconds', not an exact timestamp.
                if stamp.replace(microsecond=0) == current_second:
                    break
                if not text:
                    # Empty container write -- drop it instead of letting it replace the
                    # buffered message or show up as a noisy ``[base] `` row (#36571).
                    continue
                # A new timestamped line means the buffered message is complete.
                if pending is not None:
                    emit(pending)
                pending = text
        finally:
            # Flush whatever is still buffered, also when iteration was interrupted.
            if pending is not None:
                emit(pending)
        # The captured time becomes the starting point of the next fetch, so lines from the
        # second that was skipped above are read then.
        return now