Source code for airflow.providers.apache.livy.operators.livy

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

import time
from collections.abc import Sequence
from functools import cached_property
from typing import TYPE_CHECKING, Any, cast

from airflow.configuration import conf
from airflow.exceptions import AirflowException
from airflow.providers.apache.livy.hooks.livy import BatchState, LivyHook
from airflow.providers.apache.livy.triggers.livy import LivyTrigger
from airflow.providers.common.compat.openlineage.utils.spark import (
    inject_parent_job_information_into_spark_properties,
    inject_transport_information_into_spark_properties,
)
from airflow.providers.common.compat.sdk import BaseOperator

if TYPE_CHECKING:
    from airflow.providers.common.compat.sdk import Context



[docs]
class LivyOperator(BaseOperator):
    """
    Wraps the Apache Livy batch REST API, allowing to submit a Spark application to the underlying cluster.

    :param file: path of the file containing the application to execute (required). (templated)
    :param class_name: name of the application Java/Spark main class. (templated)
    :param args: application command line arguments. (templated)
    :param jars: jars to be used in this sessions. (templated)
    :param py_files: python files to be used in this session. (templated)
    :param files: files to be used in this session. (templated)
    :param driver_memory: amount of memory to use for the driver process. (templated)
    :param driver_cores: number of cores to use for the driver process. (templated)
    :param executor_memory: amount of memory to use per executor process. (templated)
    :param executor_cores: number of cores to use for each executor. (templated)
    :param num_executors: number of executors to launch for this session. (templated)
    :param archives: archives to be used in this session. (templated)
    :param queue: name of the YARN queue to which the application is submitted. (templated)
    :param name: name of this session. (templated)
    :param conf: Spark configuration properties. (templated)
    :param proxy_user: user to impersonate when running the job. (templated)
    :param livy_conn_id: reference to a pre-defined Livy Connection.
    :param livy_conn_auth_type: The auth type for the Livy Connection.
    :param polling_interval: time in seconds between polling for job completion. Don't poll for values <= 0
    :param extra_options: A dictionary of options, where key is string and value
        depends on the option that's being modified.
    :param extra_headers: A dictionary of headers passed to the HTTP request to livy.
    :param retry_args: Arguments which define the retry behaviour.
        See Tenacity documentation at https://github.com/jd/tenacity
    :param deferrable: Run operator in the deferrable mode
    """


[docs]
    template_fields: Sequence[str] = ("spark_params",)


[docs]
    template_fields_renderers = {"spark_params": "json"}


    def __init__(
        self,
        *,
        file: str,
        class_name: str | None = None,
        args: Sequence[str | int | float] | None = None,
        conf: dict[Any, Any] | None = None,
        jars: Sequence[str] | None = None,
        py_files: Sequence[str] | None = None,
        files: Sequence[str] | None = None,
        driver_memory: str | None = None,
        driver_cores: int | str | None = None,
        executor_memory: str | None = None,
        executor_cores: int | str | None = None,
        num_executors: int | str | None = None,
        archives: Sequence[str] | None = None,
        queue: str | None = None,
        name: str | None = None,
        proxy_user: str | None = None,
        livy_conn_id: str = "livy_default",
        livy_conn_auth_type: Any | None = None,
        livy_endpoint_prefix: str | None = None,
        polling_interval: int = 0,
        extra_options: dict[str, Any] | None = None,
        extra_headers: dict[str, Any] | None = None,
        retry_args: dict[str, Any] | None = None,
        deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
        openlineage_inject_parent_job_info: bool = conf.getboolean(
            "openlineage", "spark_inject_parent_job_info", fallback=False
        ),
        openlineage_inject_transport_info: bool = conf.getboolean(
            "openlineage", "spark_inject_transport_info", fallback=False
        ),
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)

        if conf is None:
            conf = {}

        spark_params = {
            # Prepare spark parameters, it will be templated later.
            "file": file,
            "class_name": class_name,
            "args": args,
            "jars": jars,
            "py_files": py_files,
            "files": files,
            "driver_memory": driver_memory,
            "driver_cores": driver_cores,
            "executor_memory": executor_memory,
            "executor_cores": executor_cores,
            "num_executors": num_executors,
            "archives": archives,
            "queue": queue,
            "name": name,
            "conf": conf,
            "proxy_user": proxy_user,
        }

[docs]
        self.spark_params = spark_params

        self._livy_conn_id = livy_conn_id
        self._livy_conn_auth_type = livy_conn_auth_type
        self._livy_endpoint_prefix = livy_endpoint_prefix
        self._polling_interval = polling_interval
        self._extra_options = extra_options or {}
        self._extra_headers = extra_headers or {}

        self._batch_id: int | str | None = None

[docs]
        self.retry_args = retry_args


[docs]
        self.deferrable = deferrable


[docs]
        self.openlineage_inject_parent_job_info = openlineage_inject_parent_job_info


[docs]
        self.openlineage_inject_transport_info = openlineage_inject_transport_info


    @cached_property

[docs]
    def hook(self) -> LivyHook:
        """
        Get valid hook.

        :return: LivyHook
        """
        return LivyHook(
            livy_conn_id=self._livy_conn_id,
            extra_headers=self._extra_headers,
            extra_options=self._extra_options,
            auth_type=self._livy_conn_auth_type,
            endpoint_prefix=self._livy_endpoint_prefix,
        )



[docs]
    def execute(self, context: Context) -> Any:
        if self.openlineage_inject_parent_job_info:
            self.log.debug("Injecting parent job information into Spark properties")
            self.spark_params["conf"] = inject_parent_job_information_into_spark_properties(
                cast("dict", self.spark_params["conf"]), context
            )
        if self.openlineage_inject_transport_info:
            self.log.debug("Injecting transport information into Spark properties")
            self.spark_params["conf"] = inject_transport_information_into_spark_properties(
                cast("dict", self.spark_params["conf"]), context
            )

        _batch_id: int | str = self.hook.post_batch(**self.spark_params)
        self._batch_id = _batch_id
        self.log.info("Generated batch-id is %s", self._batch_id)

        # Wait for the job to complete
        if not self.deferrable:
            if self._polling_interval > 0:
                self.poll_for_termination(self._batch_id)
            context["ti"].xcom_push(key="app_id", value=self.hook.get_batch(self._batch_id)["appId"])
            return self._batch_id

        state = self.hook.get_batch_state(self._batch_id, retry_args=self.retry_args)
        self.log.debug("Batch with id %s is in state: %s", self._batch_id, state.value)
        if state not in self.hook.TERMINAL_STATES:
            self.defer(
                timeout=self.execution_timeout,
                trigger=LivyTrigger(
                    batch_id=self._batch_id,
                    spark_params=self.spark_params,
                    livy_conn_id=self._livy_conn_id,
                    polling_interval=self._polling_interval,
                    extra_options=self._extra_options,
                    extra_headers=self._extra_headers,
                    execution_timeout=self.execution_timeout,
                ),
                method_name="execute_complete",
            )
        else:
            self.log.info("Batch with id %s terminated with state: %s", self._batch_id, state.value)
            self.hook.dump_batch_logs(self._batch_id)
            if state != BatchState.SUCCESS:
                raise AirflowException(f"Batch {self._batch_id} did not succeed")

            context["ti"].xcom_push(key="app_id", value=self.hook.get_batch(self._batch_id)["appId"])
            return self._batch_id



[docs]
    def poll_for_termination(self, batch_id: int | str) -> None:
        """
        Pool Livy for batch termination.

        :param batch_id: id of the batch session to monitor.
        """
        state = self.hook.get_batch_state(batch_id, retry_args=self.retry_args)
        while state not in self.hook.TERMINAL_STATES:
            self.log.debug("Batch with id %s is in state: %s", batch_id, state.value)
            time.sleep(self._polling_interval)
            state = self.hook.get_batch_state(batch_id, retry_args=self.retry_args)
        self.log.info("Batch with id %s terminated with state: %s", batch_id, state.value)
        self.hook.dump_batch_logs(batch_id)
        if state != BatchState.SUCCESS:
            raise AirflowException(f"Batch {batch_id} did not succeed")



[docs]
    def on_kill(self) -> None:
        self.kill()



[docs]
    def kill(self) -> None:
        """Delete the current batch session."""
        if self._batch_id is not None:
            self.hook.delete_batch(self._batch_id)



[docs]
    def execute_complete(self, context: Context, event: dict[str, Any]) -> Any:
        """
        Execute when the trigger fires - returns immediately.

        Relies on trigger to throw an exception, otherwise it assumes execution was successful.
        """
        # dump the logs from livy to worker through triggerer.
        if event.get("log_lines", None) is not None:
            for log_line in event["log_lines"]:
                self.log.info(log_line)

        if event["status"] == "timeout":
            self.hook.delete_batch(event["batch_id"])

        if event["status"] in ["error", "timeout"]:
            raise AirflowException(event["response"])

        self.log.info(
            "%s completed with response %s",
            self.task_id,
            event["response"],
        )
        context["ti"].xcom_push(key="app_id", value=self.hook.get_batch(event["batch_id"])["appId"])
        return event["batch_id"]