Source code for airflow.providers.google.cloud.operators.dataprep

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""This module contains a Google Dataprep operator."""

from __future__ import annotations

from collections.abc import Sequence
from typing import TYPE_CHECKING

from airflow.providers.google.cloud.hooks.dataprep import GoogleDataprepHook
from airflow.providers.google.cloud.links.dataprep import DataprepFlowLink, DataprepJobGroupLink
from airflow.providers.google.cloud.operators.cloud_base import GoogleCloudBaseOperator
from airflow.providers.google.common.hooks.base_google import PROVIDE_PROJECT_ID

if TYPE_CHECKING:
    from airflow.utils.context import Context


class DataprepGetJobsForJobGroupOperator(GoogleCloudBaseOperator):
    """
    Get information about the batch jobs within a Cloud Dataprep job.

    API documentation: https://clouddataprep.com/documentation/api#section/Overview.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:DataprepGetJobsForJobGroupOperator`

    :param dataprep_conn_id: The Dataprep connection ID
    :param job_group_id: The ID of the job group that will be requested
    """

    template_fields: Sequence[str] = ("job_group_id",)

    def __init__(
        self,
        *,
        dataprep_conn_id: str = "dataprep_default",
        job_group_id: int | str,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.dataprep_conn_id = dataprep_conn_id
        self.job_group_id = job_group_id

    def execute(self, context: Context) -> dict:
        """Fetch the jobs of the configured job group through the hook and return its response."""
        # ``job_group_id`` is a templated ``int | str``. ``%d`` raises inside the
        # logging formatter for a str value, so the id is logged with ``%s``.
        self.log.info("Fetching data for job with id: %s ...", self.job_group_id)
        hook = GoogleDataprepHook(
            dataprep_conn_id=self.dataprep_conn_id,
        )
        # The hook is always handed an int, whatever form the id was supplied in.
        response = hook.get_jobs_for_job_group(job_id=int(self.job_group_id))
        return response
class DataprepGetJobGroupOperator(GoogleCloudBaseOperator):
    """
    Get the specified job group.

    A job group is a job that is executed from a specific node in a flow.

    API documentation: https://clouddataprep.com/documentation/api#section/Overview.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:DataprepGetJobGroupOperator`

    :param dataprep_conn_id: The Dataprep connection ID
    :param project_id: Project ID used when persisting the job group link; when it is
        falsy no link is persisted
    :param job_group_id: The ID of the job group that will be requested
    :param embed: Comma-separated list of objects to pull in as part of the response
    :param include_deleted: if set to "true", will include deleted objects
    """

    template_fields: Sequence[str] = (
        "job_group_id",
        "embed",
        "project_id",
    )

    def __init__(
        self,
        *,
        dataprep_conn_id: str = "dataprep_default",
        project_id: str = PROVIDE_PROJECT_ID,
        job_group_id: int | str,
        embed: str,
        include_deleted: bool,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.dataprep_conn_id: str = dataprep_conn_id
        self.project_id = project_id
        self.job_group_id = job_group_id
        self.embed = embed
        self.include_deleted = include_deleted

    def execute(self, context: Context) -> dict:
        """Persist the job group link (when a project is set) and return the hook's job group response."""
        # ``job_group_id`` is a templated ``int | str``. ``%d`` raises inside the
        # logging formatter for a str value, so the id is logged with ``%s``.
        self.log.info("Fetching data for job with id: %s ...", self.job_group_id)

        if self.project_id:
            DataprepJobGroupLink.persist(
                context=context,
                task_instance=self,
                project_id=self.project_id,
                job_group_id=int(self.job_group_id),
            )

        hook = GoogleDataprepHook(dataprep_conn_id=self.dataprep_conn_id)
        response = hook.get_job_group(
            job_group_id=int(self.job_group_id),
            embed=self.embed,
            include_deleted=self.include_deleted,
        )
        return response
class DataprepRunJobGroupOperator(GoogleCloudBaseOperator):
    """
    Create a ``jobGroup``, launching the specified job as the authenticated user.

    This is the same action as pressing the Run Job button in the application.

    The recipe_id can be found by following the Dataprep API documentation:
    https://clouddataprep.com/documentation/api#operation/runJobGroup.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:DataprepRunJobGroupOperator`

    :param project_id: Project ID used when persisting the job group link; when it is
        falsy no link is persisted
    :param dataprep_conn_id: The Dataprep connection ID
    :param body_request: Forwarded as ``body_request`` to GoogleDataprepHook's
        ``run_job_group``, where it identifies the recipe to run
    """

    template_fields: Sequence[str] = ("body_request",)

    def __init__(
        self,
        *,
        project_id: str = PROVIDE_PROJECT_ID,
        dataprep_conn_id: str = "dataprep_default",
        body_request: dict,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.project_id = project_id
        self.dataprep_conn_id = dataprep_conn_id
        self.body_request = body_request

    def execute(self, context: Context) -> dict:
        """Run the job group through the hook and return the hook's response."""
        self.log.info("Creating a job...")
        dataprep_hook = GoogleDataprepHook(dataprep_conn_id=self.dataprep_conn_id)
        result = dataprep_hook.run_job_group(body_request=self.body_request)

        created_id = result.get("id")
        # A link is only persisted when both a project and a job group id are available.
        if not (self.project_id and created_id):
            return result

        DataprepJobGroupLink.persist(
            context=context,
            task_instance=self,
            project_id=self.project_id,
            job_group_id=int(created_id),
        )
        return result
class DataprepCopyFlowOperator(GoogleCloudBaseOperator):
    """
    Create a copy of the provided flow id, as well as all contained recipes.

    :param project_id: Project ID used when persisting the flow link; when it is
        falsy no link is persisted
    :param dataprep_conn_id: The Dataprep connection ID
    :param flow_id: ID of the flow to be copied
    :param name: Name for the copy of the flow
    :param description: Description of the copy of the flow
    :param copy_datasources: Whether copies of the data inputs should be made as well.
    """

    template_fields: Sequence[str] = (
        "flow_id",
        "name",
        "project_id",
        "description",
    )

    def __init__(
        self,
        *,
        project_id: str = PROVIDE_PROJECT_ID,
        dataprep_conn_id: str = "dataprep_default",
        flow_id: int | str,
        name: str = "",
        description: str = "",
        copy_datasources: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.project_id = project_id
        self.dataprep_conn_id = dataprep_conn_id
        self.flow_id = flow_id
        self.name = name
        self.description = description
        self.copy_datasources = copy_datasources

    def execute(self, context: Context) -> dict:
        """Copy the flow through the hook and return the hook's response."""
        # ``flow_id`` is a templated ``int | str``. ``%d`` raises inside the
        # logging formatter for a str value, so the id is logged with ``%s``.
        self.log.info("Copying flow with id %s...", self.flow_id)
        hook = GoogleDataprepHook(dataprep_conn_id=self.dataprep_conn_id)
        response = hook.copy_flow(
            flow_id=int(self.flow_id),
            name=self.name,
            description=self.description,
            copy_datasources=self.copy_datasources,
        )

        copied_flow_id = response.get("id")
        # The link points at the new copy, so it needs the id from the response.
        if self.project_id and copied_flow_id:
            DataprepFlowLink.persist(
                context=context,
                task_instance=self,
                project_id=self.project_id,
                flow_id=int(copied_flow_id),
            )
        return response
class DataprepDeleteFlowOperator(GoogleCloudBaseOperator):
    """
    Delete the flow with provided id.

    :param dataprep_conn_id: The Dataprep connection ID
    :param flow_id: ID of the flow to be deleted
    """

    template_fields: Sequence[str] = ("flow_id",)

    def __init__(
        self,
        *,
        dataprep_conn_id: str = "dataprep_default",
        flow_id: int | str,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.dataprep_conn_id = dataprep_conn_id
        self.flow_id = flow_id

    def execute(self, context: Context) -> None:
        """Delete the configured flow through the hook."""
        # ``flow_id`` is a templated ``int | str``. ``%d`` raises inside the
        # logging formatter for a str value, so the id is logged with ``%s``.
        self.log.info("Start delete operation of the flow with id: %s...", self.flow_id)
        hook = GoogleDataprepHook(dataprep_conn_id=self.dataprep_conn_id)
        hook.delete_flow(flow_id=int(self.flow_id))
class DataprepRunFlowOperator(GoogleCloudBaseOperator):
    """
    Run the flow with the provided id.

    :param project_id: Project ID used when persisting the job group link; when it is
        falsy no link is persisted
    :param dataprep_conn_id: The Dataprep connection ID
    :param flow_id: ID of the flow to be run
    :param body_request: Body of the POST request to be sent.
    """

    template_fields: Sequence[str] = (
        "flow_id",
        "project_id",
    )

    def __init__(
        self,
        *,
        project_id: str = PROVIDE_PROJECT_ID,
        flow_id: int | str,
        body_request: dict,
        dataprep_conn_id: str = "dataprep_default",
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.project_id = project_id
        self.flow_id = flow_id
        self.body_request = body_request
        self.dataprep_conn_id = dataprep_conn_id

    def execute(self, context: Context) -> dict:
        """Run the flow through the hook and return the hook's response."""
        # ``flow_id`` is a templated ``int | str``. ``%d`` raises inside the
        # logging formatter for a str value, so the id is logged with ``%s``.
        self.log.info("Running the flow with id: %s...", self.flow_id)
        hook = GoogleDataprepHook(dataprep_conn_id=self.dataprep_conn_id)
        response = hook.run_flow(flow_id=int(self.flow_id), body_request=self.body_request)

        if self.project_id:
            # The link uses the id of the first entry under the response's "data" key.
            job_group_id = response["data"][0]["id"]
            DataprepJobGroupLink.persist(
                context=context,
                task_instance=self,
                project_id=self.project_id,
                job_group_id=int(job_group_id),
            )
        return response

Was this entry helpful?