Source code for tests.system.amazon.aws.example_glue

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

from datetime import datetime
from typing import TYPE_CHECKING

import boto3

from airflow.decorators import task
from airflow.models.baseoperator import chain
from airflow.models.dag import DAG
from airflow.providers.amazon.aws.operators.glue import GlueJobOperator
from airflow.providers.amazon.aws.operators.glue_crawler import GlueCrawlerOperator
from airflow.providers.amazon.aws.operators.s3 import (
    S3CreateBucketOperator,
    S3CreateObjectOperator,
    S3DeleteBucketOperator,
)
from airflow.providers.amazon.aws.sensors.glue import GlueJobSensor
from airflow.providers.amazon.aws.sensors.glue_catalog_partition import GlueCatalogPartitionSensor
from airflow.providers.amazon.aws.sensors.glue_crawler import GlueCrawlerSensor
from airflow.utils.trigger_rule import TriggerRule

from providers.tests.system.amazon.aws.utils import ENV_ID_KEY, SystemTestContextBuilder, prune_logs

if TYPE_CHECKING:
    from botocore.client import BaseClient

# Identifier of the example DAG defined in this module.
DAG_ID = "example_glue"

# Externally fetched variables:
# Role needs S3 putobject/getobject access as well as the glue service role,
# see docs here: https://docs.aws.amazon.com/glue/latest/dg/create-an-iam-role.html
ROLE_ARN_KEY = "ROLE_ARN"

# Built once at import time; calling it inside the DAG yields the test context
# that is indexed below by ENV_ID_KEY and ROLE_ARN_KEY.
sys_test_context_task = SystemTestContextBuilder().add_variable(ROLE_ARN_KEY).build()
# Example csv data used as input to the example AWS Glue Job.
# One record per line: a header row followed by three product rows.
EXAMPLE_CSV = """product,value
apple,0.5
milk,2.5
bread,4.0
"""

# Example Spark script to operate on the above sample csv data.
# The ``{db_name}`` and ``{bucket_name}`` placeholders are filled in with
# ``str.format`` before the script is uploaded, so the text must not contain
# any other literal braces.
EXAMPLE_SCRIPT = """
from pyspark.context import SparkContext
from awsglue.context import GlueContext

glueContext = GlueContext(SparkContext.getOrCreate())
datasource = glueContext.create_dynamic_frame.from_catalog(
             database='{db_name}', table_name='input')
print('There are %s items in the table' % datasource.count())

datasource.toDF().write.format('csv').mode("append").save('s3://{bucket_name}/output')
"""
@task
def get_role_name(arn: str) -> str:
    """
    Return the part of *arn* that follows its last ``/``.

    When *arn* contains no ``/`` the whole string is returned unchanged.

    :param arn: ARN string whose trailing path component is wanted.
    :return: The text after the final ``/`` in *arn*.
    """
    _, _, role_name = arn.rpartition("/")
    return role_name
@task(trigger_rule=TriggerRule.ALL_DONE)
def glue_cleanup(crawler_name: str, job_name: str, db_name: str) -> None:
    """
    Delete the Glue crawler, job and database used by this system test.

    Every deletion is attempted even when an earlier one fails, so that one
    failing call (for example because a resource was never created) does not
    leave the remaining resources behind. The first error encountered is
    re-raised once all deletions have been tried, so the task still fails.

    :param crawler_name: Name of the Glue crawler to delete.
    :param job_name: Name of the Glue job to delete.
    :param db_name: Name of the Glue database to delete.
    :raises Exception: The first error raised by any of the delete calls.
    """
    import logging

    log = logging.getLogger(__name__)
    client: BaseClient = boto3.client("glue")

    # Same order as before: crawler, then job, then database.
    deletions = (
        ("crawler", client.delete_crawler, {"Name": crawler_name}),
        ("job", client.delete_job, {"JobName": job_name}),
        ("database", client.delete_database, {"Name": db_name}),
    )

    first_error = None
    for resource_kind, delete, kwargs in deletions:
        try:
            delete(**kwargs)
        except Exception as err:
            # Keep going so the remaining resources are still cleaned up;
            # the failure is surfaced after the loop.
            log.exception("Failed to delete Glue %s", resource_kind)
            if first_error is None:
                first_error = err

    if first_error is not None:
        raise first_error
# System-test DAG: stage input data and an ETL script in S3, crawl the data
# into the Glue catalog, run a Glue job over it, then tear everything down.
with DAG(
    dag_id=DAG_ID,
    schedule="@once",
    start_date=datetime(2021, 1, 1),
    tags=["example"],
    catchup=False,
) as dag:
    test_context = sys_test_context_task()
    env_id = test_context[ENV_ID_KEY]
    role_arn = test_context[ROLE_ARN_KEY]

    # All resource names are derived from the environment id.
    glue_crawler_name = f"{env_id}_crawler"
    glue_db_name = f"{env_id}_glue_db"
    glue_job_name = f"{env_id}_glue_job"
    bucket_name = f"{env_id}-bucket"

    role_name = get_role_name(role_arn)

    glue_crawler_config = {
        "Name": glue_crawler_name,
        "Role": role_arn,
        "DatabaseName": glue_db_name,
        "Targets": {"S3Targets": [{"Path": f"{bucket_name}/input"}]},
    }

    create_bucket = S3CreateBucketOperator(
        task_id="create_bucket",
        bucket_name=bucket_name,
    )

    upload_csv = S3CreateObjectOperator(
        task_id="upload_csv",
        s3_bucket=bucket_name,
        s3_key="input/category=mixed/input.csv",
        data=EXAMPLE_CSV,
        replace=True,
    )

    upload_script = S3CreateObjectOperator(
        task_id="upload_script",
        s3_bucket=bucket_name,
        s3_key="etl_script.py",
        data=EXAMPLE_SCRIPT.format(db_name=glue_db_name, bucket_name=bucket_name),
        replace=True,
    )

    # [START howto_operator_glue_crawler]
    crawl_s3 = GlueCrawlerOperator(
        task_id="crawl_s3",
        config=glue_crawler_config,
    )
    # [END howto_operator_glue_crawler]

    # GlueCrawlerOperator waits by default, setting as False to test the Sensor below.
    crawl_s3.wait_for_completion = False

    # [START howto_sensor_glue_crawler]
    wait_for_crawl = GlueCrawlerSensor(
        task_id="wait_for_crawl",
        crawler_name=glue_crawler_name,
    )
    # [END howto_sensor_glue_crawler]
    wait_for_crawl.timeout = 500

    # [START howto_sensor_glue_catalog_partition]
    wait_for_catalog_partition = GlueCatalogPartitionSensor(
        task_id="wait_for_catalog_partition",
        table_name="input",
        database_name=glue_db_name,
        expression="category='mixed'",
    )
    # [END howto_sensor_glue_catalog_partition]

    # [START howto_operator_glue]
    submit_glue_job = GlueJobOperator(
        task_id="submit_glue_job",
        job_name=glue_job_name,
        script_location=f"s3://{bucket_name}/etl_script.py",
        s3_bucket=bucket_name,
        iam_role_name=role_name,
        create_job_kwargs={"GlueVersion": "3.0", "NumberOfWorkers": 2, "WorkerType": "G.1X"},
    )
    # [END howto_operator_glue]

    # GlueJobOperator waits by default, setting as False to test the Sensor below.
    submit_glue_job.wait_for_completion = False

    # [START howto_sensor_glue]
    wait_for_job = GlueJobSensor(
        task_id="wait_for_job",
        job_name=glue_job_name,
        # Job ID extracted from previous Glue Job Operator task
        run_id=submit_glue_job.output,
        verbose=True,  # prints glue job logs in airflow logs
    )
    # [END howto_sensor_glue]
    wait_for_job.poke_interval = 5

    delete_bucket = S3DeleteBucketOperator(
        task_id="delete_bucket",
        trigger_rule=TriggerRule.ALL_DONE,
        bucket_name=bucket_name,
        force_delete=True,
    )

    log_cleanup = prune_logs(
        [
            # Format: ('log group name', 'log stream prefix')
            ("/aws-glue/crawlers", glue_crawler_name),
            ("/aws-glue/jobs/logs-v2", submit_glue_job.output),
            ("/aws-glue/jobs/error", submit_glue_job.output),
            ("/aws-glue/jobs/output", submit_glue_job.output),
        ]
    )

    chain(
        # TEST SETUP
        test_context,
        create_bucket,
        upload_csv,
        upload_script,
        # TEST BODY
        crawl_s3,
        wait_for_crawl,
        wait_for_catalog_partition,
        submit_glue_job,
        wait_for_job,
        # TEST TEARDOWN
        glue_cleanup(glue_crawler_name, glue_job_name, glue_db_name),
        delete_bucket,
        log_cleanup,
    )

    from dev.tests_common.test_utils.watcher import watcher

    # This test needs watcher in order to properly mark success/failure
    # when "tearDown" task with trigger rule is part of the DAG
    list(dag.tasks) >> watcher()

from dev.tests_common.test_utils.system_tests import get_test_run  # noqa: E402

# Needed to run the example DAG with pytest (see: tests/system/README.md#run_via_pytest)
test_run = get_test_run(dag)

Was this entry helpful?