Airflow Summit 2025 is coming October 07-09. Register now for early bird ticket!

Source code for tests.system.google.cloud.bigquery.example_bigquery_to_mssql

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Example Airflow DAG for Google BigQuery service.

This DAG relies on the following OS environment variables

* AIRFLOW__API__GOOGLE_KEY_PATH - Path to service account key file. Note, you can skip this variable if you
  run this DAG in a Composer environment.
"""

from __future__ import annotations

import logging
import os
from datetime import datetime

import pytest
from pendulum import duration

from tests_common.test_utils.version_compat import AIRFLOW_V_3_0_PLUS

if AIRFLOW_V_3_0_PLUS:
    from airflow.sdk import task
else:
    # Airflow 2 path
    from airflow.decorators import task  # type: ignore[attr-defined,no-redef]

from airflow.models.dag import DAG
from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator
from airflow.providers.google.cloud.hooks.compute import ComputeEngineHook
from airflow.providers.google.cloud.hooks.compute_ssh import ComputeEngineSSHHook
from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryCreateEmptyDatasetOperator,
    BigQueryCreateTableOperator,
    BigQueryDeleteDatasetOperator,
    BigQueryInsertJobOperator,
)
from airflow.providers.google.cloud.operators.compute import (
    ComputeEngineDeleteInstanceOperator,
    ComputeEngineInsertInstanceOperator,
)
from airflow.providers.ssh.operators.ssh import SSHOperator
from airflow.providers.standard.operators.bash import BashOperator
from airflow.utils.trigger_rule import TriggerRule

from tests_common.test_utils.api_client_helpers import create_airflow_connection, delete_airflow_connection

try:
    from airflow.providers.google.cloud.transfers.bigquery_to_mssql import BigQueryToMsSqlOperator
except ImportError:
    pytest.skip("MsSQL not available", allow_module_level=True)

[docs] ENV_ID = os.environ.get("SYSTEM_TESTS_ENV_ID", "default")
[docs] PROJECT_ID = os.environ.get("SYSTEM_TESTS_GCP_PROJECT", "example-project")
[docs] DAG_ID = "bigquery_to_mssql"
[docs] REGION = "us-west2"
[docs] ZONE = REGION + "-a"
[docs] NETWORK = "default"
[docs] CONNECTION_ID = f"connection_{DAG_ID}_{ENV_ID}".replace("-", "_")
[docs] CONNECTION_TYPE = "mssql"
[docs] BIGQUERY_DATASET_NAME = f"dataset_{DAG_ID}_{ENV_ID}".replace("-", "_")
[docs] BIGQUERY_TABLE = "table_42"
[docs] INSERT_ROWS_QUERY = ( f"INSERT INTO {BIGQUERY_DATASET_NAME}.{BIGQUERY_TABLE} (emp_name, salary) " "VALUES ('emp 1', 10000), ('emp 2', 15000);" )
[docs] SCHEMA = [ {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"}, {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}, ]
[docs] DB_PORT = 1433
[docs] DB_USER_NAME = "sa"
[docs] DB_USER_PASSWORD = "5FHq4fSZ85kK6g0n"
[docs] SETUP_MSSQL_COMMAND = f""" sudo apt update && sudo apt install -y docker.io && sudo docker run -e ACCEPT_EULA=Y -e MSSQL_SA_PASSWORD={DB_USER_PASSWORD} -p {DB_PORT}:{DB_PORT} \ -d mcr.microsoft.com/mssql/server:2022-latest """
[docs] SQL_TABLE = "test_table"
[docs] SQL_CREATE_TABLE = f"""if not exists (select * from sys.tables where sys.tables.name='{SQL_TABLE}' and sys.tables.type='U') create table {SQL_TABLE} ( emp_name VARCHAR(8), salary INT ) """
[docs] GCE_MACHINE_TYPE = "n1-standard-1"
[docs] GCE_INSTANCE_NAME = f"instance-{DAG_ID}-{ENV_ID}".replace("_", "-")
[docs] GCE_INSTANCE_BODY = { "name": GCE_INSTANCE_NAME, "machine_type": f"zones/{ZONE}/machineTypes/{GCE_MACHINE_TYPE}", "disks": [ { "boot": True, "auto_delete": True, "device_name": GCE_INSTANCE_NAME, "initialize_params": { "disk_size_gb": "10", "disk_type": f"zones/{ZONE}/diskTypes/pd-balanced", "source_image": "projects/debian-cloud/global/images/debian-12-bookworm-v20240611", }, } ], "network_interfaces": [ { "access_configs": [{"name": "External NAT", "network_tier": "PREMIUM"}], "stack_type": "IPV4_ONLY", "subnetwork": f"regions/{REGION}/subnetworks/default", } ], }
[docs] FIREWALL_RULE_NAME = f"allow-http-{DB_PORT}-{DAG_ID}-{ENV_ID}".replace("_", "-")
[docs] CREATE_FIREWALL_RULE_COMMAND = f""" if [ $AIRFLOW__API__GOOGLE_KEY_PATH ]; then \ gcloud auth activate-service-account --key-file=$AIRFLOW__API__GOOGLE_KEY_PATH; \ fi; if [ -z $(gcloud compute firewall-rules list --filter=name:{FIREWALL_RULE_NAME} --format="value(name)" --project={PROJECT_ID}) ]; then \ gcloud compute firewall-rules create {FIREWALL_RULE_NAME} \ --project={PROJECT_ID} \ --direction=INGRESS \ --priority=100 \ --network={NETWORK} \ --action=ALLOW \ --rules=tcp:{DB_PORT} \ --source-ranges=0.0.0.0/0 else echo "Firewall rule {FIREWALL_RULE_NAME} already exists." fi """
[docs] DELETE_FIREWALL_RULE_COMMAND = f""" if [ $AIRFLOW__API__GOOGLE_KEY_PATH ]; then \ gcloud auth activate-service-account --key-file=$AIRFLOW__API__GOOGLE_KEY_PATH; \ fi; \ if [ $(gcloud compute firewall-rules list --filter=name:{FIREWALL_RULE_NAME} --format="value(name)" --project={PROJECT_ID}) ]; then \ gcloud compute firewall-rules delete {FIREWALL_RULE_NAME} --project={PROJECT_ID} --quiet; \ fi; """
[docs] log = logging.getLogger(__name__)
with DAG( DAG_ID, schedule="@once", # Override to match your needs start_date=datetime(2021, 1, 1), catchup=False, tags=["example", "bigquery"], ) as dag:
[docs] create_bigquery_dataset = BigQueryCreateEmptyDatasetOperator( task_id="create_bigquery_dataset", dataset_id=BIGQUERY_DATASET_NAME )
create_bigquery_table = BigQueryCreateTableOperator( task_id="create_bigquery_table", dataset_id=BIGQUERY_DATASET_NAME, table_id=BIGQUERY_TABLE, table_resource={ "schema": {"fields": SCHEMA}, }, ) insert_bigquery_data = BigQueryInsertJobOperator( task_id="insert_bigquery_data", configuration={ "query": { "query": INSERT_ROWS_QUERY, "useLegacySql": False, "priority": "BATCH", } }, ) create_gce_instance = ComputeEngineInsertInstanceOperator( task_id="create_gce_instance", project_id=PROJECT_ID, zone=ZONE, body=GCE_INSTANCE_BODY, ) create_firewall_rule = BashOperator( task_id="create_firewall_rule", bash_command=CREATE_FIREWALL_RULE_COMMAND, ) setup_mssql = SSHOperator( task_id="setup_mssql", ssh_hook=ComputeEngineSSHHook( user="username", instance_name=GCE_INSTANCE_NAME, zone=ZONE, project_id=PROJECT_ID, use_oslogin=False, use_iap_tunnel=False, cmd_timeout=180, ), command=SETUP_MSSQL_COMMAND, retries=4, ) @task def get_public_ip() -> str: hook = ComputeEngineHook() address = hook.get_instance_address(resource_id=GCE_INSTANCE_NAME, zone=ZONE, project_id=PROJECT_ID) return address get_public_ip_task = get_public_ip() @task def create_connection(connection_id: str, ip_address: str) -> None: create_airflow_connection( connection_id=connection_id, connection_conf={ "description": "Example connection", "conn_type": CONNECTION_TYPE, "host": ip_address, "login": DB_USER_NAME, "password": DB_USER_PASSWORD, "port": DB_PORT, }, ) create_connection_task = create_connection(connection_id=CONNECTION_ID, ip_address=get_public_ip_task) create_sql_table = SQLExecuteQueryOperator( task_id="create_sql_table", conn_id=CONNECTION_ID, sql=SQL_CREATE_TABLE, retries=4, retry_delay=duration(seconds=20), retry_exponential_backoff=False, ) # [START howto_operator_bigquery_to_mssql] bigquery_to_mssql = BigQueryToMsSqlOperator( task_id="bigquery_to_mssql", mssql_conn_id=CONNECTION_ID, source_project_dataset_table=f"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}.{BIGQUERY_TABLE}", target_table_name=SQL_TABLE, replace=False, ) # [END howto_operator_bigquery_to_mssql] delete_bigquery_dataset = BigQueryDeleteDatasetOperator( task_id="delete_bigquery_dataset", dataset_id=BIGQUERY_DATASET_NAME, delete_contents=True, trigger_rule=TriggerRule.ALL_DONE, ) delete_firewall_rule = BashOperator( task_id="delete_firewall_rule", bash_command=DELETE_FIREWALL_RULE_COMMAND, trigger_rule=TriggerRule.ALL_DONE, ) delete_gce_instance = ComputeEngineDeleteInstanceOperator( task_id="delete_gce_instance", resource_id=GCE_INSTANCE_NAME, zone=ZONE, project_id=PROJECT_ID, trigger_rule=TriggerRule.ALL_DONE, ) @task(task_id="delete_connection") def delete_connection(connection_id: str) -> None: delete_airflow_connection(connection_id=connection_id) delete_connection_task = delete_connection(connection_id=CONNECTION_ID) ( # TEST SETUP create_gce_instance >> create_bigquery_dataset >> create_bigquery_table >> insert_bigquery_data >> get_public_ip_task >> create_connection_task >> create_firewall_rule >> setup_mssql >> create_sql_table # TEST BODY >> bigquery_to_mssql # TEST TEARDOWN >> [ delete_bigquery_dataset, delete_firewall_rule, delete_gce_instance, delete_connection_task, ] ) from tests_common.test_utils.watcher import watcher # This test needs watcher in order to properly mark success/failure # when "tearDown" task with trigger rule is part of the DAG list(dag.tasks) >> watcher() from tests_common.test_utils.system_tests import get_test_run # noqa: E402 # Needed to run the example DAG with pytest (see: tests/system/README.md#run_via_pytest)
[docs] test_run = get_test_run(dag)

Was this entry helpful?