
Source code for tests.system.google.cloud.bigquery.example_bigquery_to_postgres

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Example Airflow DAG for Google BigQuery service.

This DAG relies on the following OS environment variables:

* AIRFLOW__API__GOOGLE_KEY_PATH - Path to the service account key file. Note that you can skip this
  variable if you run this DAG in a Cloud Composer environment.
"""

from __future__ import annotations

import logging
import os
from datetime import datetime

from pendulum import duration

from tests_common.test_utils.version_compat import AIRFLOW_V_3_0_PLUS

if AIRFLOW_V_3_0_PLUS:
    from airflow.sdk import task
else:
    # Airflow 2 path
    from airflow.decorators import task  # type: ignore[attr-defined,no-redef]
from airflow.models.dag import DAG
from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator
from airflow.providers.google.cloud.hooks.compute import ComputeEngineHook
from airflow.providers.google.cloud.hooks.compute_ssh import ComputeEngineSSHHook
from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryCreateEmptyDatasetOperator,
    BigQueryCreateTableOperator,
    BigQueryDeleteDatasetOperator,
)
from airflow.providers.google.cloud.operators.compute import (
    ComputeEngineDeleteInstanceOperator,
    ComputeEngineInsertInstanceOperator,
)
from airflow.providers.google.cloud.transfers.bigquery_to_postgres import BigQueryToPostgresOperator
from airflow.providers.ssh.operators.ssh import SSHOperator
from airflow.providers.standard.operators.bash import BashOperator
from airflow.utils.trigger_rule import TriggerRule

from tests_common.test_utils.api_client_helpers import create_airflow_connection, delete_airflow_connection

ENV_ID = os.environ.get("SYSTEM_TESTS_ENV_ID", "default")
PROJECT_ID = os.environ.get("SYSTEM_TESTS_GCP_PROJECT", "example-project")
DAG_ID = "bigquery_to_postgres"

REGION = "us-west2"
ZONE = REGION + "-a"
NETWORK = "default"
CONNECTION_ID = f"connection_{DAG_ID}_{ENV_ID}".replace("-", "_")
CONNECTION_TYPE = "postgres"

BIGQUERY_DATASET_NAME = f"ds_{DAG_ID}_{ENV_ID}".replace("-", "_")
BIGQUERY_TABLE = "test_table"
SOURCE_OBJECT_NAME = "gs://airflow-system-tests-resources/bigquery/salaries_1k.csv"
BATCH_SIZE = 500

UPLOAD_DATA_TO_BIGQUERY = f"""
if [ $AIRFLOW__API__GOOGLE_KEY_PATH ]; then \
 gcloud auth activate-service-account --key-file=$AIRFLOW__API__GOOGLE_KEY_PATH; \
fi;
bq load --project_id={PROJECT_ID} --location={REGION} \
 --source_format=CSV {BIGQUERY_DATASET_NAME}.{BIGQUERY_TABLE} {SOURCE_OBJECT_NAME} \
 emp_name:STRING,salary:FLOAT
"""
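# Note: the trailing "emp_name:STRING,salary:FLOAT" argument of bq load is an inline schema
# given as comma-separated name:type pairs; it mirrors the SCHEMA definition used for the
# BigQuery table below.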
DB_NAME = "testdb"
DB_PORT = 5432
DB_USER_NAME = "root"
DB_USER_PASSWORD = "demo_password"

SCHEMA = [
    {"name": "emp_name", "type": "STRING", "mode": "NULLABLE"},
    {"name": "salary", "type": "FLOAT", "mode": "NULLABLE"},
]

SETUP_POSTGRES_COMMAND = f"""
sudo apt update &&
sudo apt install -y docker.io &&
sudo docker run -d -p {DB_PORT}:{DB_PORT} --name {DB_NAME} \
 -e PGUSER={DB_USER_NAME} \
 -e POSTGRES_USER={DB_USER_NAME} \
 -e POSTGRES_PASSWORD={DB_USER_PASSWORD} \
 -e POSTGRES_DB={DB_NAME} \
 postgres
"""
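# The command above installs Docker on the GCE instance and starts a throwaway PostgreSQL
# container listening on DB_PORT, using the credentials and database name defined above; the
# Airflow connection created later in the DAG points at this instance.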
SQL_TABLE = "test_table"
SQL_CREATE_TABLE = f"CREATE TABLE IF NOT EXISTS {SQL_TABLE} (emp_name VARCHAR(64), salary FLOAT)"

GCE_MACHINE_TYPE = "n1-standard-1"
GCE_INSTANCE_NAME = f"instance-{DAG_ID}-{ENV_ID}".replace("_", "-")
GCE_INSTANCE_BODY = {
    "name": GCE_INSTANCE_NAME,
    "machine_type": f"zones/{ZONE}/machineTypes/{GCE_MACHINE_TYPE}",
    "disks": [
        {
            "boot": True,
            "auto_delete": True,
            "device_name": GCE_INSTANCE_NAME,
            "initialize_params": {
                "disk_size_gb": "10",
                "disk_type": f"zones/{ZONE}/diskTypes/pd-balanced",
                # The source image can become outdated and stop being supported by apt software packages.
                # In that case the image version will need to be updated.
                "source_image": "projects/debian-cloud/global/images/debian-12-bookworm-v20240611",
            },
        }
    ],
    "network_interfaces": [
        {
            "access_configs": [{"name": "External NAT", "network_tier": "PREMIUM"}],
            "stack_type": "IPV4_ONLY",
            "subnetwork": f"regions/{REGION}/subnetworks/default",
        }
    ],
}

FIREWALL_RULE_NAME = f"allow-http-{DB_PORT}-{DAG_ID}-{ENV_ID}".replace("_", "-")
CREATE_FIREWALL_RULE_COMMAND = f"""
if [ $AIRFLOW__API__GOOGLE_KEY_PATH ]; then \
 gcloud auth activate-service-account --key-file=$AIRFLOW__API__GOOGLE_KEY_PATH; \
fi;

if [ -z $(gcloud compute firewall-rules list --filter=name:{FIREWALL_RULE_NAME} --format="value(name)" --project={PROJECT_ID}) ]; then \
    gcloud compute firewall-rules create {FIREWALL_RULE_NAME} \
      --project={PROJECT_ID} \
      --direction=INGRESS \
      --priority=100 \
      --network={NETWORK} \
      --action=ALLOW \
      --rules=tcp:{DB_PORT} \
      --source-ranges=0.0.0.0/0
else
    echo "Firewall rule {FIREWALL_RULE_NAME} already exists."
fi
"""

DELETE_FIREWALL_RULE_COMMAND = f"""
if [ $AIRFLOW__API__GOOGLE_KEY_PATH ]; then \
 gcloud auth activate-service-account --key-file=$AIRFLOW__API__GOOGLE_KEY_PATH; \
fi; \
if [ $(gcloud compute firewall-rules list --filter=name:{FIREWALL_RULE_NAME} --format="value(name)" --project={PROJECT_ID}) ]; then \
    gcloud compute firewall-rules delete {FIREWALL_RULE_NAME} --project={PROJECT_ID} --quiet; \
fi;
"""
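# The firewall rule created above opens the database port (tcp:DB_PORT) to any source address
# (0.0.0.0/0) so that Airflow workers can reach the temporary database; it exists only for the
# duration of the test and is removed again by the teardown tasks.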
log = logging.getLogger(__name__)
with DAG(
    DAG_ID,
    schedule="@once",  # Override to match your needs
    start_date=datetime(2021, 1, 1),
    catchup=False,
    tags=["example", "bigquery", "postgres"],
) as dag:
    create_bigquery_dataset = BigQueryCreateEmptyDatasetOperator(
        task_id="create_bigquery_dataset",
        dataset_id=BIGQUERY_DATASET_NAME,
        location=REGION,
    )

    create_bigquery_table = BigQueryCreateTableOperator(
        task_id="create_bigquery_table",
        dataset_id=BIGQUERY_DATASET_NAME,
        table_id=BIGQUERY_TABLE,
        table_resource={
            "schema": {"fields": SCHEMA},
        },
    )

    insert_bigquery_data = BashOperator(
        task_id="insert_bigquery_data",
        bash_command=UPLOAD_DATA_TO_BIGQUERY,
    )

    create_gce_instance = ComputeEngineInsertInstanceOperator(
        task_id="create_gce_instance",
        project_id=PROJECT_ID,
        zone=ZONE,
        body=GCE_INSTANCE_BODY,
    )

    create_firewall_rule = BashOperator(
        task_id="create_firewall_rule",
        bash_command=CREATE_FIREWALL_RULE_COMMAND,
    )

    setup_postgres = SSHOperator(
        task_id="setup_postgres",
        ssh_hook=ComputeEngineSSHHook(
            user="username",
            instance_name=GCE_INSTANCE_NAME,
            zone=ZONE,
            project_id=PROJECT_ID,
            use_oslogin=False,
            use_iap_tunnel=False,
            cmd_timeout=180,
        ),
        command=SETUP_POSTGRES_COMMAND,
        retries=4,
    )

    @task
    def get_public_ip() -> str:
        hook = ComputeEngineHook()
        address = hook.get_instance_address(resource_id=GCE_INSTANCE_NAME, zone=ZONE, project_id=PROJECT_ID)
        return address

    get_public_ip_task = get_public_ip()

    @task
    def create_connection(connection_id: str, ip_address: str) -> None:
        create_airflow_connection(
            connection_id=connection_id,
            connection_conf={
                "description": "Example connection",
                "conn_type": CONNECTION_TYPE,
                "host": ip_address,
                "schema": DB_NAME,
                "login": DB_USER_NAME,
                "password": DB_USER_PASSWORD,
                "port": DB_PORT,
            },
        )

    create_connection_task = create_connection(connection_id=CONNECTION_ID, ip_address=get_public_ip_task)

    create_pg_table = SQLExecuteQueryOperator(
        task_id="create_pg_table",
        conn_id=CONNECTION_ID,
        sql=SQL_CREATE_TABLE,
        retries=4,
        retry_delay=duration(seconds=20),
        retry_exponential_backoff=False,
    )

    # [START howto_operator_bigquery_to_postgres]
    bigquery_to_postgres = BigQueryToPostgresOperator(
        task_id="bigquery_to_postgres",
        postgres_conn_id=CONNECTION_ID,
        dataset_table=f"{BIGQUERY_DATASET_NAME}.{BIGQUERY_TABLE}",
        target_table_name=SQL_TABLE,
        batch_size=BATCH_SIZE,
        replace=False,
    )
    # [END howto_operator_bigquery_to_postgres]

    update_pg_table_data = SQLExecuteQueryOperator(
        task_id="update_pg_table_data",
        conn_id=CONNECTION_ID,
        sql=f"UPDATE {SQL_TABLE} SET salary = salary + 0.5 WHERE salary < 10000.0",
        retries=4,
        retry_delay=duration(seconds=20),
        retry_exponential_backoff=False,
    )

    create_unique_index_in_pg_table = SQLExecuteQueryOperator(
        task_id="create_unique_index_in_pg_table",
        conn_id=CONNECTION_ID,
        sql=f"CREATE UNIQUE INDEX emp_salary ON {SQL_TABLE}(emp_name, salary);",
        retries=4,
        retry_delay=duration(seconds=20),
        retry_exponential_backoff=False,
        show_return_value_in_logs=True,
    )

    # [START howto_operator_bigquery_to_postgres_upsert]
    bigquery_to_postgres_upsert = BigQueryToPostgresOperator(
        task_id="bigquery_to_postgres_upsert",
        postgres_conn_id=CONNECTION_ID,
        dataset_table=f"{BIGQUERY_DATASET_NAME}.{BIGQUERY_TABLE}",
        target_table_name=SQL_TABLE,
        batch_size=BATCH_SIZE,
        replace=True,
        selected_fields=["emp_name", "salary"],
        replace_index=["emp_name", "salary"],
    )
    # [END howto_operator_bigquery_to_postgres_upsert]

    delete_bigquery_dataset = BigQueryDeleteDatasetOperator(
        task_id="delete_bigquery_dataset",
        dataset_id=BIGQUERY_DATASET_NAME,
        delete_contents=True,
        trigger_rule=TriggerRule.ALL_DONE,
    )

    delete_firewall_rule = BashOperator(
        task_id="delete_firewall_rule",
        bash_command=DELETE_FIREWALL_RULE_COMMAND,
        trigger_rule=TriggerRule.ALL_DONE,
    )

    delete_gce_instance = ComputeEngineDeleteInstanceOperator(
        task_id="delete_gce_instance",
        resource_id=GCE_INSTANCE_NAME,
        zone=ZONE,
        project_id=PROJECT_ID,
        trigger_rule=TriggerRule.ALL_DONE,
    )
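    # The three delete operators above use TriggerRule.ALL_DONE so that the temporary GCP
    # resources are cleaned up even if one of the upstream test tasks fails.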
@task(task_id="delete_connection") def delete_connection(connection_id: str) -> None: delete_airflow_connection(connection_id=connection_id) delete_connection_task = delete_connection(connection_id=CONNECTION_ID) ( # TEST SETUP create_gce_instance >> create_bigquery_dataset >> create_bigquery_table >> insert_bigquery_data >> get_public_ip_task >> create_connection_task >> create_firewall_rule >> setup_postgres >> create_pg_table # TEST BODY >> bigquery_to_postgres >> update_pg_table_data >> create_unique_index_in_pg_table >> bigquery_to_postgres_upsert # TEST TEARDOWN >> [ delete_bigquery_dataset, delete_firewall_rule, delete_gce_instance, delete_connection_task, ] ) from tests_common.test_utils.watcher import watcher # This test needs watcher in order to properly mark success/failure # when "tearDown" task with trigger rule is part of the DAG list(dag.tasks) >> watcher() from tests_common.test_utils.system_tests import get_test_run # noqa: E402 # Needed to run the example DAG with pytest (see: tests/system/README.md#run_via_pytest)
test_run = get_test_run(dag)
