from __future__ import annotations
import os
from collections import defaultdict
from typing import Callable
import pytest
from sqlalchemy.sql import select
from airflow.datasets import BaseDataset, Dataset, DatasetAll, DatasetAny
from airflow.models.dataset import DatasetDagRunQueue, DatasetModel
from airflow.models.serialized_dag import SerializedDagModel
from airflow.operators.empty import EmptyOperator
from airflow.serialization.serialized_objects import BaseSerialization, SerializedDAG
@pytest.fixture
def clear_datasets():
    """Run ``clear_db_datasets`` before and after the requesting test.

    Tests opt in by listing ``clear_datasets`` in their argument list.
    """
    # Imported lazily so the helper is only loaded when the fixture is used.
    from tests.test_utils.db import clear_db_datasets

    # NOTE(review): setup/teardown calls reconstructed from the otherwise
    # unused import and the fixture's name -- confirm against the original.
    clear_db_datasets()
    yield
    clear_db_datasets()
@pytest.mark.parametrize(
    "uri",
    [
        pytest.param("", id="empty"),
        pytest.param("\n\t", id="whitespace"),
        pytest.param("a" * 3001, id="too_long"),
        pytest.param("airflow://xcom/dag/task", id="reserved_scheme"),
        pytest.param("😊", id="non-ascii"),
    ],
)
def test_invalid_uris(uri):
    """Building a Dataset from each listed URI must raise ``ValueError``."""
    with pytest.raises(ValueError):
        Dataset(uri=uri)


@pytest.mark.parametrize(
    "uri, normalized",
    [
        pytest.param("foobar", "foobar", id="scheme-less"),
        pytest.param("foo:bar", "foo:bar", id="scheme-less-colon"),
        pytest.param("foo/bar", "foo/bar", id="scheme-less-slash"),
        pytest.param("s3://bucket/key/path", "s3://bucket/key/path", id="normal"),
        pytest.param("file:///123/456/", "file:///123/456", id="trailing-slash"),
    ],
)
def test_uri_with_scheme(uri: str, normalized: str) -> None:
    """The dataset's ``uri`` and its ``os.fspath`` form equal the expected normalized URI."""
    dataset = Dataset(uri)
    # The dataset must also be accepted as a task outlet.
    EmptyOperator(task_id="task1", outlets=[dataset])
    assert dataset.uri == normalized
    assert os.fspath(dataset) == normalized
def test_uri_with_auth() -> None:
    """A URI carrying auth info emits one ``UserWarning`` and the auth part is dropped."""
    with pytest.warns(UserWarning) as record:
        dataset = Dataset("ftp://user@localhost/foo.txt")

    # Exactly one warning, with the exact user-facing wording.
    assert len(record) == 1
    assert str(record[0].message) == (
        "A dataset URI should not contain auth info (e.g. username or "
        "password). It has been automatically dropped."
    )

    EmptyOperator(task_id="task1", outlets=[dataset])
    # "user@" is gone from both the stored URI and the fspath form.
    assert dataset.uri == "ftp://localhost/foo.txt"
    assert os.fspath(dataset) == "ftp://localhost/foo.txt"
def test_uri_without_scheme():
    """A scheme-less URI can be turned into a Dataset and used as a task outlet."""
    scheme_less = Dataset(uri="example_dataset")
    EmptyOperator(task_id="task1", outlets=[scheme_less])
def test_fspath():
    """``os.fspath`` on a Dataset returns the URI it was built from."""
    expected = "s3://example/dataset"
    assert os.fspath(Dataset(uri=expected)) == expected
def test_equal_when_same_uri():
    """Two Dataset objects built from the same URI compare equal."""
    shared_uri = "s3://example/dataset"
    left = Dataset(uri=shared_uri)
    right = Dataset(uri=shared_uri)
    assert left == right
def test_not_equal_when_different_uri():
    """Dataset objects built from different URIs compare unequal."""
    example = Dataset(uri="s3://example/dataset")
    other = Dataset(uri="s3://other/dataset")
    assert example != other
def test_hash():
    """A Dataset must be hashable; ``hash()`` raises ``TypeError`` otherwise."""
    uri = "s3://example/dataset"
    dataset = Dataset(uri=uri)
    # The call itself is the check: an unhashable object would raise here.
    hash(dataset)
def test_dataset_logic_operations():
    """``|`` on two datasets builds a DatasetAny; ``&`` builds a DatasetAll."""
    assert isinstance(dataset1 | dataset2, DatasetAny)
    assert isinstance(dataset1 & dataset2, DatasetAll)
def test_dataset_iter_datasets():
    """A single Dataset iterates as exactly one ``(uri, dataset)`` pair."""
    pairs = list(dataset1.iter_datasets())
    assert pairs == [("s3://bucket1/data1", dataset1)]
def test_dataset_evaluate():
    """A single Dataset evaluates to the status recorded for its own URI."""
    uri = "s3://bucket1/data1"
    for status in (True, False):
        assert dataset1.evaluate({uri: status}) is status
def test_dataset_any_operations():
    """Combining a DatasetAny with another dataset via ``|`` and ``&``."""
    chained_or = (dataset1 | dataset2) | dataset3
    assert isinstance(chained_or, DatasetAny)
    # Chaining ``|`` onto a DatasetAny yields one condition holding three objects.
    assert len(chained_or.objects) == 3

    any_then_and = (dataset1 | dataset2) & dataset3
    assert isinstance(any_then_and, DatasetAll)
def test_dataset_all_operations():
    """Combining a DatasetAll with another dataset via ``|`` and ``&``."""
    all_then_or = (dataset1 & dataset2) | dataset3
    assert isinstance(all_then_or, DatasetAny)

    chained_and = (dataset1 & dataset2) & dataset3
    assert isinstance(chained_and, DatasetAll)
def test_datasetbooleancondition_evaluate_iter():
    """
    Tests _DatasetBooleanCondition's evaluate and iter_datasets methods through DatasetAny and DatasetAll.
    Ensures DatasetAny evaluate returns True with any true condition, DatasetAll evaluate returns False if
    any condition is false, and both classes correctly iterate over datasets without duplication.
    """
    any_condition = DatasetAny(dataset1, dataset2)
    all_condition = DatasetAll(dataset1, dataset2)
    assert any_condition.evaluate({"s3://bucket1/data1": False, "s3://bucket2/data2": True}) is True
    assert all_condition.evaluate({"s3://bucket1/data1": True, "s3://bucket2/data2": False}) is False

    # Testing iter_datasets indirectly through the subclasses
    datasets_any = set(any_condition.iter_datasets())
    datasets_all = set(all_condition.iter_datasets())
    assert datasets_any == {("s3://bucket1/data1", dataset1), ("s3://bucket2/data2", dataset2)}
    assert datasets_all == {("s3://bucket1/data1", dataset1), ("s3://bucket2/data2", dataset2)}


@pytest.mark.parametrize(
    "inputs, scenario, expected",
    [
        # Scenarios for DatasetAny
        ((True, True, True), "any", True),
        ((True, True, False), "any", True),
        ((True, False, True), "any", True),
        ((True, False, False), "any", True),
        ((False, False, True), "any", True),
        ((False, True, False), "any", True),
        ((False, True, True), "any", True),
        ((False, False, False), "any", False),
        # Scenarios for DatasetAll
        ((True, True, True), "all", True),
        ((True, True, False), "all", False),
        ((True, False, True), "all", False),
        ((True, False, False), "all", False),
        ((False, False, True), "all", False),
        ((False, True, False), "all", False),
        ((False, True, True), "all", False),
        ((False, False, False), "all", False),
    ],
)
def test_dataset_logical_conditions_evaluation_and_serialization(inputs, scenario, expected):
    """Evaluate a three-dataset DatasetAny/DatasetAll for every status combination.

    ``inputs`` holds one boolean status per dataset, ``scenario`` selects the
    condition class ("any" or "all"), and ``expected`` is the result that must
    hold both before and after a serialization round-trip.
    """
    class_ = DatasetAny if scenario == "any" else DatasetAll
    datasets = [Dataset(uri=f"s3://abc/{i}") for i in range(123, 126)]
    condition = class_(*datasets)

    statuses = {dataset.uri: status for dataset, status in zip(datasets, inputs)}
    assert (
        condition.evaluate(statuses) == expected
    ), f"Condition evaluation failed for inputs {inputs} and scenario '{scenario}'"

    # Serialize and deserialize the condition to test persistence
    serialized = BaseSerialization.serialize(condition)
    deserialized = BaseSerialization.deserialize(serialized)
    assert deserialized.evaluate(statuses) == expected, "Serialization round-trip failed"


@pytest.mark.parametrize(
    "status_values, expected_evaluation",
    [
        ((False, True, True), False),  # DatasetAll requires all conditions to be True, but d1 is False
        ((True, True, True), True),  # All conditions are True
        ((True, False, True), True),  # d1 is True, and DatasetAny condition (d2 or d3 being True) is met
        ((True, False, False), False),  # d1 is True, but neither d2 nor d3 meet the DatasetAny condition
    ],
)
def test_nested_dataset_conditions_with_serialization(status_values, expected_evaluation):
    """Evaluate ``DatasetAll(d1, DatasetAny(d2, d3))`` before and after serialization.

    ``status_values`` supplies the statuses of d1, d2 and d3 in that order.
    """
    # Define datasets
    d1 = Dataset(uri="s3://abc/123")
    d2 = Dataset(uri="s3://abc/124")
    d3 = Dataset(uri="s3://abc/125")

    # Create a nested condition: DatasetAll with d1 and DatasetAny with d2 and d3
    nested_condition = DatasetAll(d1, DatasetAny(d2, d3))

    statuses = {
        d1.uri: status_values[0],
        d2.uri: status_values[1],
        d3.uri: status_values[2],
    }

    assert nested_condition.evaluate(statuses) == expected_evaluation, "Initial evaluation mismatch"

    serialized_condition = BaseSerialization.serialize(nested_condition)
    deserialized_condition = BaseSerialization.deserialize(serialized_condition)

    assert (
        deserialized_condition.evaluate(statuses) == expected_evaluation
    ), "Post-serialization evaluation mismatch"
@pytest.fixture
def create_test_datasets(session):
    """Fixture to create test datasets and corresponding models.

    Builds the datasets ``hello1`` and ``hello2``, stores one ``DatasetModel``
    row per dataset through ``session``, and returns the list of datasets.
    """
    datasets = [Dataset(uri=f"hello{i}") for i in range(1, 3)]
    for dataset in datasets:
        session.add(DatasetModel(uri=dataset.uri))
    # NOTE(review): commit reconstructed; presumably needed so the rows are
    # visible to later queries in the test -- confirm against the original.
    session.commit()
    return datasets
def test_dataset_trigger_setup_and_serialization(session, dag_maker, create_test_datasets):
    """A DAG scheduled on ``DatasetAny`` keeps that trigger through a serialization round-trip."""
    datasets = create_test_datasets

    # Create DAG with dataset triggers
    with dag_maker(schedule=DatasetAny(*datasets)) as dag:
        # NOTE(review): placeholder task reconstructed so the DAG has a body;
        # confirm the task_id against the original.
        EmptyOperator(task_id="hello")

    # Verify dataset triggers are set up correctly
    assert isinstance(
        dag.dataset_triggers, DatasetAny
    ), "DAG dataset triggers should be an instance of DatasetAny"

    # Serialize and deserialize DAG dataset triggers
    serialized_trigger = SerializedDAG.serialize(dag.dataset_triggers)
    deserialized_trigger = SerializedDAG.deserialize(serialized_trigger)

    # Verify serialization and deserialization integrity
    assert isinstance(
        deserialized_trigger, DatasetAny
    ), "Deserialized trigger should maintain type DatasetAny"
    assert (
        deserialized_trigger.objects == dag.dataset_triggers.objects
    ), "Deserialized trigger objects should match original"
def test_dataset_dag_run_queue_processing(session, clear_datasets, dag_maker, create_test_datasets):
    """Queue a run for every dataset of a DAG, then check the stored DAG's trigger evaluates true.

    NOTE(review): the ``dataset_id=dm.id`` argument, the ``SerializedDagModel``
    query and ``serialized_dag.data`` were reconstructed from truncated lines
    and the module's imports -- confirm against the original.
    """
    datasets = create_test_datasets
    dataset_models = session.query(DatasetModel).all()

    with dag_maker(schedule=DatasetAny(*datasets)) as dag:
        EmptyOperator(task_id="hello")

    # Add DatasetDagRunQueue entries to simulate dataset event processing
    for dm in dataset_models:
        session.add(DatasetDagRunQueue(dataset_id=dm.id, target_dag_id=dag.dag_id))
    session.commit()

    # Fetch and evaluate dataset triggers for all DAGs affected by dataset events
    records = session.scalars(select(DatasetDagRunQueue)).all()
    # Maps dag_id -> {dataset uri -> True} for every queued record.
    dag_statuses = defaultdict(lambda: defaultdict(bool))
    for record in records:
        dag_statuses[record.target_dag_id][record.dataset.uri] = True

    serialized_dags = session.execute(
        select(SerializedDagModel).where(SerializedDagModel.dag_id.in_(dag_statuses.keys()))
    ).fetchall()

    for (serialized_dag,) in serialized_dags:
        dag = SerializedDAG.deserialize(serialized_dag.data)
        for dataset_uri, status in dag_statuses[dag.dag_id].items():
            assert dag.dataset_triggers.evaluate({dataset_uri: status}), "DAG trigger evaluation failed"
def test_dag_with_complex_dataset_triggers(session, dag_maker):
    """A nested ``DatasetAny(d1, DatasetAll(d2, d1))`` schedule keeps its shape when serialized."""
    # Create Dataset instances
    d1 = Dataset(uri="hello1")
    d2 = Dataset(uri="hello2")

    # Create and add DatasetModel instances to the session
    dm1 = DatasetModel(uri=d1.uri)
    dm2 = DatasetModel(uri=d2.uri)
    session.add_all([dm1, dm2])

    # Setup a DAG with complex dataset triggers (DatasetAny with DatasetAll)
    with dag_maker(schedule=DatasetAny(d1, DatasetAll(d2, d1))) as dag:
        # NOTE(review): placeholder task reconstructed so the DAG has a body;
        # confirm the task_id against the original.
        EmptyOperator(task_id="hello")

    assert isinstance(
        dag.dataset_triggers, DatasetAny
    ), "DAG's dataset trigger should be an instance of DatasetAny"
    assert any(
        isinstance(trigger, DatasetAll) for trigger in dag.dataset_triggers.objects
    ), "DAG's dataset trigger should include DatasetAll"

    # Round-trip just the trigger expression.
    serialized_triggers = SerializedDAG.serialize(dag.dataset_triggers)
    deserialized_triggers = SerializedDAG.deserialize(serialized_triggers)

    assert isinstance(
        deserialized_triggers, DatasetAny
    ), "Deserialized triggers should be an instance of DatasetAny"
    assert any(
        isinstance(trigger, DatasetAll) for trigger in deserialized_triggers.objects
    ), "Deserialized triggers should include DatasetAll"

    # The whole-DAG serialization must carry the triggers as a dict entry.
    serialized_dag_dict = SerializedDAG.to_dict(dag)["dag"]

    assert "dataset_triggers" in serialized_dag_dict, "Serialized DAG should contain 'dataset_triggers'"
    assert isinstance(
        serialized_dag_dict["dataset_triggers"], dict
    ), "Serialized 'dataset_triggers' should be a dict"
def datasets_equal(d1: BaseDataset, d2: BaseDataset) -> bool:
    """Return True when two dataset expressions have the same structure.

    Objects of different concrete types are never equal. Two ``Dataset``
    objects are equal when their URIs match. Two ``DatasetAny`` or two
    ``DatasetAll`` conditions are equal when they hold the same number of
    objects and those objects are pairwise equal, in order. Anything else
    compares unequal.
    """
    # Exact type identity: a DatasetAny must never match a DatasetAll.
    if type(d1) is not type(d2):
        return False

    # The second isinstance in each test is implied by the type check above;
    # it is kept so static type checkers narrow d2 as well.
    if isinstance(d1, Dataset) and isinstance(d2, Dataset):
        return d1.uri == d2.uri

    if isinstance(d1, (DatasetAny, DatasetAll)) and isinstance(d2, (DatasetAny, DatasetAll)):
        # Compare the children pairwise, recursing into nested conditions.
        return len(d1.objects) == len(d2.objects) and all(
            datasets_equal(obj1, obj2) for obj1, obj2 in zip(d1.objects, d2.objects)
        )

    return False
# Module-level datasets shared by the expression tests in this file.
dataset1 = Dataset(uri="s3://bucket1/data1")
dataset2 = Dataset(uri="s3://bucket2/data2")
dataset3 = Dataset(uri="s3://bucket3/data3")
dataset4 = Dataset(uri="s3://bucket4/data4")
dataset5 = Dataset(uri="s3://bucket5/data5")

# Each case pairs a zero-argument callable building an expression with the
# condition tree that expression is expected to produce.
test_cases = [
    (lambda: dataset1, dataset1),
    (lambda: dataset1 & dataset2, DatasetAll(dataset1, dataset2)),
    (lambda: dataset1 | dataset2, DatasetAny(dataset1, dataset2)),
    (lambda: dataset1 | (dataset2 & dataset3), DatasetAny(dataset1, DatasetAll(dataset2, dataset3))),
    # Same expected tree without parentheses: ``&`` binds tighter than ``|``.
    (lambda: dataset1 | dataset2 & dataset3, DatasetAny(dataset1, DatasetAll(dataset2, dataset3))),
    (
        lambda: ((dataset1 & dataset2) | dataset3) & (dataset4 | dataset5),
        DatasetAll(DatasetAny(DatasetAll(dataset1, dataset2), dataset3), DatasetAny(dataset4, dataset5)),
    ),
    (lambda: dataset1 & dataset2 | dataset3, DatasetAny(DatasetAll(dataset1, dataset2), dataset3)),
    (
        lambda: (dataset1 | dataset2) & (dataset3 | dataset4),
        DatasetAll(DatasetAny(dataset1, dataset2), DatasetAny(dataset3, dataset4)),
    ),
    (
        lambda: (dataset1 & dataset2) | (dataset3 & (dataset4 | dataset5)),
        DatasetAny(DatasetAll(dataset1, dataset2), DatasetAll(dataset3, DatasetAny(dataset4, dataset5))),
    ),
    (
        # Expected tree: the right-hand DatasetAll stays nested as one member.
        lambda: (dataset1 & dataset2) & (dataset3 & dataset4),
        DatasetAll(dataset1, dataset2, DatasetAll(dataset3, dataset4)),
    ),
    (lambda: dataset1 | dataset2 | dataset3, DatasetAny(dataset1, dataset2, dataset3)),
    (lambda: dataset1 & dataset2 & dataset3, DatasetAll(dataset1, dataset2, dataset3)),
    (
        # NOTE(review): identical to the sixth case above; kept as found.
        lambda: ((dataset1 & dataset2) | dataset3) & (dataset4 | dataset5),
        DatasetAll(DatasetAny(DatasetAll(dataset1, dataset2), dataset3), DatasetAny(dataset4, dataset5)),
    ),
]
@pytest.mark.parametrize("expression, expected", test_cases)
def test_evaluate_datasets_expression(expression, expected):
    """Each expression in ``test_cases`` must build the expected condition tree."""
    expr = expression()
    assert datasets_equal(expr, expected)


@pytest.mark.parametrize(
    "expression, error",
    [
        (
            lambda: dataset1 & 1,  # type: ignore[operator]
            "unsupported operand type(s) for &: 'Dataset' and 'int'",
        ),
        (
            lambda: dataset1 | 1,  # type: ignore[operator]
            "unsupported operand type(s) for |: 'Dataset' and 'int'",
        ),
        (
            lambda: DatasetAll(1, dataset1),  # type: ignore[arg-type]
            "expect dataset expressions in condition",
        ),
        (
            lambda: DatasetAny(1, dataset1),  # type: ignore[arg-type]
            "expect dataset expressions in condition",
        ),
    ],
)
def test_datasets_expression_error(expression: Callable[[], None], error: str) -> None:
    """Combining a dataset with a non-dataset raises ``TypeError`` with the exact message."""
    with pytest.raises(TypeError) as info:
        # Each case is wrapped in a lambda so the failing expression is
        # evaluated here, inside the raises block, not at collection time.
        expression()
    assert str(info.value) == error