diff --git a/poetry.lock b/poetry.lock index f695794a1..50f1c7744 100644 --- a/poetry.lock +++ b/poetry.lock @@ -7,7 +7,7 @@ description = "Async client for aws services using botocore and aiohttp" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "aiobotocore-2.26.0-py3-none-any.whl", hash = "sha256:a793db51c07930513b74ea7a95bd79aaa42f545bdb0f011779646eafa216abec"}, {file = "aiobotocore-2.26.0.tar.gz", hash = "sha256:50567feaf8dfe2b653570b4491f5bc8c6e7fb9622479d66442462c021db4fadc"}, @@ -34,7 +34,7 @@ description = "Happy Eyeballs for asyncio" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8"}, {file = "aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558"}, @@ -47,7 +47,7 @@ description = "Async http client/server framework (asyncio)" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "aiohttp-3.13.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d5a372fd5afd301b3a89582817fdcdb6c34124787c70dbcc616f259013e7eef7"}, {file = "aiohttp-3.13.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:147e422fd1223005c22b4fe080f5d93ced44460f5f9c105406b753612b587821"}, @@ -191,7 +191,7 @@ description = "itertools and builtins for AsyncIO and mixed iterables" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "aioitertools-0.13.0-py3-none-any.whl", hash = "sha256:0be0292b856f08dfac90e31f4739432f4cb6d7520ab9eb73e143f4f2fa5259be"}, {file = "aioitertools-0.13.0.tar.gz", hash = "sha256:620bd241acc0bbb9ec819f1ab215866871b4bbd1f73836a55f799200ee86950c"}, @@ -207,7 +207,7 @@ description = "aiosignal: a list of registered asynchronous callbacks" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e"}, {file = "aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7"}, @@ -267,7 +267,7 @@ description = "Timeout context manager for asyncio programs" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "(extra == \"all\" or extra == \"s3\") and python_version < \"3.11\"" +markers = "(extra == \"s3\" or extra == \"all\") and python_version < \"3.11\"" files = [ {file = "async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c"}, {file = "async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"}, @@ -307,7 +307,7 @@ description = "Low-level, data-driven core of boto 3." optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "botocore-1.41.5-py3-none-any.whl", hash = "sha256:3fef7fcda30c82c27202d232cfdbd6782cb27f20f8e7e21b20606483e66ee73a"}, {file = "botocore-1.41.5.tar.gz", hash = "sha256:0367622b811597d183bfcaab4a350f0d3ede712031ce792ef183cabdee80d3bf"}, @@ -699,7 +699,7 @@ description = "A list-like structure which implements collections.abc.MutableSeq optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "frozenlist-1.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b37f6d31b3dcea7deb5e9696e529a6aa4a898adc33db82da12e4c60a7c4d2011"}, {file = "frozenlist-1.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef2b7b394f208233e471abc541cc6991f907ffd47dc72584acee3147899d6565"}, @@ -840,7 +840,7 @@ description = "File-system specification" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "python_version == \"3.9\" and (extra == \"all\" or extra == \"s3\")" +markers = "python_version == \"3.9\" and (extra == \"s3\" or extra == \"all\")" files = [ {file = "fsspec-2025.10.0-py3-none-any.whl", hash = "sha256:7c7712353ae7d875407f97715f0e1ffcc21e33d5b24556cb1e090ae9409ec61d"}, {file = "fsspec-2025.10.0.tar.gz", hash = "sha256:b6789427626f068f9a83ca4e8a3cc050850b6c0f71f99ddb4f542b8266a26a59"}, @@ -881,7 +881,7 @@ description = "File-system specification" optional = true python-versions = ">=3.10" groups = ["main"] -markers = "python_version >= \"3.10\" and (extra == \"all\" or extra == \"s3\")" +markers = "python_version >= \"3.10\" and (extra == \"s3\" or extra == \"all\")" files = [ {file = "fsspec-2025.12.0-py3-none-any.whl", hash = "sha256:8bf1fe301b7d8acfa6e8571e3b1c3d158f909666642431cc78a1b7b4dbc5ec5b"}, {file = "fsspec-2025.12.0.tar.gz", hash = "sha256:c505de011584597b1060ff778bb664c1bc022e87921b0e4f10cc9c44f9635973"}, @@ -1118,7 +1118,7 @@ description = "JSON Matching Expressions" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, @@ -1668,7 +1668,7 @@ description = "multidict implementation" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "multidict-6.7.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:9f474ad5acda359c8758c8accc22032c6abe6dc87a8be2440d097785e27a9349"}, {file = "multidict-6.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b7a9db5a870f780220e931d0002bbfd88fb53aceb6293251e2c839415c1b20e"}, @@ -2229,7 +2229,7 @@ description = "Accelerated property cache" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "propcache-0.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c2d1fa3201efaf55d730400d945b5b3ab6e672e100ba0f9a409d950ab25d7db"}, {file = "propcache-0.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1eb2994229cc8ce7fe9b3db88f5465f5fd8651672840b2e426b88cdb1a30aac8"}, @@ -2945,7 +2945,7 @@ description = "Convenient Filesystem interface over S3" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "python_version == \"3.9\" and (extra == \"all\" or extra == \"s3\")" +markers = "python_version == \"3.9\" and (extra == \"s3\" or extra == \"all\")" files = [ {file = "s3fs-2025.10.0-py3-none-any.whl", hash = "sha256:da7ef25efc1541f5fca8e1116361e49ea1081f83f4e8001fbd77347c625da28a"}, {file = "s3fs-2025.10.0.tar.gz", hash = "sha256:e8be6cddc77aceea1681ece0f472c3a7f8ef71a0d2acddb1cc92bb6afa3e9e4f"}, @@ -2967,7 +2967,7 @@ description = "Convenient Filesystem interface over S3" optional = true python-versions = ">=3.10" groups = ["main"] -markers = "python_version >= \"3.10\" and (extra == \"all\" or extra == \"s3\")" +markers = "python_version >= \"3.10\" and (extra == \"s3\" or extra == \"all\")" files = [ {file = "s3fs-2025.12.0-py3-none-any.whl", hash = "sha256:89d51e0744256baad7ae5410304a368ca195affd93a07795bc8ba9c00c9effbb"}, {file = "s3fs-2025.12.0.tar.gz", hash = "sha256:8612885105ce14d609c5b807553f9f9956b45541576a17ff337d9435ed3eb01f"}, @@ -3298,6 +3298,22 @@ files = [ [package.dependencies] referencing = "*" +[[package]] +name = "types-networkx" +version = "3.6.1.20260303" +description = "Typing stubs for networkx" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +markers = "python_version >= \"3.10\"" +files = [ + {file = "types_networkx-3.6.1.20260303-py3-none-any.whl", hash = "sha256:754c7c7bcaab3c317b0b86441240c0a5bd0d2f419aba80a88e9718248a5c89af"}, + {file = "types_networkx-3.6.1.20260303.tar.gz", hash = "sha256:8248aa6fcadc08bd7992af6e412bfc5cfa043bda5ce7ab407fa591c808ce8557"}, +] + +[package.dependencies] +numpy = ">=1.20" + [[package]] name = "types-pytz" version = "2025.2.0.20251108" @@ -3346,7 +3362,7 @@ files = [ {file = "urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e"}, {file = "urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32"}, ] -markers = {main = "python_version == \"3.9\" and (extra == \"all\" or extra == \"s3\")", docs = "python_version == \"3.9\""} +markers = {main = "python_version == \"3.9\" and (extra == \"s3\" or extra == \"all\")", docs = "python_version == \"3.9\""} [package.extras] brotli = ["brotli (==1.0.9) ; os_name != \"nt\" and python_version < \"3\" and platform_python_implementation == \"CPython\"", "brotli (>=1.0.9) ; python_version >= \"3\" and platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; (os_name != \"nt\" or python_version >= \"3\") and platform_python_implementation != \"CPython\"", "brotlipy (>=0.6.0) ; os_name == \"nt\" and python_version < \"3\""] @@ -3364,7 +3380,7 @@ files = [ {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] -markers = {main = "python_version >= \"3.10\" and (extra == \"all\" or extra == \"s3\")"} +markers = {main = "python_version >= \"3.10\" and (extra == \"s3\" or extra == \"all\")"} [package.extras] brotli = ["brotli (>=1.2.0) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=1.2.0.0) ; platform_python_implementation != \"CPython\""] @@ -3379,7 +3395,7 @@ description = "Module for decorators, wrappers and monkey patching." optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "wrapt-1.17.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88bbae4d40d5a46142e70d58bf664a89b6b4befaea7b2ecc14e03cedb8e06c04"}, {file = "wrapt-1.17.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6b13af258d6a9ad602d57d889f83b9d5543acd471eee12eb51f5b01f8eb1bc2"}, @@ -3486,7 +3502,7 @@ description = "Yet another URL library" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "yarl-1.22.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c7bd6683587567e5a49ee6e336e0612bec8329be1b7d4c8af5687dcdeb67ee1e"}, {file = "yarl-1.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5cdac20da754f3a723cceea5b3448e1a2074866406adeb4ef35b469d089adb8f"}, @@ -3653,4 +3669,4 @@ s3 = ["s3fs"] [metadata] lock-version = "2.1" python-versions = ">=3.9,<4.0" -content-hash = "44f127ea06ae4ebdcd56b245a4f9886dd3cedd5ffd64a38a110e4acc8fd4cc19" +content-hash = "ee401a8363109109d158a23c5dc04aad4198208c7287710111f40756e21e33c6" diff --git a/pyproject.toml b/pyproject.toml index a75fc10d7..d5c2855f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ pandas-stubs = ">=2.2.2,<3.0" pyarrow-stubs = ">=19.0,<21.0" ruff = ">=0.14,<1.0.0" types-jsonschema = ">=4.25.1,<5.0" +types-networkx = {markers = "python_version >= \"3.10\"", version = "^3.6.1.20260303"} [tool.poetry.group.docs.dependencies] sphinx = ">=7.4.7,<8.0" diff --git a/src/vtlengine/API/__init__.py b/src/vtlengine/API/__init__.py index 33dce44f4..3d1a4679d 100644 --- a/src/vtlengine/API/__init__.py +++ b/src/vtlengine/API/__init__.py @@ -88,7 +88,8 @@ def _extract_input_datasets(script: Union[str, TransformationScheme, Path]) -> L raise TypeError("Unsupported script type.") ast = create_ast(vtl_script) - dag_inputs = DAGAnalyzer.ds_structure(ast).global_inputs + ds = DAGAnalyzer.ds_structure(ast) + dag_inputs = ds.global_input_datasets + ds.global_input_dataset_or_scalar return dag_inputs diff --git a/src/vtlengine/AST/DAG/__init__.py b/src/vtlengine/AST/DAG/__init__.py index aed9d5145..37c7f5e56 100644 --- a/src/vtlengine/AST/DAG/__init__.py +++ b/src/vtlengine/AST/DAG/__init__.py @@ -24,23 +24,72 @@ DefIdentifier, DPRuleset, DPValidation, + EvalOp, HROperation, HRuleset, Identifier, JoinOp, + MulOp, Operator, ParamOp, PersistentAssignment, RegularAggregation, Start, + TimeAggregation, UDOCall, + UnaryOp, + Validation, VarID, ) from vtlengine.AST.ASTTemplate import ASTTemplate -from vtlengine.AST.DAG._models import DatasetSchedule, StatementDeps -from vtlengine.AST.Grammar.tokens import AS, DROP, KEEP, MEMBERSHIP, RENAME, TO +from vtlengine.AST.DAG._models import Schedule, StatementDeps +from vtlengine.AST.Grammar.tokens import ( + AS, + BETWEEN, + DROP, + EXISTS_IN, + FILL_TIME_SERIES, + FLOW_TO_STOCK, + INSTR, + INTERSECT, + KEEP, + MEMBERSHIP, + PERIOD_INDICATOR, + RANDOM, + RENAME, + REPLACE, + ROUND, + SETDIFF, + STOCK_TO_FLOW, + SUBSTR, + SYMDIFF, + TIMESHIFT, + TO, + TRUNC, + UNION, +) +from vtlengine.DataTypes import COMP_NAME_MAPPING from vtlengine.Exceptions import SemanticError -from vtlengine.Model import Component +from vtlengine.Model import Component, Dataset + +# Operators that only accept datasets as input (never scalars). +# Grouped by AST node type for clarity. +DATASET_ONLY_MULOP = frozenset({UNION, INTERSECT, SETDIFF, SYMDIFF, EXISTS_IN}) +DATASET_ONLY_UNARYOP = frozenset({FLOW_TO_STOCK, STOCK_TO_FLOW, PERIOD_INDICATOR}) +DATASET_ONLY_BINOP = frozenset({MEMBERSHIP, TIMESHIFT}) + +# BinOp operators where the right operand must always be a scalar. +SCALAR_RIGHT_BINOP = frozenset({RANDOM}) + +# ParamOp operators where all params (not children) must be scalars. +SCALAR_PARAMS_PARAMOP = frozenset({ROUND, TRUNC, SUBSTR, REPLACE, INSTR}) + +# MulOp operators where children[1:] must be scalars (operand is children[0]). +SCALAR_TAIL_MULOP = frozenset({BETWEEN}) + +# Reserved component names generated by the engine (e.g., count() → int_var, +# check() → bool_var). These are never external inputs. +RESERVED_COMPONENT_NAMES = frozenset(COMP_NAME_MAPPING.values()) @dataclass @@ -60,16 +109,21 @@ class DAGAnalyzer(ASTTemplate): # Per-statement accumulator (reset between statements) current_deps: StatementDeps = field(default_factory=StatementDeps) + _current_has_dataset_op: bool = False # Cross-statement unknown variable tracking unknown_variables: Set[str] = field(default_factory=set) + # Outputs that were consumed via unknown_variables (RegularAggregation context) + _resolved_from_unknown: Set[str] = field(default_factory=set) + # UDO names that have at least one dataset-typed parameter + _udos_with_dataset_params: Set[str] = field(default_factory=set) @classmethod - def ds_structure(cls, ast: AST) -> DatasetSchedule: + def ds_structure(cls, ast: AST) -> Schedule: dag = cls() dag.visit(ast) return dag._ds_usage_analysis() - def _ds_usage_analysis(self) -> DatasetSchedule: + def _ds_usage_analysis(self) -> Schedule: """Analyze dataset dependencies to build insertion/deletion schedules.""" deletion: Dict[int, List[str]] = defaultdict(list) insertion: Dict[int, List[str]] = defaultdict(list) @@ -105,13 +159,123 @@ def _ds_usage_analysis(self) -> DatasetSchedule: deletion[last_consumer.get(element, key)].append(element) insertion[key].append(element) - return DatasetSchedule( + classification = self._classify_global_inputs(all_outputs, global_inputs, global_set) + + return Schedule( insertion=dict(insertion), deletion=dict(deletion), - global_inputs=global_inputs, + global_inputs=classification["global_inputs"], + global_input_datasets=classification["global_input_datasets"], + global_input_scalars=classification["global_input_scalars"], + global_input_dataset_or_scalar=classification["global_input_dataset_or_scalar"], + global_input_component_or_scalar=classification["global_input_component_or_scalar"], persistent=persistent_datasets, + all_outputs=sorted(all_outputs), ) + def _classify_global_inputs( + self, + all_outputs: Set[str], + global_inputs: List[str], + global_set: Set[str], + ) -> Dict[str, List[str]]: + """Classify global inputs into datasets, scalars, and ambiguous categories.""" + scalar_outputs, comp_or_scalar = self._compute_scalar_outputs(all_outputs) + + # Identify definite dataset and scalar inputs + definite_dataset_inputs: Set[str] = set() + definite_scalar_inputs: Set[str] = set() + for statement in self.dependencies.values(): + # Collect explicitly typed scalar inputs (e.g., UDO scalar params) + for inp in statement.scalar_inputs: + definite_scalar_inputs.add(inp) + if statement.has_dataset_op: + for inp in statement.dataset_inputs: + definite_dataset_inputs.add(inp) + + # Include component_or_scalar candidates in global_inputs + for name in comp_or_scalar: + if name not in global_set and name not in all_outputs: + global_set.add(name) + global_inputs.append(name) + + # Classify into four categories + result: Dict[str, List[str]] = { + "global_inputs": global_inputs, + "global_input_datasets": [], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [], + } + + for name in global_inputs: + if name in comp_or_scalar: + result["global_input_component_or_scalar"].append(name) + elif name in definite_dataset_inputs and name in definite_scalar_inputs: + # Used as dataset in one context and scalar in another → ambiguous + result["global_input_dataset_or_scalar"].append(name) + elif name in definite_dataset_inputs: + result["global_input_datasets"].append(name) + elif name in definite_scalar_inputs or self._feeds_only_scalar_chains( + name, scalar_outputs + ): + result["global_input_scalars"].append(name) + else: + result["global_input_dataset_or_scalar"].append(name) + + return result + + def _compute_scalar_outputs(self, all_outputs: Set[str]) -> tuple[Set[str], Set[str]]: + """Compute scalar outputs via propagation and component/scalar candidates.""" + scalar_outputs: Set[str] = set() + for statement in self.dependencies.values(): + reference = statement.outputs + statement.persistent + if not reference: + continue + ds_name = reference[0] + is_constant_assignment = not statement.has_dataset_op and not statement.inputs + if is_constant_assignment or ds_name in self._resolved_from_unknown: + scalar_outputs.add(ds_name) + + comp_or_scalar: Set[str] = set() + for statement in self.dependencies.values(): + for uv in statement.unknown_variables: + if uv not in all_outputs: + comp_or_scalar.add(uv) + + # Propagate: statements with no dataset ops where all inputs are known scalars + changed = True + while changed: + changed = False + for statement in self.dependencies.values(): + reference = statement.outputs + statement.persistent + if not reference: + continue + ds_name = reference[0] + if ds_name in scalar_outputs or statement.has_dataset_op: + continue + if statement.inputs and all( + inp in scalar_outputs or inp in comp_or_scalar for inp in statement.inputs + ): + scalar_outputs.add(ds_name) + changed = True + + return scalar_outputs, comp_or_scalar + + def _feeds_only_scalar_chains(self, name: str, scalar_outputs: Set[str]) -> bool: + """Check if a global input feeds only into scalar-output statements.""" + has_consumers = False + for stmt in self.dependencies.values(): + if name not in stmt.inputs: + continue + has_consumers = True + if stmt.has_dataset_op: + return False + reference = stmt.outputs + stmt.persistent + if reference and reference[0] not in scalar_outputs: + return False + return has_consumers + @classmethod def create_dag(cls, ast: Start) -> "DAGAnalyzer": dag = cls() @@ -212,6 +376,9 @@ def statement_structure(self) -> StatementDeps: outputs=list(self.current_deps.outputs), persistent=list(self.current_deps.persistent), unknown_variables=list(self.current_deps.unknown_variables), + has_dataset_op=self._current_has_dataset_op, + dataset_inputs=list(self.current_deps.dataset_inputs), + scalar_inputs=list(self.current_deps.scalar_inputs), ) self.unknown_variables.update(self.current_deps.unknown_variables) return result @@ -233,10 +400,16 @@ def visit_Start(self, node: Start) -> None: self.visit(child) """ udos = {} + udos_with_ds: Set[str] = set() for ast_element in node.children: if isinstance(ast_element, Operator): udos[ast_element.op] = ast_element + for p in ast_element.parameters: + if isinstance(p.type_, Dataset): + udos_with_ds.add(ast_element.op) + break self.udos = udos + self._udos_with_dataset_params = udos_with_ds for child in node.children: if isinstance(child, (Assignment, PersistentAssignment)): self.is_first_assignment = True @@ -246,12 +419,14 @@ def visit_Start(self, node: Start) -> None: self.number_of_statements += 1 self.alias = set() self.current_deps = StatementDeps() + self._current_has_dataset_op = False aux = copy.copy(self.unknown_variables) for variable in aux: for _number_of_statement, dependency in self.dependencies.items(): if variable in dependency.outputs: self.unknown_variables.discard(variable) + self._resolved_from_unknown.add(variable) for _ns2, dep2 in self.dependencies.items(): if variable in dep2.unknown_variables: dep2.unknown_variables.remove(variable) @@ -272,6 +447,9 @@ def visit_PersistentAssignment(self, node: PersistentAssignment) -> None: self.visit(node.right) def visit_RegularAggregation(self, node: RegularAggregation) -> None: + self._current_has_dataset_op = True + if isinstance(node.dataset, VarID) and node.dataset.value not in self.alias: + self.current_deps.dataset_inputs.append(node.dataset.value) self.visit(node.dataset) if node.op in [KEEP, DROP, RENAME]: return @@ -281,11 +459,19 @@ def visit_RegularAggregation(self, node: RegularAggregation) -> None: self.is_from_regular_aggregation = False def visit_BinOp(self, node: BinOp) -> None: - if node.op == MEMBERSHIP: + if node.op in DATASET_ONLY_BINOP: + self._current_has_dataset_op = True + if isinstance(node.left, VarID) and node.left.value not in self.alias: + self.current_deps.dataset_inputs.append(node.left.value) self.is_dataset = True self.visit(node.left) self.is_dataset = False self.visit(node.right) + elif node.op in SCALAR_RIGHT_BINOP: + self.visit(node.left) + if isinstance(node.right, VarID) and node.right.value not in self.alias: + self.current_deps.scalar_inputs.append(node.right.value) + self.visit(node.right) elif node.op == AS or node.op == TO: self.visit(node.left) self.alias.add(node.right.value) @@ -293,6 +479,27 @@ def visit_BinOp(self, node: BinOp) -> None: self.visit(node.left) self.visit(node.right) + def visit_MulOp(self, node: MulOp) -> None: + if node.op in DATASET_ONLY_MULOP: + self._current_has_dataset_op = True + for child in node.children: + if isinstance(child, VarID) and child.value not in self.alias: + self.current_deps.dataset_inputs.append(child.value) + elif node.op in SCALAR_TAIL_MULOP: + # First child is the operand (dual), rest are scalar-only params. + for child in node.children[1:]: + if isinstance(child, VarID) and child.value not in self.alias: + self.current_deps.scalar_inputs.append(child.value) + for child in node.children: + self.visit(child) + + def visit_UnaryOp(self, node: UnaryOp) -> None: + if node.op in DATASET_ONLY_UNARYOP: + self._current_has_dataset_op = True + if isinstance(node.operand, VarID) and node.operand.value not in self.alias: + self.current_deps.dataset_inputs.append(node.operand.value) + self.visit(node.operand) + def visit_VarID(self, node: VarID) -> None: if ( not self.is_from_regular_aggregation or self.is_dataset @@ -303,6 +510,7 @@ def visit_VarID(self, node: VarID) -> None: self.is_from_regular_aggregation and node.value not in self.alias and not self.is_dataset + and node.value not in RESERVED_COMPONENT_NAMES and node.value not in self.current_deps.unknown_variables ): self.current_deps.unknown_variables.append(node.value) @@ -313,29 +521,62 @@ def visit_Identifier(self, node: Identifier) -> None: and node.value not in self.alias and node.value not in self.current_deps.inputs ): + self._current_has_dataset_op = True self.current_deps.inputs.append(node.value) + self.current_deps.dataset_inputs.append(node.value) def visit_ParamOp(self, node: ParamOp) -> None: if self.udos and node.op in self.udos: + if node.op in self._udos_with_dataset_params: + self._current_has_dataset_op = True do_ast: Operator = self.udos[node.op] - - for arg in node.params: - index_arg = node.params.index(arg) - if do_ast.parameters[index_arg].type_.kind == "DataSet": - self.visit(arg) + for i, arg in enumerate(node.params): + if isinstance(arg, Constant): + continue + param_type = do_ast.parameters[i].type_ + if type(param_type) is not Component: + is_ds = isinstance(param_type, Dataset) + if is_ds and isinstance(arg, VarID): + self.current_deps.dataset_inputs.append(arg.value) + elif not is_ds and isinstance(arg, VarID): + self.current_deps.scalar_inputs.append(arg.value) + self.visit(arg) else: + if node.op == FILL_TIME_SERIES: + self._current_has_dataset_op = True + for child in node.children: + if isinstance(child, VarID) and child.value not in self.alias: + self.current_deps.dataset_inputs.append(child.value) + elif node.op in SCALAR_PARAMS_PARAMOP: + for param in node.params: + if isinstance(param, VarID) and param.value not in self.alias: + self.current_deps.scalar_inputs.append(param.value) super(DAGAnalyzer, self).visit_ParamOp(node) def visit_Aggregation(self, node: Aggregation) -> None: + self._current_has_dataset_op = True if node.operand is not None: + if isinstance(node.operand, VarID) and node.operand.value not in self.alias: + self.current_deps.dataset_inputs.append(node.operand.value) self.visit(node.operand) def visit_Analytic(self, node: Analytic) -> None: - if node.operand is not None: + self._current_has_dataset_op = True + # Inside RegularAggregation (calc/filter/etc.), analytic operands are always + # component references — they cannot be external scalars, so skip them. + if node.operand is not None and not self.is_from_regular_aggregation: self.visit(node.operand) def visit_JoinOp(self, node: JoinOp) -> None: + self._current_has_dataset_op = True + # Join clauses contain BinOp(AS) nodes; dataset aliases are handled via visit_BinOp. + # Direct VarID children in clauses are dataset references. for clause in node.clauses: + if isinstance(clause, BinOp) and clause.op == AS: + if isinstance(clause.left, VarID) and clause.left.value not in self.alias: + self.current_deps.dataset_inputs.append(clause.left.value) + elif isinstance(clause, VarID) and clause.value not in self.alias: + self.current_deps.dataset_inputs.append(clause.value) self.visit(clause) def visit_UDOCall(self, node: UDOCall) -> None: @@ -343,19 +584,52 @@ def visit_UDOCall(self, node: UDOCall) -> None: if not node_args: super().visit_UDOCall(node) else: - node_sig = [type(p.type_) for p in node_args.parameters] - for sig, param in zip(node_sig, node.params): - if not isinstance(param, Constant) and sig is not Component: + if node.op in self._udos_with_dataset_params: + self._current_has_dataset_op = True + for p, param in zip(node_args.parameters, node.params): + if isinstance(param, Constant): + continue + if type(p.type_) is not Component: + is_ds = isinstance(p.type_, Dataset) + if is_ds and isinstance(param, VarID): + self.current_deps.dataset_inputs.append(param.value) + elif not is_ds and isinstance(param, VarID): + self.current_deps.scalar_inputs.append(param.value) self.visit(param) def visit_HROperation(self, node: HROperation) -> None: """Visit HROperation node for dependency analysis.""" + self._current_has_dataset_op = True + if isinstance(node.dataset, VarID) and node.dataset.value not in self.alias: + self.current_deps.dataset_inputs.append(node.dataset.value) self.visit(node.dataset) def visit_DPValidation(self, node: DPValidation) -> None: """Visit DPValidation node for dependency analysis.""" + self._current_has_dataset_op = True + if isinstance(node.dataset, VarID) and node.dataset.value not in self.alias: + self.current_deps.dataset_inputs.append(node.dataset.value) self.visit(node.dataset) + def visit_TimeAggregation(self, node: TimeAggregation) -> None: + if node.operand is not None: + self.visit(node.operand) + + def visit_Validation(self, node: Validation) -> None: + # Don't force has_dataset_op here — let the inner expression determine it. + # Dataset-only operators (exists_in, membership, etc.) set it themselves. + self.visit(node.validation) + if node.imbalance is not None: + self.visit(node.imbalance) + + def visit_EvalOp(self, node: EvalOp) -> None: + """Eval operands are always datasets (external SQL routine inputs).""" + self._current_has_dataset_op = True + for operand in node.operands: + if isinstance(operand, VarID) and operand.value not in self.alias: + self.current_deps.dataset_inputs.append(operand.value) + self.visit(operand) + class HRDAGAnalyzer(DAGAnalyzer): def visit_HRuleset(self, node: HRuleset) -> None: diff --git a/src/vtlengine/AST/DAG/_models.py b/src/vtlengine/AST/DAG/_models.py index 359edc49e..bd58f00d3 100644 --- a/src/vtlengine/AST/DAG/_models.py +++ b/src/vtlengine/AST/DAG/_models.py @@ -4,22 +4,70 @@ @dataclass class StatementDeps: - """Per-statement dependency info tracked during AST visiting.""" + """Per-statement dependency info tracked during AST visiting. + + Attributes: + inputs: Variables consumed by this statement (excluding its own outputs). + outputs: Variables produced by this statement via ``:=`` assignment. + persistent: Variables produced by this statement via ``<-`` assignment. + unknown_variables: Variables inside RegularAggregation context that could + be either components of the dataset or external scalars. + has_dataset_op: Whether this statement involves a dataset operation + (RegularAggregation, JoinOp, Aggregation, Analytic, MEMBERSHIP, + UDO with dataset params, etc.). + dataset_inputs: Subset of inputs that are definitively datasets + (e.g., UDO params typed as dataset). Empty means all inputs in a + ``has_dataset_op`` statement are considered dataset inputs. + scalar_inputs: Subset of inputs that are definitively scalars + (e.g., UDO params typed as a scalar type like number, string, etc.). + """ inputs: List[str] = field(default_factory=list) outputs: List[str] = field(default_factory=list) persistent: List[str] = field(default_factory=list) unknown_variables: List[str] = field(default_factory=list) + has_dataset_op: bool = False + dataset_inputs: List[str] = field(default_factory=list) + scalar_inputs: List[str] = field(default_factory=list) @dataclass -class DatasetSchedule: +class Schedule: """Typed result of DAG dataset usage analysis. - Tracks when datasets should be loaded/unloaded for memory-efficient execution. + Tracks when datasets should be loaded/unloaded for memory-efficient execution, + and classifies global inputs into four categories based on AST context. + + Attributes: + insertion: Statement index to list of datasets to load at that point + (first use). + deletion: Statement index to list of datasets to unload at that point + (last use). + global_inputs: All external dependencies not produced by the script. + Union of the four ``global_input_*`` categories below (no duplicates). + global_input_datasets: Definite datasets — used in dataset operations + (RegularAggregation operand, Identifier with kind="DatasetID", + UDO dataset params, MEMBERSHIP left operand, JoinOp, etc.). + global_input_scalars: Definite scalars — feed exclusively into scalar + chains propagated from constant assignments with no dataset ops. + global_input_dataset_or_scalar: Ambiguous at top level (e.g., + ``DS_r <- X + 2`` where X could be a dataset or scalar). + The caller may provide either. + global_input_component_or_scalar: Ambiguous inside RegularAggregation + (e.g., ``DS_1[calc Me_2 := Me_1 + X]`` where X could be a component + of DS_1 or an external scalar). Semantic error 1-1-6-11 is raised + at runtime if it collides with a component name. + persistent: Outputs written with ``<-`` (persistent assignment). + all_outputs: All variables produced by the script (both ``:=`` and + ``<-`` assignments). """ insertion: Dict[int, List[str]] = field(default_factory=dict) deletion: Dict[int, List[str]] = field(default_factory=dict) global_inputs: List[str] = field(default_factory=list) + global_input_datasets: List[str] = field(default_factory=list) + global_input_scalars: List[str] = field(default_factory=list) + global_input_dataset_or_scalar: List[str] = field(default_factory=list) + global_input_component_or_scalar: List[str] = field(default_factory=list) persistent: List[str] = field(default_factory=list) + all_outputs: List[str] = field(default_factory=list) diff --git a/src/vtlengine/Interpreter/__init__.py b/src/vtlengine/Interpreter/__init__.py index eda8789bc..7381d2a98 100644 --- a/src/vtlengine/Interpreter/__init__.py +++ b/src/vtlengine/Interpreter/__init__.py @@ -11,7 +11,7 @@ import vtlengine.Operators as Operators from vtlengine.AST.ASTTemplate import ASTTemplate from vtlengine.AST.DAG import HRDAGAnalyzer -from vtlengine.AST.DAG._models import DatasetSchedule +from vtlengine.AST.DAG._models import Schedule from vtlengine.AST.Grammar.tokens import ( AGGREGATE, ALL, @@ -115,7 +115,7 @@ class InterpreterAnalyzer(ASTTemplate): # Analysis mode only_semantic: bool = False # Memory efficient - ds_analysis: Optional[DatasetSchedule] = None + ds_analysis: Optional[Schedule] = None datapoints_paths: Optional[Dict[str, Path]] = None output_path: Optional[Union[str, Path]] = None # Time Period Representation @@ -195,7 +195,7 @@ def _save_datapoints_efficient(self, statement_num: int) -> None: or self.datasets[ds_name].data is None ): continue - if ds_name in self.ds_analysis.global_inputs: + if ds_name in self.ds_analysis.global_input_datasets: # We do not save global input datasets, only results of transformations self.datasets[ds_name].data = None continue diff --git a/tests/DAG/data/references/10.json b/tests/DAG/data/references/scheduling/10.json similarity index 97% rename from tests/DAG/data/references/10.json rename to tests/DAG/data/references/scheduling/10.json index 34df46b41..bf970b66c 100644 --- a/tests/DAG/data/references/10.json +++ b/tests/DAG/data/references/scheduling/10.json @@ -64,8 +64,5 @@ "BOP" ] }, - "global_inputs": [ - "BOP" - ], "persistent": [] } \ No newline at end of file diff --git a/tests/DAG/data/references/1.json b/tests/DAG/data/references/scheduling/11.json similarity index 77% rename from tests/DAG/data/references/1.json rename to tests/DAG/data/references/scheduling/11.json index 2621e2997..76743b78b 100644 --- a/tests/DAG/data/references/1.json +++ b/tests/DAG/data/references/scheduling/11.json @@ -1,13 +1,14 @@ { "insertion": {}, "deletion": { + "2": [ + "a" + ], "3": [ - "a", "b", "c" ] }, - "global_inputs": [], "persistent": [ "c" ] diff --git a/tests/DAG/data/references/scheduling/13.json b/tests/DAG/data/references/scheduling/13.json new file mode 100644 index 000000000..082d06b0d --- /dev/null +++ b/tests/DAG/data/references/scheduling/13.json @@ -0,0 +1,25 @@ +{ + "insertion": { + "1": [ + "SC_1", + "SC_2" + ], + "2": [ + "DS_1" + ] + }, + "deletion": { + "2": [ + "DS_1", + "DS_r", + "SC_r" + ], + "1": [ + "SC_1", + "SC_2" + ] + }, + "persistent": [ + "DS_r" + ] +} \ No newline at end of file diff --git a/tests/DAG/data/references/4.json b/tests/DAG/data/references/scheduling/16.json similarity index 56% rename from tests/DAG/data/references/4.json rename to tests/DAG/data/references/scheduling/16.json index dadf09596..741f492cb 100644 --- a/tests/DAG/data/references/4.json +++ b/tests/DAG/data/references/scheduling/16.json @@ -2,19 +2,20 @@ "insertion": { "1": [ "DS_1", - "DS_2" + "SC_1" ] }, "deletion": { "1": [ "DS_1", - "DS_2", "DS_r" + ], + "2": [ + "SC_1", + "SC_r" ] }, - "global_inputs": [ - "DS_1", - "DS_2" - ], - "persistent": [] + "persistent": [ + "DS_r" + ] } \ No newline at end of file diff --git a/tests/DAG/data/references/2.json b/tests/DAG/data/references/scheduling/2.json similarity index 90% rename from tests/DAG/data/references/2.json rename to tests/DAG/data/references/scheduling/2.json index 44dfb5825..902fd2827 100644 --- a/tests/DAG/data/references/2.json +++ b/tests/DAG/data/references/scheduling/2.json @@ -22,9 +22,6 @@ "A" ] }, - "global_inputs": [ - "A" - ], "persistent": [ "a", "b", diff --git a/tests/DAG/data/references/3.json b/tests/DAG/data/references/scheduling/3.json similarity index 89% rename from tests/DAG/data/references/3.json rename to tests/DAG/data/references/scheduling/3.json index 0e1cfdbd7..2099a26ea 100644 --- a/tests/DAG/data/references/3.json +++ b/tests/DAG/data/references/scheduling/3.json @@ -29,10 +29,6 @@ "A" ] }, - "global_inputs": [ - "A", - "A2" - ], "persistent": [ "F" ] diff --git a/tests/DAG/data/references/scheduling/35.json b/tests/DAG/data/references/scheduling/35.json new file mode 100644 index 000000000..931078719 --- /dev/null +++ b/tests/DAG/data/references/scheduling/35.json @@ -0,0 +1,26 @@ +{ + "insertion": { + "1": [ + "DS_1", + "DS_2" + ], + "2": [ + "DS_3" + ] + }, + "deletion": { + "3": [ + "DS_A", + "DS_B", + "DS_r" + ], + "2": [ + "DS_1", + "DS_3" + ], + "1": [ + "DS_2" + ] + }, + "persistent": [] +} \ No newline at end of file diff --git a/tests/DAG/data/references/scheduling/36.json b/tests/DAG/data/references/scheduling/36.json new file mode 100644 index 000000000..4df60c70b --- /dev/null +++ b/tests/DAG/data/references/scheduling/36.json @@ -0,0 +1,24 @@ +{ + "insertion": { + "2": [ + "SC_1" + ], + "3": [ + "DS_1" + ] + }, + "deletion": { + "2": [ + "SC_1", + "SC_a" + ], + "3": [ + "DS_1", + "DS_r", + "SC_b" + ] + }, + "persistent": [ + "DS_r" + ] +} \ No newline at end of file diff --git a/tests/DAG/data/references/5.json b/tests/DAG/data/references/scheduling/5.json similarity index 90% rename from tests/DAG/data/references/5.json rename to tests/DAG/data/references/scheduling/5.json index 47e0d83cb..6e3d061ee 100644 --- a/tests/DAG/data/references/5.json +++ b/tests/DAG/data/references/scheduling/5.json @@ -29,10 +29,6 @@ "DSD_POP" ] }, - "global_inputs": [ - "DSD_AGR", - "DSD_POP" - ], "persistent": [ "DS_check_agr", "DS_check_countries", diff --git a/tests/DAG/data/references/6.json b/tests/DAG/data/references/scheduling/6.json similarity index 91% rename from tests/DAG/data/references/6.json rename to tests/DAG/data/references/scheduling/6.json index cc0c0e644..b8672add0 100644 --- a/tests/DAG/data/references/6.json +++ b/tests/DAG/data/references/scheduling/6.json @@ -54,12 +54,6 @@ "DS2" ] }, - "global_inputs": [ - "BIS_LOC_STATS", - "DS1", - "DS2", - "DS3" - ], "persistent": [ "numCouYear", "numYearCou", diff --git a/tests/DAG/data/references/7.json b/tests/DAG/data/references/scheduling/7.json similarity index 96% rename from tests/DAG/data/references/7.json rename to tests/DAG/data/references/scheduling/7.json index e1456f9c5..075c98c33 100644 --- a/tests/DAG/data/references/7.json +++ b/tests/DAG/data/references/scheduling/7.json @@ -506,23 +506,6 @@ "ANCRDT_ACCNTNG_C" ] }, - "global_inputs": [ - "ANCRDT_ACCNTNG_C", - "ANCRDT_ACCNTNG_C_Z", - "ANCRDT_ENTTY", - "ANCRDT_ENTTY_DFLT_C", - "ANCRDT_ENTTY_DFLT_C_T1", - "ANCRDT_ENTTY_INSTRMNT_C", - "ANCRDT_ENTTY_RSK_C", - "ANCRDT_FNNCL_C", - "ANCRDT_FNNCL_C_T1", - "ANCRDT_INSTRMNT_C", - "ANCRDT_INSTRMNT_C_T1", - "ANCRDT_INSTRMNT_PRTCTN_RCVD_C", - "ANCRDT_JNT_LBLTS_C", - "ANCRDT_PRTCTN_RCVD_C", - "ANCRDT_PRTCTN_RCVD_C_T1" - ], "persistent": [ "ACCNTNG_CMPLTNSS", "CN0230", diff --git a/tests/DAG/data/references/8.json b/tests/DAG/data/references/scheduling/8.json similarity index 89% rename from tests/DAG/data/references/8.json rename to tests/DAG/data/references/scheduling/8.json index 111d84582..5f728af27 100644 --- a/tests/DAG/data/references/8.json +++ b/tests/DAG/data/references/scheduling/8.json @@ -98,18 +98,6 @@ "ANCRDT_INSTRMNT_C_T3" ] }, - "global_inputs": [ - "ANCRDT_ACCNTNG_C", - "ANCRDT_ACCNTNG_C_T3", - "ANCRDT_ENTTY", - "ANCRDT_ENTTY_DFLT_C", - "ANCRDT_ENTTY_INSTRMNT_C", - "ANCRDT_FNNCL_C", - "ANCRDT_INSTRMNT_C", - "ANCRDT_INSTRMNT_C_T1", - "ANCRDT_INSTRMNT_C_T2", - "ANCRDT_INSTRMNT_C_T3" - ], "persistent": [ "DP_RLST_ACCNTNG_FRMWRK_RSLT", "DP_RLST_ACCNTNG_INTRCMPNY", diff --git a/tests/DAG/data/references/9.json b/tests/DAG/data/references/scheduling/9.json similarity index 94% rename from tests/DAG/data/references/9.json rename to tests/DAG/data/references/scheduling/9.json index 6381e3a2b..b487827b6 100644 --- a/tests/DAG/data/references/9.json +++ b/tests/DAG/data/references/scheduling/9.json @@ -128,13 +128,6 @@ "Oferta_PT_2025_Q1" ] }, - "global_inputs": [ - "Income_PT", - "Inflation_PT", - "Inflation_divisors_Q", - "Oferta_PT_2025_Q1", - "Vendas_PT_2025_Q1" - ], "persistent": [ "output_generic", "output_generic_eda", diff --git a/tests/DAG/data/vtl/11.vtl b/tests/DAG/data/vtl/11.vtl new file mode 100644 index 000000000..d666ead62 --- /dev/null +++ b/tests/DAG/data/vtl/11.vtl @@ -0,0 +1,3 @@ +a := 1; +b := a + 2; +c <- b * 3; diff --git a/tests/DAG/data/vtl/13.vtl b/tests/DAG/data/vtl/13.vtl new file mode 100644 index 000000000..298741c6d --- /dev/null +++ b/tests/DAG/data/vtl/13.vtl @@ -0,0 +1,2 @@ +SC_r := SC_1 + SC_2; +DS_r <- DS_1[calc Me_2 := Me_1 + SC_r]; diff --git a/tests/DAG/data/vtl/16.vtl b/tests/DAG/data/vtl/16.vtl new file mode 100644 index 000000000..97392464a --- /dev/null +++ b/tests/DAG/data/vtl/16.vtl @@ -0,0 +1,2 @@ +DS_r <- DS_1 + SC_1; +SC_r := SC_1 * 2; diff --git a/tests/DAG/data/vtl/35.vtl b/tests/DAG/data/vtl/35.vtl new file mode 100644 index 000000000..d5a7f8116 --- /dev/null +++ b/tests/DAG/data/vtl/35.vtl @@ -0,0 +1,3 @@ +DS_A := DS_1 + DS_2; +DS_B := DS_1 * DS_3; +DS_r := DS_A + DS_B; diff --git a/tests/DAG/data/vtl/36.vtl b/tests/DAG/data/vtl/36.vtl new file mode 100644 index 000000000..2d6cb0181 --- /dev/null +++ b/tests/DAG/data/vtl/36.vtl @@ -0,0 +1,3 @@ +SC_a := 10; +SC_b := SC_a + SC_1; +DS_r <- DS_1[calc Me_2 := Me_1 + SC_b]; diff --git a/tests/DAG/test_classification.py b/tests/DAG/test_classification.py new file mode 100644 index 000000000..65c711843 --- /dev/null +++ b/tests/DAG/test_classification.py @@ -0,0 +1,819 @@ +from dataclasses import dataclass, field +from pathlib import Path +from typing import List, Optional + +import pytest + +from vtlengine.API import create_ast +from vtlengine.AST.DAG import DAGAnalyzer + +data_path = Path(__file__).parent / "data" + + +@dataclass +class Classification: + datasets: List[str] = field(default_factory=list) + scalars: List[str] = field(default_factory=list) + dataset_or_scalar: List[str] = field(default_factory=list) + component_or_scalar: List[str] = field(default_factory=list) + vtl: Optional[str] = None + + +# Tests 1-11: vtl=None → read from VTL files (complex multi-statement scripts). +# Tests 12+: vtl is inline. +CASES: dict[str, Classification] = { + # --- File-based tests (complex multi-statement scripts) --- + "1": Classification(), + "2": Classification(dataset_or_scalar=["A"]), + "3": Classification( + datasets=["A", "A2"], + component_or_scalar=["var1", "var3", "varF", "varRel", "varRel2"], + ), + "4": Classification(datasets=["DS_1", "DS_2"]), + "5": Classification( + datasets=["DSD_AGR", "DSD_POP"], + component_or_scalar=["AGE", "MEASURE", "SEX", "TIME_HORIZ", "UNIT_MEASURE"], + ), + "6": Classification( + datasets=["BIS_LOC_STATS", "DS1", "DS2", "DS3"], + component_or_scalar=[ + "CURRENCY", + "CURRENCY_DENOM", + "EXCHANGE_RATE", + "EXR_SUFFIX", + "EXR_TYPE", + "FREQ", + "OBS_VALUE", + ], + ), + "7": Classification( + datasets=[ + "ANCRDT_ACCNTNG_C", + "ANCRDT_ACCNTNG_C_Z", + "ANCRDT_ENTTY", + "ANCRDT_ENTTY_DFLT_C", + "ANCRDT_ENTTY_DFLT_C_T1", + "ANCRDT_ENTTY_INSTRMNT_C", + "ANCRDT_ENTTY_RSK_C", + "ANCRDT_FNNCL_C", + "ANCRDT_FNNCL_C_T1", + "ANCRDT_INSTRMNT_C", + "ANCRDT_INSTRMNT_C_T1", + "ANCRDT_INSTRMNT_PRTCTN_RCVD_C", + "ANCRDT_JNT_LBLTS_C", + "ANCRDT_PRTCTN_RCVD_C", + "ANCRDT_PRTCTN_RCVD_C_T1", + ], + component_or_scalar=[ + "ACCMLTD_WRTFFS", + "ANCRDT_DRGTN_QRTR_CR_OA", + "CC0010", + "CNTRY", + "CRDTR", + "CRDTR_CD", + "DBTR_CD", + "DFLT_STTS", + "DT_BRTH", + "DT_INCPTN", + "DT_RFRNC", + "ENTTY_RIAD_CD", + "ENTTY_RL", + "FRGN_BRNCH", + "HD_OFFC_UNDRT_CD", + "HD_OFFC_UNDRT_CNTRY", + "HD_QRTR_CD_CRDTR", + "HD_QRTR_CD_DBTR", + "IMMDT_PRNT_UNDRT_CD", + "INSTTTNL_SCTR", + "INSTTTNL_SCTR_DTL", + "IS_PRTCTN_PRVDR", + "LGL_FRM", + "OBSRVD_AGNT_CD", + "OFF_BLNC_SHT_AMNT", + "OTHR_TYP_ENTTY", + "OTSTNDNG_NMNL_AMNT", + "PRTCTN_ALLCTD_VL", + "PRTCTN_PRVDR_CD", + "RCGNTN_STTS", + "RCRS", + "SPFUND", + "SRVCR", + "SSMSIGNIFICANCE", + "THRD_PRTY_PRRTY_CLMS", + "TRD_RCVBL_NN_RCRS", + "TTL_NMBR_DBTRS", + "TTL_NMBR_DFLT_DBTRS", + "TYP_INSTRMNT", + "TYP_PRTCTN", + "TYP_SCRTSTN", + "ULTMT_PRNT_UNDRT_CD", + ], + ), + "8": Classification( + datasets=[ + "ANCRDT_ACCNTNG_C", + "ANCRDT_ACCNTNG_C_T3", + "ANCRDT_ENTTY", + "ANCRDT_ENTTY_DFLT_C", + "ANCRDT_ENTTY_INSTRMNT_C", + "ANCRDT_FNNCL_C", + "ANCRDT_INSTRMNT_C", + "ANCRDT_INSTRMNT_C_T1", + "ANCRDT_INSTRMNT_C_T2", + "ANCRDT_INSTRMNT_C_T3", + ], + component_or_scalar=[ + "ENTTY_RIAD_CD", + "ENTTY_RL", + "HD_OFFC_UNDRT_CD", + "LGL_ENTTY_CD", + "OBSRVD_AGNT_CD", + ], + ), + "9": Classification( + datasets=[ + "Income_PT", + "Inflation_PT", + "Inflation_divisors_Q", + "Oferta_PT_2025_Q1", + "Vendas_PT_2025_Q1", + ], + component_or_scalar=[ + "coefficient", + "coefficient_cq", + "coefficient_inv", + "coefficient_lc", + "coefficient_lcq", + "coefficient_q", + "county", + "divisor", + "estado", + "income", + "period_label", + "regiao", + "value", + "var", + "year_str", + ], + ), + "10": Classification( + datasets=["BOP"], + component_or_scalar=[ + "ACCOUNTING_ENTRY", + "ADJUSTMENT", + "COMP_METHOD", + "COUNTERPART_SECTOR", + "CURRENCY_DENOM", + "FLOW_STOCK_ENTRY", + "FREQ", + "FUNCTIONAL_CAT", + "INSTR_ASSET", + "INT_ACC_ITEM", + "MATURITY", + "REF_SECTOR", + "VALUATION", + "imbalance", + ], + ), + "11": Classification(), + # --- Inline tests --- + # Calc with external component/scalar + "12": Classification( + vtl="DS_r <- DS_1[calc Me_2 := Me_1 * SC_1];", + datasets=["DS_1"], + component_or_scalar=["Me_1", "SC_1"], + ), + # Scalar chain with UDO + "13": Classification( + vtl="SC_a := SC_1 + SC_2;\nDS_r <- DS_1[calc Me_2 := Me_1 + SC_a];", + datasets=["DS_1"], + scalars=["SC_1", "SC_2"], + component_or_scalar=["Me_1"], + ), + # Dual binary op + "14": Classification( + vtl="DS_r <- DS_1 + DS_2;", + dataset_or_scalar=["DS_1", "DS_2"], + ), + # Scalar chain feeding calc + "15": Classification( + vtl="SC_r := 10;\nDS_r <- DS_1[calc Me_2 := Me_1 + SC_r];", + datasets=["DS_1"], + component_or_scalar=["Me_1"], + ), + # Mixed dataset_or_scalar + scalar chain + "16": Classification( + vtl="DS_r <- DS_1 + SC_1;\nSC_r := SC_1 * 2;", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # Membership + "17": Classification(vtl="DS_r := DS_1#Me_1;", datasets=["DS_1"]), + # Set operators (dataset-only) + "18": Classification(vtl="DS_r <- union(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + "19": Classification(vtl="DS_r <- intersect(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + "20": Classification(vtl="DS_r <- setdiff(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + # If-then-else (dual) + "21": Classification( + vtl="DS_r := if DS_1 then DS_2 else DS_3;", + dataset_or_scalar=["DS_1", "DS_2", "DS_3"], + ), + "22": Classification( + vtl="SC_r := if true then SC_1 else SC_2;", + dataset_or_scalar=["SC_1", "SC_2"], + ), + # Comparison / logical (dual) + "23": Classification(vtl="DS_r := DS_1 > DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "24": Classification(vtl="DS_r := DS_1 and DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "25": Classification(vtl="DS_r := not DS_1;", dataset_or_scalar=["DS_1"]), + # Numeric (dual) + "26": Classification(vtl="DS_r := abs(DS_1);", dataset_or_scalar=["DS_1"]), + "27": Classification(vtl="SC_r := abs(SC_1);", dataset_or_scalar=["SC_1"]), + # Aggregation (dataset-only) + "28": Classification(vtl="DS_r := sum(DS_1);", datasets=["DS_1"]), + # Clause operators + "29": Classification( + vtl="DS_r <- DS_1[filter Me_1 > 10];", + datasets=["DS_1"], + component_or_scalar=["Me_1"], + ), + "30": Classification(vtl="DS_r <- DS_1[keep Me_1];", datasets=["DS_1"]), + "31": Classification(vtl="DS_r <- DS_1[drop Me_1];", datasets=["DS_1"]), + "32": Classification(vtl="DS_r <- DS_1[rename Me_1 to Me_2];", datasets=["DS_1"]), + "33": Classification( + vtl="DS_r := DS_1[sub Id_1 = 1];", + datasets=["DS_1"], + component_or_scalar=["Id_1"], + ), + # Parameterized (dual) + "34": Classification(vtl="DS_r := round(DS_1, 2);", dataset_or_scalar=["DS_1"]), + # Multi-statement with intermediates + "35": Classification( + vtl="DS_A := DS_1 + DS_2;\nDS_B := DS_1 * DS_3;\nDS_r := DS_A + DS_B;", + dataset_or_scalar=["DS_1", "DS_2", "DS_3"], + ), + # Scalar chain propagation + "36": Classification( + vtl="SC_a := 10;\nSC_b := SC_a + SC_1;\nDS_r <- DS_1[calc Me_2 := Me_1 + SC_b];", + datasets=["DS_1"], + scalars=["SC_1"], + component_or_scalar=["Me_1"], + ), + # Join (dataset-only) + "37": Classification(vtl="DS_r := inner_join(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + # Dual unary + "38": Classification(vtl="DS_r := isnull(DS_1);", dataset_or_scalar=["DS_1"]), + "39": Classification(vtl="DS_r := -DS_1;", dataset_or_scalar=["DS_1"]), + # Calc with multiple external refs + "40": Classification( + vtl="DS_r <- DS_1[calc Me_2 := Me_1 + SC_1, Me_3 := Me_1 * SC_2];", + datasets=["DS_1"], + component_or_scalar=["Me_1", "SC_1", "SC_2"], + ), + # UDO with typed parameters + "41": Classification( + vtl=( + "define operator my_op (ds dataset, sc number)\n" + " returns dataset is\n" + " ds * sc\n" + "end operator;\n\n" + "DS_r := my_op(DS_1, SC_1);" + ), + datasets=["DS_1"], + scalars=["SC_1"], + ), + # Time operators (dataset-only) + "42": Classification(vtl="DS_r := flow_to_stock(DS_1);", datasets=["DS_1"]), + "43": Classification(vtl="DS_r := stock_to_flow(DS_1);", datasets=["DS_1"]), + "44": Classification(vtl="DS_r := exists_in(DS_1, DS_2, all);", datasets=["DS_1", "DS_2"]), + "45": Classification(vtl="DS_r := timeshift(DS_1, 1);", datasets=["DS_1"]), + # --- Dual BinOp operators --- + "46": Classification(vtl="DS_r := DS_1 / DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "47": Classification(vtl="DS_r := mod(DS_1, DS_2);", dataset_or_scalar=["DS_1", "DS_2"]), + "48": Classification(vtl="DS_r := DS_1 || DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "49": Classification(vtl="DS_r := DS_1 or DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "50": Classification(vtl="DS_r := DS_1 xor DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "51": Classification(vtl="DS_r := DS_1 = DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "52": Classification(vtl="DS_r := DS_1 <> DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "53": Classification(vtl="DS_r := DS_1 >= DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "54": Classification(vtl="DS_r := DS_1 < DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "55": Classification(vtl="DS_r := DS_1 <= DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "56": Classification(vtl="DS_r := DS_1 in {1, 2, 3};", dataset_or_scalar=["DS_1"]), + "57": Classification(vtl="DS_r := DS_1 not_in {1, 2, 3};", dataset_or_scalar=["DS_1"]), + "58": Classification( + vtl='DS_r := match_characters(DS_1, "[a-z]+");', + dataset_or_scalar=["DS_1"], + ), + "59": Classification(vtl="DS_r := nvl(DS_1, 0);", dataset_or_scalar=["DS_1"]), + # --- Dual UnaryOp operators --- + "60": Classification(vtl="DS_r := exp(DS_1);", dataset_or_scalar=["DS_1"]), + "61": Classification(vtl="DS_r := ln(DS_1);", dataset_or_scalar=["DS_1"]), + "62": Classification(vtl="DS_r := sqrt(DS_1);", dataset_or_scalar=["DS_1"]), + "63": Classification(vtl="DS_r := ceil(DS_1);", dataset_or_scalar=["DS_1"]), + "64": Classification(vtl="DS_r := floor(DS_1);", dataset_or_scalar=["DS_1"]), + "65": Classification(vtl="DS_r := +DS_1;", dataset_or_scalar=["DS_1"]), + "66": Classification(vtl="DS_r := length(DS_1);", dataset_or_scalar=["DS_1"]), + "67": Classification(vtl="DS_r := upper(DS_1);", dataset_or_scalar=["DS_1"]), + "68": Classification(vtl="DS_r := lower(DS_1);", dataset_or_scalar=["DS_1"]), + "69": Classification(vtl="DS_r := trim(DS_1);", dataset_or_scalar=["DS_1"]), + "70": Classification(vtl="DS_r := ltrim(DS_1);", dataset_or_scalar=["DS_1"]), + "71": Classification(vtl="DS_r := rtrim(DS_1);", dataset_or_scalar=["DS_1"]), + # --- Dual ParamOp operators --- + "72": Classification(vtl="DS_r := trunc(DS_1, 2);", dataset_or_scalar=["DS_1"]), + "73": Classification(vtl="DS_r := power(DS_1, 2);", dataset_or_scalar=["DS_1"]), + "74": Classification(vtl="DS_r := log(DS_1, 10);", dataset_or_scalar=["DS_1"]), + "75": Classification(vtl="DS_r := substr(DS_1, 1, 3);", dataset_or_scalar=["DS_1"]), + "76": Classification(vtl='DS_r := replace(DS_1, "a", "b");', dataset_or_scalar=["DS_1"]), + "77": Classification(vtl='DS_r := instr(DS_1, "a");', dataset_or_scalar=["DS_1"]), + "78": Classification(vtl="DS_r := cast(DS_1, integer);", dataset_or_scalar=["DS_1"]), + # --- Dual MulOp / conditional --- + "79": Classification(vtl="DS_r := between(DS_1, 1, 10);", dataset_or_scalar=["DS_1"]), + "80": Classification( + vtl="DS_r := case when DS_1 > 0 then DS_2 else DS_3;", + dataset_or_scalar=["DS_1", "DS_2", "DS_3"], + ), + # --- Dual time operators --- + "81": Classification( + vtl="DS_r := datediff(DS_1, DS_2);", + dataset_or_scalar=["DS_1", "DS_2"], + ), + "82": Classification(vtl='DS_r := dateadd(DS_1, 1, "M");', dataset_or_scalar=["DS_1"]), + "83": Classification(vtl="DS_r := getyear(DS_1);", dataset_or_scalar=["DS_1"]), + "84": Classification(vtl="DS_r := getmonth(DS_1);", dataset_or_scalar=["DS_1"]), + "85": Classification(vtl="DS_r := dayofmonth(DS_1);", dataset_or_scalar=["DS_1"]), + "86": Classification(vtl="DS_r := dayofyear(DS_1);", dataset_or_scalar=["DS_1"]), + # --- Dataset-only operators --- + "87": Classification(vtl="DS_r := symdiff(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + "88": Classification(vtl="DS_r := left_join(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + "89": Classification(vtl="DS_r := full_join(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + "90": Classification( + vtl="DS_r := cross_join(DS_1 as d1, DS_2 as d2);", + datasets=["DS_1", "DS_2"], + ), + "91": Classification(vtl="DS_r := count(DS_1);", datasets=["DS_1"]), + "92": Classification(vtl="DS_r := min(DS_1);", datasets=["DS_1"]), + "93": Classification(vtl="DS_r := max(DS_1);", datasets=["DS_1"]), + "94": Classification(vtl="DS_r := avg(DS_1);", datasets=["DS_1"]), + "95": Classification(vtl="DS_r := median(DS_1);", datasets=["DS_1"]), + "96": Classification(vtl="DS_r := stddev_pop(DS_1);", datasets=["DS_1"]), + "97": Classification(vtl="DS_r := stddev_samp(DS_1);", datasets=["DS_1"]), + "98": Classification(vtl="DS_r := var_pop(DS_1);", datasets=["DS_1"]), + "99": Classification(vtl="DS_r := var_samp(DS_1);", datasets=["DS_1"]), + "100": Classification(vtl="DS_r := fill_time_series(DS_1, all);", datasets=["DS_1"]), + "101": Classification(vtl="DS_r := period_indicator(DS_1);", datasets=["DS_1"]), + # --- Clause operators --- + "102": Classification( + vtl="DS_r := DS_1[aggr Me_1 := sum(Me_2) group by Id_1];", + datasets=["DS_1"], + component_or_scalar=["Me_2"], + ), + "103": Classification(vtl="DS_r := DS_1[pivot Id_1, Me_1];", datasets=["DS_1"]), + "104": Classification(vtl="DS_r := DS_1[unpivot Id_1, Me_1];", datasets=["DS_1"]), + # --- Analytic operators (operands are always components, never scalars) --- + "105": Classification( + vtl="DS_r := DS_1[calc Me_2 := first_value(Me_1 over (order by Id_1))];", + datasets=["DS_1"], + ), + "106": Classification( + vtl="DS_r := DS_1[calc Me_2 := last_value(Me_1 over (order by Id_1))];", + datasets=["DS_1"], + ), + "107": Classification( + vtl="DS_r := DS_1[calc Me_2 := lag(Me_1, 1 over (order by Id_1))];", + datasets=["DS_1"], + ), + "108": Classification( + vtl="DS_r := DS_1[calc Me_2 := lead(Me_1, 1 over (order by Id_1))];", + datasets=["DS_1"], + ), + "109": Classification( + vtl="DS_r := DS_1[calc Me_2 := rank(over (order by Id_1))];", + datasets=["DS_1"], + ), + "110": Classification( + vtl="DS_r := DS_1[calc Me_2 := ratio_to_report(Me_1 over (partition by Id_1))];", + datasets=["DS_1"], + ), + # --- Join clause operators --- + "111": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 calc Me_2 := Me_1 + SC_1);", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_1", "SC_1"], + ), + "112": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 filter Me_1 > 10);", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_1"], + ), + "113": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 keep Me_1);", + datasets=["DS_1", "DS_2"], + ), + "114": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 rename Me_1 to Me_2);", + datasets=["DS_1", "DS_2"], + ), + "115": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 using Id_1);", + datasets=["DS_1", "DS_2"], + ), + "116": Classification( + vtl="DS_r := inner_join(DS_1, DS_2, DS_3);", + datasets=["DS_1", "DS_2", "DS_3"], + ), + "117": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 using Id_1 calc Me_2 := Me_1 + SC_1);", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_1", "SC_1"], + ), + "118": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 aggr Me_1 := sum(Me_2) group by Id_1);", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_2"], + ), + "119": Classification( + vtl="DS_r := left_join(DS_1, DS_2 calc Me_2 := Me_1 * 2);", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_1"], + ), + "120": Classification( + vtl="DS_r := full_join(DS_1, DS_2 filter Me_1 > 0);", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_1"], + ), + # --- Validation operators --- + "121": Classification( + vtl='DS_r := check(DS_1#Me_1 > 0 errorcode "E001" errorlevel 1);', + datasets=["DS_1"], + ), + "122": Classification( + vtl='DS_r := check(not isnull(DS_1#Me_1) errorcode "E002" errorlevel 2 invalid);', + datasets=["DS_1"], + ), + "123": Classification( + vtl='DS_r := check(exists_in(DS_1, DS_2, true) errorcode "E003" errorlevel 3 all);', + datasets=["DS_1", "DS_2"], + ), + "124": Classification( + vtl=( + "define datapoint ruleset dpr1 (variable Me_1 as Number) is\n" + ' rule1: Me_1 > 0 errorcode "E001" errorlevel 1\n' + "end datapoint ruleset;\n\n" + "DS_r := check_datapoint(DS_1, dpr1);" + ), + datasets=["DS_1"], + ), + "125": Classification( + vtl=( + "define datapoint ruleset dpr1 (variable Me_1 as Number) is\n" + ' rule1: Me_1 > 0 errorcode "E001" errorlevel 1\n' + "end datapoint ruleset;\n\n" + "DS_r := check_datapoint(DS_1, dpr1 all);" + ), + datasets=["DS_1"], + ), + "128": Classification( + vtl='DS_r := check(DS_1 + SC_1 > 0 errorcode "E001" errorlevel 1);', + dataset_or_scalar=["DS_1", "SC_1"], + ), + # --- Hierarchical operators --- + "126": Classification( + vtl=( + "define hierarchical ruleset hr1 (variable rule Me_1) is\n" + " A = B + C\n" + "end hierarchical ruleset;\n\n" + "DS_r := hierarchy(DS_1, hr1 rule Me_1 non_null all);" + ), + datasets=["DS_1"], + ), + "127": Classification( + vtl=( + "define hierarchical ruleset hr1 (variable rule Me_1) is\n" + " A = B + C\n" + "end hierarchical ruleset;\n\n" + "DS_r := check_hierarchy(DS_1, hr1 rule Me_1 non_null all);" + ), + datasets=["DS_1"], + ), + # --- Dataset-only operators with mixed sub-expressions --- + # When a dataset-only operator wraps a complex expression (e.g., DS_1 + SC_1), + # the sub-expression operands should NOT all be forced to datasets. + "129": Classification( + vtl="DS_r := count(DS_1 + SC_1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "130": Classification( + vtl="DS_r := sum(DS_1 * SC_1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "131": Classification( + vtl="DS_r := avg(DS_1 + SC_1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "132": Classification( + vtl="DS_r := flow_to_stock(DS_1 + SC_1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "133": Classification( + vtl="DS_r := stock_to_flow(DS_1 - SC_1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "134": Classification( + vtl="DS_r := union(DS_1 + SC_1, DS_2);", + datasets=["DS_2"], + dataset_or_scalar=["DS_1", "SC_1"], + ), + "135": Classification( + vtl="DS_r := intersect(DS_1 + SC_1, DS_2);", + datasets=["DS_2"], + dataset_or_scalar=["DS_1", "SC_1"], + ), + "136": Classification( + vtl="DS_r := union(DS_1 + SC_1, DS_2 + SC_2);", + dataset_or_scalar=["DS_1", "DS_2", "SC_1", "SC_2"], + ), + "137": Classification( + vtl="DS_r := exists_in(DS_1 + SC_1, DS_2, all);", + datasets=["DS_2"], + dataset_or_scalar=["DS_1", "SC_1"], + ), + "138": Classification( + vtl="DS_r := timeshift(DS_1 + SC_1, 1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "139": Classification( + vtl="DS_r := (DS_1 + SC_1)#Me_1;", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # Calc clause with dataset-only sub-expression: dataset is direct VarID, + # but calc body contains sub-expression with external refs + "140": Classification( + vtl="DS_r <- DS_1[calc Me_2 := sum(Me_1 + SC_1)];", + datasets=["DS_1"], + component_or_scalar=["Me_1", "SC_1"], + ), + # --- Multi-statement ambiguity propagation --- + # Intermediate from mixed expr fed to aggregation: ambiguity propagates + "141": Classification( + vtl="DS_A := DS_1 + SC_1;\nDS_r := sum(DS_A);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # Intermediate from union fed to flow_to_stock with scalar + "142": Classification( + vtl="DS_A := union(DS_1, DS_2);\nDS_r := flow_to_stock(DS_A + SC_1);", + datasets=["DS_1", "DS_2"], + dataset_or_scalar=["SC_1"], + ), + # Intermediate from mixed expr fed to timeshift + "143": Classification( + vtl="DS_A := DS_1 + SC_1;\nDS_r := timeshift(DS_A, 1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # --- Nested dataset-only operators --- + # count(union(...)): inner union marks datasets, count wraps it + "144": Classification( + vtl="DS_r := count(union(DS_1, DS_2));", + datasets=["DS_1", "DS_2"], + ), + # union result combined with scalar in outer expression + "145": Classification( + vtl="DS_r := union(DS_1, DS_2) + SC_1;", + datasets=["DS_1", "DS_2"], + dataset_or_scalar=["SC_1"], + ), + # --- fill_time_series / period_indicator with expression --- + "146": Classification( + vtl="DS_r := fill_time_series(DS_1 + SC_1, all);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "147": Classification( + vtl="DS_r := period_indicator(DS_1 + SC_1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # --- setdiff / symdiff with expressions --- + "148": Classification( + vtl="DS_r := setdiff(DS_1 + SC_1, DS_2);", + datasets=["DS_2"], + dataset_or_scalar=["DS_1", "SC_1"], + ), + "149": Classification( + vtl="DS_r := symdiff(DS_1 + SC_1, DS_2 + SC_2);", + dataset_or_scalar=["DS_1", "DS_2", "SC_1", "SC_2"], + ), + # --- check / validation with mixed sub-expressions --- + # check wrapping aggregation: sum forces DS_1 to dataset + "150": Classification( + vtl='DS_r := check(sum(DS_1) > 0 errorcode "E001" errorlevel 1);', + datasets=["DS_1"], + ), + # check with membership + scalar: DS_1 is dataset (membership), SC_1 ambiguous + "151": Classification( + vtl='DS_r := check(DS_1#Me_1 + SC_1 > 0 errorcode "E001" errorlevel 1);', + datasets=["DS_1"], + dataset_or_scalar=["SC_1"], + ), + # --- Clause operators with external scalar in expression --- + # filter with external scalar ref + "152": Classification( + vtl="DS_r <- DS_1[filter Me_1 + SC_1 > 0];", + datasets=["DS_1"], + component_or_scalar=["Me_1", "SC_1"], + ), + # aggr with external scalar in aggregation body + "153": Classification( + vtl="DS_r := DS_1[aggr Me_1 := sum(Me_2 + SC_1) group by Id_1];", + datasets=["DS_1"], + component_or_scalar=["Me_2", "SC_1"], + ), + # --- Membership on expression result --- + "154": Classification( + vtl="DS_r := (DS_1 * SC_1)#Me_1;", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # --- Duration conversion operators (dual) --- + "155": Classification(vtl="DS_r := daytoyear(DS_1);", dataset_or_scalar=["DS_1"]), + "156": Classification(vtl="DS_r := daytomonth(DS_1);", dataset_or_scalar=["DS_1"]), + "157": Classification(vtl="DS_r := yeartoday(DS_1);", dataset_or_scalar=["DS_1"]), + "158": Classification(vtl="DS_r := monthtoday(DS_1);", dataset_or_scalar=["DS_1"]), + # --- time_agg (dual) --- + "159": Classification( + vtl='DS_r := time_agg("A", DS_1);', + dataset_or_scalar=["DS_1"], + ), + # --- current_date (no inputs) --- + "160": Classification(vtl="SC_r := current_date();"), + # --- random (seed=dual, index=scalar) --- + "161": Classification(vtl="SC_r := random(42, 1);"), + "162": Classification(vtl="DS_r := random(DS_1, 1);", dataset_or_scalar=["DS_1"]), + "165": Classification( + vtl="DS_r := random(DS_1, SC_1);", + scalars=["SC_1"], + dataset_or_scalar=["DS_1"], + ), + # --- eval (dataset-only: external SQL routine) --- + "163": Classification( + vtl=( + "DS_r := eval(my_routine(DS_1)" + ' language "SQL"' + " returns dataset { identifier Id_1, measure Me_1 });" + ), + datasets=["DS_1"], + ), + "164": Classification( + vtl=( + "DS_r := eval(my_routine(DS_1, DS_2)" + ' language "SQL"' + " returns dataset { identifier Id_1, measure Me_1 });" + ), + datasets=["DS_1", "DS_2"], + ), + # --- Operators with scalar-constrained parameters --- + # round: param is always scalar + "166": Classification( + vtl="DS_r := round(DS_1, SC_1);", + scalars=["SC_1"], + dataset_or_scalar=["DS_1"], + ), + # trunc: param is always scalar + "167": Classification( + vtl="DS_r := trunc(DS_1, SC_1);", + scalars=["SC_1"], + dataset_or_scalar=["DS_1"], + ), + # substr: start and length are always scalars + "168": Classification( + vtl="DS_r := substr(DS_1, SC_1, SC_2);", + scalars=["SC_1", "SC_2"], + dataset_or_scalar=["DS_1"], + ), + # replace: pattern and replacement are always scalars + "169": Classification( + vtl="DS_r := replace(DS_1, SC_1, SC_2);", + scalars=["SC_1", "SC_2"], + dataset_or_scalar=["DS_1"], + ), + # instr: pattern is always scalar + "170": Classification( + vtl="DS_r := instr(DS_1, SC_1);", + scalars=["SC_1"], + dataset_or_scalar=["DS_1"], + ), + # between: from and to are always scalars + "171": Classification( + vtl="DS_r := between(DS_1, SC_1, SC_2);", + scalars=["SC_1", "SC_2"], + dataset_or_scalar=["DS_1"], + ), + # --- Group A: Classification logic edge cases --- + # A1: Variable in both dataset_inputs (union) and scalar_inputs (round param) + "172": Classification( + vtl="DS_r := union(DS_1, DS_2);\nDS_r2 := round(DS_3, DS_1);", + datasets=["DS_2"], + dataset_or_scalar=["DS_1", "DS_3"], + ), + # A2: Scalar chain from resolved-from-unknown variable + "173": Classification( + vtl="DS_r <- DS_1[calc Me_2 := Me_1 + SC_b];\nSC_b := SC_1 + 10;", + datasets=["DS_1"], + scalars=["SC_1"], + component_or_scalar=["Me_1"], + ), + # A3: Scalar chain broken by dataset-only operator + "174": Classification( + vtl="SC_a := SC_1 + SC_2;\nDS_r := union(SC_a, DS_1);", + datasets=["DS_1"], + dataset_or_scalar=["SC_1", "SC_2"], + ), + # A4: Same variable in ambiguous AND scalar chain contexts + "175": Classification( + vtl="DS_A := DS_1 + SC_1;\nSC_r := SC_1 * 2;", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # A5: Multiple persistent assignments with shared variable + "176": Classification( + vtl="DS_r1 <- DS_1 + SC_1;\nDS_r2 <- DS_2 * SC_1;", + dataset_or_scalar=["DS_1", "DS_2", "SC_1"], + ), + # --- Group B: Component/scalar edge cases --- + # B1: Calc with multiple assignments referencing each other's components + "177": Classification( + vtl="DS_r <- DS_1[calc Me_2 := Me_1 + SC_1, Me_3 := sum(Me_2)];", + datasets=["DS_1"], + component_or_scalar=["Me_1", "Me_2", "SC_1"], + ), + # B2: Unknown variable not resolved (stays as component_or_scalar) + "178": Classification( + vtl="DS_A <- DS_1[calc Me_2 := Me_1 + X];\nDS_B := DS_A + 1;", + datasets=["DS_1"], + component_or_scalar=["Me_1", "X"], + ), + # B3: Unknown variable resolved by later output (not a global input) + "179": Classification( + vtl="DS_r <- DS_1[calc Me_2 := Me_1 + X];\nX := 10;", + datasets=["DS_1"], + component_or_scalar=["Me_1"], + ), + # --- Group C: Operator combination edge cases --- + # C1: Membership on union result + "180": Classification( + vtl="DS_r := (union(DS_1, DS_2))#Me_1;", + datasets=["DS_1", "DS_2"], + ), + # C2: Scalar-constrained param with expression operand + "181": Classification( + vtl="DS_r := round(DS_1 + SC_1, SC_2);", + scalars=["SC_2"], + dataset_or_scalar=["DS_1", "SC_1"], + ), + # C3: time_agg with expression operand + "182": Classification( + vtl='DS_r := time_agg("A", DS_1 + SC_1);', + dataset_or_scalar=["DS_1", "SC_1"], + ), + # C4: Check with two membership refs on same dataset + "183": Classification( + vtl='DS_r := check(DS_1#Me_1 + DS_1#Me_2 > 0 errorcode "E001" errorlevel 1 all);', + datasets=["DS_1"], + ), + # C5: Join with scalar-constrained param in calc + "184": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 calc Me_2 := round(Me_1, SC_1));", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_1", "SC_1"], + ), + # C6: Deeply nested dual operators (6 levels) + "185": Classification( + vtl="DS_r := abs(ceil(floor(exp(ln(sqrt(DS_1))))));", + dataset_or_scalar=["DS_1"], + ), + # C7: If-then-else with dataset-only operator in branch + "186": Classification( + vtl="DS_r := if DS_1 then union(DS_2, DS_3) else DS_4;", + datasets=["DS_2", "DS_3"], + dataset_or_scalar=["DS_1", "DS_4"], + ), +} + + +_SORTED_CODES = sorted(CASES.keys(), key=lambda k: int(k)) + + +@pytest.mark.parametrize("test_code", _SORTED_CODES) +def test_classification(test_code: str) -> None: + case = CASES[test_code] + if case.vtl is not None: + script = case.vtl + else: + with open(data_path / "vtl" / f"{test_code}.vtl") as f: + script = f.read() + + schedule = DAGAnalyzer.ds_structure(create_ast(script)) + + assert sorted(schedule.global_input_datasets) == case.datasets + assert sorted(schedule.global_input_scalars) == case.scalars + assert sorted(schedule.global_input_dataset_or_scalar) == case.dataset_or_scalar + assert sorted(schedule.global_input_component_or_scalar) == case.component_or_scalar + # global_inputs is the union of all four categories + all_classified = sorted( + case.datasets + case.scalars + case.dataset_or_scalar + case.component_or_scalar + ) + assert sorted(schedule.global_inputs) == all_classified diff --git a/tests/DAG/test_dag.py b/tests/DAG/test_dag.py deleted file mode 100644 index f7be96209..000000000 --- a/tests/DAG/test_dag.py +++ /dev/null @@ -1,49 +0,0 @@ -import json -from pathlib import Path -from typing import List - -import pytest - -from vtlengine.API import create_ast -from vtlengine.AST.DAG import DAGAnalyzer - -override = False -data_path = Path(__file__).parent / "data" - - -def _discover_tests(data_root: Path) -> List[str]: - return sorted(p.stem for p in (data_root / "vtl").iterdir() if p.is_file()) - - -def _normalize_ds_structure(ds_structure): - return json.loads( - json.dumps( - { - "insertion": {k: sorted(v) for k, v in ds_structure.insertion.items()}, - "deletion": {k: sorted(v) for k, v in ds_structure.deletion.items()}, - "global_inputs": sorted(ds_structure.global_inputs), - "persistent": sorted(ds_structure.persistent), - } - ) - ) - - -tests = _discover_tests(data_path) - - -@pytest.mark.parametrize("test_code", tests) -def test_ds_structure(test_code): - with open(data_path / "vtl" / f"{test_code}.vtl") as f: - script = f.read() - - ds_structures = DAGAnalyzer.ds_structure(create_ast(script)) - - if override: - with open(data_path / "references" / f"{test_code}.json", "w") as f: - json.dump(_normalize_ds_structure(ds_structures), f, indent=4) - - with open(data_path / "references" / f"{test_code}.json") as f: - reference = json.load(f) - - normalized_ds_structures = _normalize_ds_structure(ds_structures) - assert normalized_ds_structures == reference diff --git a/tests/DAG/test_scheduling.py b/tests/DAG/test_scheduling.py new file mode 100644 index 000000000..ba5552464 --- /dev/null +++ b/tests/DAG/test_scheduling.py @@ -0,0 +1,46 @@ +import json +from pathlib import Path +from typing import Any, Dict + +import pytest + +from vtlengine.API import create_ast +from vtlengine.AST.DAG import DAGAnalyzer + +override = False +data_path = Path(__file__).parent / "data" + + +def _normalize_scheduling(schedule: Any) -> Dict[str, Any]: + return json.loads( + json.dumps( + { + "insertion": {k: sorted(v) for k, v in schedule.insertion.items()}, + "deletion": {k: sorted(v) for k, v in schedule.deletion.items()}, + "persistent": sorted(schedule.persistent), + } + ) + ) + + +# Only keep tests with non-trivial scheduling (multiple insertion/deletion points). +NONTRIVIAL_TESTS = ["2", "3", "5", "6", "7", "8", "9", "10", "11", "13", "16", "35", "36"] + + +@pytest.mark.parametrize("test_code", NONTRIVIAL_TESTS) +def test_scheduling(test_code: str) -> None: + with open(data_path / "vtl" / f"{test_code}.vtl") as f: + script = f.read() + + schedule = DAGAnalyzer.ds_structure(create_ast(script)) + ref_path = data_path / "references" / "scheduling" / f"{test_code}.json" + + if override: + ref_path.parent.mkdir(parents=True, exist_ok=True) + with open(ref_path, "w") as f: + json.dump(_normalize_scheduling(schedule), f, indent=4) + + with open(ref_path) as f: + reference = json.load(f) + + assert _normalize_scheduling(schedule) == reference