From 674a1a5f257c3a25e94dee22c241d964f7e02069 Mon Sep 17 00:00:00 2001 From: Javier Hernandez Date: Thu, 5 Mar 2026 16:06:05 +0100 Subject: [PATCH 01/11] feat: add has_dataset_op to StatementDeps and four category fields to DatasetSchedule --- src/vtlengine/AST/DAG/_models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/vtlengine/AST/DAG/_models.py b/src/vtlengine/AST/DAG/_models.py index 359edc49e..92d7d4f9f 100644 --- a/src/vtlengine/AST/DAG/_models.py +++ b/src/vtlengine/AST/DAG/_models.py @@ -10,6 +10,7 @@ class StatementDeps: outputs: List[str] = field(default_factory=list) persistent: List[str] = field(default_factory=list) unknown_variables: List[str] = field(default_factory=list) + has_dataset_op: bool = False @dataclass @@ -22,4 +23,8 @@ class DatasetSchedule: insertion: Dict[int, List[str]] = field(default_factory=dict) deletion: Dict[int, List[str]] = field(default_factory=dict) global_inputs: List[str] = field(default_factory=list) + global_input_datasets: List[str] = field(default_factory=list) + global_input_scalars: List[str] = field(default_factory=list) + global_input_dataset_or_scalar: List[str] = field(default_factory=list) + global_input_component_or_scalar: List[str] = field(default_factory=list) persistent: List[str] = field(default_factory=list) From 079f31ae9342e217da0a74b978b28efc30db405c Mon Sep 17 00:00:00 2001 From: Javier Hernandez Date: Thu, 5 Mar 2026 16:11:36 +0100 Subject: [PATCH 02/11] feat: implement four-category global input classification with scalar propagation Track dataset operations per statement (has_dataset_op flag) and use fixed-point propagation to identify scalar outputs. Classify global inputs into: global_input_datasets, global_input_scalars, global_input_dataset_or_scalar, and global_input_component_or_scalar. Add 6 new DAG test cases (11-16) covering scalar chains, RegularAggregation ambiguity, top-level ambiguity, and mixed contexts. --- src/vtlengine/AST/DAG/__init__.py | 98 ++++++++++++++++++++++++++ tests/DAG/data/references/1.json | 4 ++ tests/DAG/data/references/10.json | 35 ++++++++++ tests/DAG/data/references/11.json | 20 ++++++ tests/DAG/data/references/12.json | 30 ++++++++ tests/DAG/data/references/13.json | 42 ++++++++++++ tests/DAG/data/references/14.json | 29 ++++++++ tests/DAG/data/references/15.json | 29 ++++++++ tests/DAG/data/references/16.json | 32 +++++++++ tests/DAG/data/references/2.json | 6 ++ tests/DAG/data/references/3.json | 18 +++++ tests/DAG/data/references/4.json | 7 ++ tests/DAG/data/references/5.json | 18 +++++ tests/DAG/data/references/6.json | 24 +++++++ tests/DAG/data/references/7.json | 110 ++++++++++++++++++++++++++++++ tests/DAG/data/references/8.json | 26 +++++++ tests/DAG/data/references/9.json | 46 +++++++++++++ tests/DAG/data/vtl/11.vtl | 3 + tests/DAG/data/vtl/12.vtl | 1 + tests/DAG/data/vtl/13.vtl | 2 + tests/DAG/data/vtl/14.vtl | 1 + tests/DAG/data/vtl/15.vtl | 2 + tests/DAG/data/vtl/16.vtl | 2 + tests/DAG/test_dag.py | 8 +++ 24 files changed, 593 insertions(+) create mode 100644 tests/DAG/data/references/11.json create mode 100644 tests/DAG/data/references/12.json create mode 100644 tests/DAG/data/references/13.json create mode 100644 tests/DAG/data/references/14.json create mode 100644 tests/DAG/data/references/15.json create mode 100644 tests/DAG/data/references/16.json create mode 100644 tests/DAG/data/vtl/11.vtl create mode 100644 tests/DAG/data/vtl/12.vtl create mode 100644 tests/DAG/data/vtl/13.vtl create mode 100644 tests/DAG/data/vtl/14.vtl create mode 100644 tests/DAG/data/vtl/15.vtl create mode 100644 tests/DAG/data/vtl/16.vtl diff --git a/src/vtlengine/AST/DAG/__init__.py b/src/vtlengine/AST/DAG/__init__.py index aed9d5145..f8f7abda7 100644 --- a/src/vtlengine/AST/DAG/__init__.py +++ b/src/vtlengine/AST/DAG/__init__.py @@ -60,8 +60,11 @@ class DAGAnalyzer(ASTTemplate): # Per-statement accumulator (reset between statements) current_deps: StatementDeps = field(default_factory=StatementDeps) + _current_has_dataset_op: bool = False # Cross-statement unknown variable tracking unknown_variables: Set[str] = field(default_factory=set) + # Outputs that were consumed via unknown_variables (RegularAggregation context) + _resolved_from_unknown: Set[str] = field(default_factory=set) @classmethod def ds_structure(cls, ast: AST) -> DatasetSchedule: @@ -105,11 +108,96 @@ def _ds_usage_analysis(self) -> DatasetSchedule: deletion[last_consumer.get(element, key)].append(element) insertion[key].append(element) + # --- Scalar output propagation --- + + # Seed: outputs of statements with no dataset ops and no inputs (constant assignments) + # Also seed outputs resolved from unknown_variables (used in RegularAggregation context) + scalar_outputs: Set[str] = set() + for statement in self.dependencies.values(): + reference = statement.outputs + statement.persistent + if not reference: + continue + ds_name = reference[0] + if not statement.has_dataset_op and not statement.inputs: + scalar_outputs.add(ds_name) + elif ds_name in self._resolved_from_unknown: + scalar_outputs.add(ds_name) + + # Collect component_or_scalar candidates from unknown_variables + component_or_scalar_candidates: Set[str] = set() + for statement in self.dependencies.values(): + for uv in statement.unknown_variables: + if uv not in all_outputs: + component_or_scalar_candidates.add(uv) + + # Propagate: statements with no dataset ops where all inputs are known scalars + changed = True + while changed: + changed = False + for statement in self.dependencies.values(): + reference = statement.outputs + statement.persistent + if not reference: + continue + ds_name = reference[0] + if ds_name in scalar_outputs or statement.has_dataset_op: + continue + if statement.inputs and all( + inp in scalar_outputs or inp in component_or_scalar_candidates + for inp in statement.inputs + ): + scalar_outputs.add(ds_name) + changed = True + + # --- Identify definite dataset inputs --- + definite_dataset_inputs: Set[str] = set() + for statement in self.dependencies.values(): + if statement.has_dataset_op: + for inp in statement.inputs: + definite_dataset_inputs.add(inp) + + # Include component_or_scalar candidates in global_inputs + for name in component_or_scalar_candidates: + if name not in global_set: + global_set.add(name) + global_inputs.append(name) + + # --- Classify global inputs into four categories --- + global_input_datasets: List[str] = [] + global_input_scalars: List[str] = [] + global_input_dataset_or_scalar: List[str] = [] + global_input_component_or_scalar: List[str] = [] + + for name in global_inputs: + if name in component_or_scalar_candidates: + global_input_component_or_scalar.append(name) + elif name in definite_dataset_inputs: + global_input_datasets.append(name) + else: + feeds_only_scalars = all( + (stmt.outputs + stmt.persistent)[0] in scalar_outputs + for stmt in self.dependencies.values() + if name in stmt.inputs and (stmt.outputs + stmt.persistent) + ) + no_dataset_ops = not any( + stmt.has_dataset_op + for stmt in self.dependencies.values() + if name in stmt.inputs + ) + if feeds_only_scalars and no_dataset_ops: + global_input_scalars.append(name) + else: + global_input_dataset_or_scalar.append(name) + return DatasetSchedule( insertion=dict(insertion), deletion=dict(deletion), global_inputs=global_inputs, + global_input_datasets=global_input_datasets, + global_input_scalars=global_input_scalars, + global_input_dataset_or_scalar=global_input_dataset_or_scalar, + global_input_component_or_scalar=global_input_component_or_scalar, persistent=persistent_datasets, + all_outputs=sorted(all_outputs), ) @classmethod @@ -212,6 +300,7 @@ def statement_structure(self) -> StatementDeps: outputs=list(self.current_deps.outputs), persistent=list(self.current_deps.persistent), unknown_variables=list(self.current_deps.unknown_variables), + has_dataset_op=self._current_has_dataset_op, ) self.unknown_variables.update(self.current_deps.unknown_variables) return result @@ -246,12 +335,14 @@ def visit_Start(self, node: Start) -> None: self.number_of_statements += 1 self.alias = set() self.current_deps = StatementDeps() + self._current_has_dataset_op = False aux = copy.copy(self.unknown_variables) for variable in aux: for _number_of_statement, dependency in self.dependencies.items(): if variable in dependency.outputs: self.unknown_variables.discard(variable) + self._resolved_from_unknown.add(variable) for _ns2, dep2 in self.dependencies.items(): if variable in dep2.unknown_variables: dep2.unknown_variables.remove(variable) @@ -272,6 +363,7 @@ def visit_PersistentAssignment(self, node: PersistentAssignment) -> None: self.visit(node.right) def visit_RegularAggregation(self, node: RegularAggregation) -> None: + self._current_has_dataset_op = True self.visit(node.dataset) if node.op in [KEEP, DROP, RENAME]: return @@ -313,6 +405,7 @@ def visit_Identifier(self, node: Identifier) -> None: and node.value not in self.alias and node.value not in self.current_deps.inputs ): + self._current_has_dataset_op = True self.current_deps.inputs.append(node.value) def visit_ParamOp(self, node: ParamOp) -> None: @@ -327,14 +420,17 @@ def visit_ParamOp(self, node: ParamOp) -> None: super(DAGAnalyzer, self).visit_ParamOp(node) def visit_Aggregation(self, node: Aggregation) -> None: + self._current_has_dataset_op = True if node.operand is not None: self.visit(node.operand) def visit_Analytic(self, node: Analytic) -> None: + self._current_has_dataset_op = True if node.operand is not None: self.visit(node.operand) def visit_JoinOp(self, node: JoinOp) -> None: + self._current_has_dataset_op = True for clause in node.clauses: self.visit(clause) @@ -350,10 +446,12 @@ def visit_UDOCall(self, node: UDOCall) -> None: def visit_HROperation(self, node: HROperation) -> None: """Visit HROperation node for dependency analysis.""" + self._current_has_dataset_op = True self.visit(node.dataset) def visit_DPValidation(self, node: DPValidation) -> None: """Visit DPValidation node for dependency analysis.""" + self._current_has_dataset_op = True self.visit(node.dataset) diff --git a/tests/DAG/data/references/1.json b/tests/DAG/data/references/1.json index 2621e2997..42a811fce 100644 --- a/tests/DAG/data/references/1.json +++ b/tests/DAG/data/references/1.json @@ -8,6 +8,10 @@ ] }, "global_inputs": [], + "global_input_datasets": [], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [], "persistent": [ "c" ] diff --git a/tests/DAG/data/references/10.json b/tests/DAG/data/references/10.json index 34df46b41..74b3870eb 100644 --- a/tests/DAG/data/references/10.json +++ b/tests/DAG/data/references/10.json @@ -65,7 +65,42 @@ ] }, "global_inputs": [ + "ACCOUNTING_ENTRY", + "ADJUSTMENT", + "BOP", + "COMP_METHOD", + "COUNTERPART_SECTOR", + "CURRENCY_DENOM", + "FLOW_STOCK_ENTRY", + "FREQ", + "FUNCTIONAL_CAT", + "INSTR_ASSET", + "INT_ACC_ITEM", + "MATURITY", + "REF_SECTOR", + "VALUATION", + "imbalance" + ], + "global_input_datasets": [ "BOP" ], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [ + "ACCOUNTING_ENTRY", + "ADJUSTMENT", + "COMP_METHOD", + "COUNTERPART_SECTOR", + "CURRENCY_DENOM", + "FLOW_STOCK_ENTRY", + "FREQ", + "FUNCTIONAL_CAT", + "INSTR_ASSET", + "INT_ACC_ITEM", + "MATURITY", + "REF_SECTOR", + "VALUATION", + "imbalance" + ], "persistent": [] } \ No newline at end of file diff --git a/tests/DAG/data/references/11.json b/tests/DAG/data/references/11.json new file mode 100644 index 000000000..53e6ccfef --- /dev/null +++ b/tests/DAG/data/references/11.json @@ -0,0 +1,20 @@ +{ + "insertion": {}, + "deletion": { + "2": [ + "a" + ], + "3": [ + "b", + "c" + ] + }, + "global_inputs": [], + "global_input_datasets": [], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [], + "persistent": [ + "c" + ] +} \ No newline at end of file diff --git a/tests/DAG/data/references/12.json b/tests/DAG/data/references/12.json new file mode 100644 index 000000000..53d80fe5f --- /dev/null +++ b/tests/DAG/data/references/12.json @@ -0,0 +1,30 @@ +{ + "insertion": { + "1": [ + "DS_1" + ] + }, + "deletion": { + "1": [ + "DS_1", + "DS_r" + ] + }, + "global_inputs": [ + "DS_1", + "Me_1", + "SC_1" + ], + "global_input_datasets": [ + "DS_1" + ], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [ + "Me_1", + "SC_1" + ], + "persistent": [ + "DS_r" + ] +} \ No newline at end of file diff --git a/tests/DAG/data/references/13.json b/tests/DAG/data/references/13.json new file mode 100644 index 000000000..12ee853e3 --- /dev/null +++ b/tests/DAG/data/references/13.json @@ -0,0 +1,42 @@ +{ + "insertion": { + "1": [ + "SC_1", + "SC_2" + ], + "2": [ + "DS_1" + ] + }, + "deletion": { + "2": [ + "DS_1", + "DS_r", + "SC_r" + ], + "1": [ + "SC_1", + "SC_2" + ] + }, + "global_inputs": [ + "DS_1", + "Me_1", + "SC_1", + "SC_2" + ], + "global_input_datasets": [ + "DS_1" + ], + "global_input_scalars": [ + "SC_1", + "SC_2" + ], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [ + "Me_1" + ], + "persistent": [ + "DS_r" + ] +} \ No newline at end of file diff --git a/tests/DAG/data/references/14.json b/tests/DAG/data/references/14.json new file mode 100644 index 000000000..34f8edeee --- /dev/null +++ b/tests/DAG/data/references/14.json @@ -0,0 +1,29 @@ +{ + "insertion": { + "1": [ + "DS_1", + "DS_2" + ] + }, + "deletion": { + "1": [ + "DS_1", + "DS_2", + "DS_r" + ] + }, + "global_inputs": [ + "DS_1", + "DS_2" + ], + "global_input_datasets": [], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [ + "DS_1", + "DS_2" + ], + "global_input_component_or_scalar": [], + "persistent": [ + "DS_r" + ] +} \ No newline at end of file diff --git a/tests/DAG/data/references/15.json b/tests/DAG/data/references/15.json new file mode 100644 index 000000000..f7cb442d6 --- /dev/null +++ b/tests/DAG/data/references/15.json @@ -0,0 +1,29 @@ +{ + "insertion": { + "2": [ + "DS_1" + ] + }, + "deletion": { + "2": [ + "DS_1", + "DS_r", + "SC_r" + ] + }, + "global_inputs": [ + "DS_1", + "Me_1" + ], + "global_input_datasets": [ + "DS_1" + ], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [ + "Me_1" + ], + "persistent": [ + "DS_r" + ] +} \ No newline at end of file diff --git a/tests/DAG/data/references/16.json b/tests/DAG/data/references/16.json new file mode 100644 index 000000000..c94e32353 --- /dev/null +++ b/tests/DAG/data/references/16.json @@ -0,0 +1,32 @@ +{ + "insertion": { + "1": [ + "DS_1", + "SC_1" + ] + }, + "deletion": { + "1": [ + "DS_1", + "DS_r" + ], + "2": [ + "SC_1", + "SC_r" + ] + }, + "global_inputs": [ + "DS_1", + "SC_1" + ], + "global_input_datasets": [], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [ + "DS_1", + "SC_1" + ], + "global_input_component_or_scalar": [], + "persistent": [ + "DS_r" + ] +} \ No newline at end of file diff --git a/tests/DAG/data/references/2.json b/tests/DAG/data/references/2.json index 44dfb5825..0e8dae7c7 100644 --- a/tests/DAG/data/references/2.json +++ b/tests/DAG/data/references/2.json @@ -25,6 +25,12 @@ "global_inputs": [ "A" ], + "global_input_datasets": [], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [ + "A" + ], + "global_input_component_or_scalar": [], "persistent": [ "a", "b", diff --git a/tests/DAG/data/references/3.json b/tests/DAG/data/references/3.json index 0e1cfdbd7..63e131dec 100644 --- a/tests/DAG/data/references/3.json +++ b/tests/DAG/data/references/3.json @@ -30,9 +30,27 @@ ] }, "global_inputs": [ + "A", + "A2", + "var1", + "var3", + "varF", + "varRel", + "varRel2" + ], + "global_input_datasets": [ "A", "A2" ], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [ + "var1", + "var3", + "varF", + "varRel", + "varRel2" + ], "persistent": [ "F" ] diff --git a/tests/DAG/data/references/4.json b/tests/DAG/data/references/4.json index dadf09596..8c1ba0627 100644 --- a/tests/DAG/data/references/4.json +++ b/tests/DAG/data/references/4.json @@ -16,5 +16,12 @@ "DS_1", "DS_2" ], + "global_input_datasets": [], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [ + "DS_1", + "DS_2" + ], + "global_input_component_or_scalar": [], "persistent": [] } \ No newline at end of file diff --git a/tests/DAG/data/references/5.json b/tests/DAG/data/references/5.json index 47e0d83cb..0fad1b8ad 100644 --- a/tests/DAG/data/references/5.json +++ b/tests/DAG/data/references/5.json @@ -30,9 +30,27 @@ ] }, "global_inputs": [ + "AGE", + "DSD_AGR", + "DSD_POP", + "MEASURE", + "SEX", + "TIME_HORIZ", + "UNIT_MEASURE" + ], + "global_input_datasets": [ "DSD_AGR", "DSD_POP" ], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [ + "AGE", + "MEASURE", + "SEX", + "TIME_HORIZ", + "UNIT_MEASURE" + ], "persistent": [ "DS_check_agr", "DS_check_countries", diff --git a/tests/DAG/data/references/6.json b/tests/DAG/data/references/6.json index cc0c0e644..aac54e973 100644 --- a/tests/DAG/data/references/6.json +++ b/tests/DAG/data/references/6.json @@ -55,11 +55,35 @@ ] }, "global_inputs": [ + "BIS_LOC_STATS", + "CURRENCY", + "CURRENCY_DENOM", + "DS1", + "DS2", + "DS3", + "EXCHANGE_RATE", + "EXR_SUFFIX", + "EXR_TYPE", + "FREQ", + "OBS_VALUE" + ], + "global_input_datasets": [ "BIS_LOC_STATS", "DS1", "DS2", "DS3" ], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [ + "CURRENCY", + "CURRENCY_DENOM", + "EXCHANGE_RATE", + "EXR_SUFFIX", + "EXR_TYPE", + "FREQ", + "OBS_VALUE" + ], "persistent": [ "numCouYear", "numYearCou", diff --git a/tests/DAG/data/references/7.json b/tests/DAG/data/references/7.json index e1456f9c5..e6ccc00ae 100644 --- a/tests/DAG/data/references/7.json +++ b/tests/DAG/data/references/7.json @@ -507,8 +507,10 @@ ] }, "global_inputs": [ + "ACCMLTD_WRTFFS", "ANCRDT_ACCNTNG_C", "ANCRDT_ACCNTNG_C_Z", + "ANCRDT_DRGTN_QRTR_CR_OA", "ANCRDT_ENTTY", "ANCRDT_ENTTY_DFLT_C", "ANCRDT_ENTTY_DFLT_C_T1", @@ -521,8 +523,116 @@ "ANCRDT_INSTRMNT_PRTCTN_RCVD_C", "ANCRDT_JNT_LBLTS_C", "ANCRDT_PRTCTN_RCVD_C", + "ANCRDT_PRTCTN_RCVD_C_T1", + "CC0010", + "CNTRY", + "CRDTR", + "CRDTR_CD", + "DBTR_CD", + "DFLT_STTS", + "DT_BRTH", + "DT_INCPTN", + "DT_RFRNC", + "ENTTY_RIAD_CD", + "ENTTY_RL", + "FRGN_BRNCH", + "HD_OFFC_UNDRT_CD", + "HD_OFFC_UNDRT_CNTRY", + "HD_QRTR_CD_CRDTR", + "HD_QRTR_CD_DBTR", + "IMMDT_PRNT_UNDRT_CD", + "INSTTTNL_SCTR", + "INSTTTNL_SCTR_DTL", + "IS_PRTCTN_PRVDR", + "LGL_FRM", + "OBSRVD_AGNT_CD", + "OFF_BLNC_SHT_AMNT", + "OTHR_TYP_ENTTY", + "OTSTNDNG_NMNL_AMNT", + "PRTCTN_ALLCTD_VL", + "PRTCTN_PRVDR_CD", + "RCGNTN_STTS", + "RCRS", + "SPFUND", + "SRVCR", + "SSMSIGNIFICANCE", + "THRD_PRTY_PRRTY_CLMS", + "TRD_RCVBL_NN_RCRS", + "TTL_NMBR_DBTRS", + "TTL_NMBR_DFLT_DBTRS", + "TYP_INSTRMNT", + "TYP_PRTCTN", + "TYP_SCRTSTN", + "ULTMT_PRNT_UNDRT_CD", + "bool_var", + "int_var" + ], + "global_input_datasets": [ + "ANCRDT_ACCNTNG_C", + "ANCRDT_ACCNTNG_C_Z", + "ANCRDT_ENTTY", + "ANCRDT_ENTTY_DFLT_C", + "ANCRDT_ENTTY_DFLT_C_T1", + "ANCRDT_ENTTY_INSTRMNT_C", + "ANCRDT_ENTTY_RSK_C", + "ANCRDT_FNNCL_C", + "ANCRDT_INSTRMNT_C", + "ANCRDT_INSTRMNT_PRTCTN_RCVD_C", + "ANCRDT_JNT_LBLTS_C", + "ANCRDT_PRTCTN_RCVD_C", "ANCRDT_PRTCTN_RCVD_C_T1" ], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [ + "ANCRDT_FNNCL_C_T1", + "ANCRDT_INSTRMNT_C_T1" + ], + "global_input_component_or_scalar": [ + "ACCMLTD_WRTFFS", + "ANCRDT_DRGTN_QRTR_CR_OA", + "CC0010", + "CNTRY", + "CRDTR", + "CRDTR_CD", + "DBTR_CD", + "DFLT_STTS", + "DT_BRTH", + "DT_INCPTN", + "DT_RFRNC", + "ENTTY_RIAD_CD", + "ENTTY_RL", + "FRGN_BRNCH", + "HD_OFFC_UNDRT_CD", + "HD_OFFC_UNDRT_CNTRY", + "HD_QRTR_CD_CRDTR", + "HD_QRTR_CD_DBTR", + "IMMDT_PRNT_UNDRT_CD", + "INSTTTNL_SCTR", + "INSTTTNL_SCTR_DTL", + "IS_PRTCTN_PRVDR", + "LGL_FRM", + "OBSRVD_AGNT_CD", + "OFF_BLNC_SHT_AMNT", + "OTHR_TYP_ENTTY", + "OTSTNDNG_NMNL_AMNT", + "PRTCTN_ALLCTD_VL", + "PRTCTN_PRVDR_CD", + "RCGNTN_STTS", + "RCRS", + "SPFUND", + "SRVCR", + "SSMSIGNIFICANCE", + "THRD_PRTY_PRRTY_CLMS", + "TRD_RCVBL_NN_RCRS", + "TTL_NMBR_DBTRS", + "TTL_NMBR_DFLT_DBTRS", + "TYP_INSTRMNT", + "TYP_PRTCTN", + "TYP_SCRTSTN", + "ULTMT_PRNT_UNDRT_CD", + "bool_var", + "int_var" + ], "persistent": [ "ACCNTNG_CMPLTNSS", "CN0230", diff --git a/tests/DAG/data/references/8.json b/tests/DAG/data/references/8.json index 111d84582..dacbdc15f 100644 --- a/tests/DAG/data/references/8.json +++ b/tests/DAG/data/references/8.json @@ -99,6 +99,23 @@ ] }, "global_inputs": [ + "ANCRDT_ACCNTNG_C", + "ANCRDT_ACCNTNG_C_T3", + "ANCRDT_ENTTY", + "ANCRDT_ENTTY_DFLT_C", + "ANCRDT_ENTTY_INSTRMNT_C", + "ANCRDT_FNNCL_C", + "ANCRDT_INSTRMNT_C", + "ANCRDT_INSTRMNT_C_T1", + "ANCRDT_INSTRMNT_C_T2", + "ANCRDT_INSTRMNT_C_T3", + "ENTTY_RIAD_CD", + "ENTTY_RL", + "HD_OFFC_UNDRT_CD", + "LGL_ENTTY_CD", + "OBSRVD_AGNT_CD" + ], + "global_input_datasets": [ "ANCRDT_ACCNTNG_C", "ANCRDT_ACCNTNG_C_T3", "ANCRDT_ENTTY", @@ -110,6 +127,15 @@ "ANCRDT_INSTRMNT_C_T2", "ANCRDT_INSTRMNT_C_T3" ], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [ + "ENTTY_RIAD_CD", + "ENTTY_RL", + "HD_OFFC_UNDRT_CD", + "LGL_ENTTY_CD", + "OBSRVD_AGNT_CD" + ], "persistent": [ "DP_RLST_ACCNTNG_FRMWRK_RSLT", "DP_RLST_ACCNTNG_INTRCMPNY", diff --git a/tests/DAG/data/references/9.json b/tests/DAG/data/references/9.json index 6381e3a2b..c90da87a3 100644 --- a/tests/DAG/data/references/9.json +++ b/tests/DAG/data/references/9.json @@ -132,9 +132,55 @@ "Income_PT", "Inflation_PT", "Inflation_divisors_Q", + "Oferta_PT_2025_Q1", + "Vendas_PT_2025_Q1", + "coefficient", + "coefficient_cq", + "coefficient_inv", + "coefficient_l", + "coefficient_lc", + "coefficient_lcq", + "coefficient_lq", + "coefficient_q", + "county", + "divisor", + "estado", + "income", + "period_label", + "regiao", + "value", + "var", + "year_str" + ], + "global_input_datasets": [ + "Income_PT", + "Inflation_PT", + "Inflation_divisors_Q" + ], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [ "Oferta_PT_2025_Q1", "Vendas_PT_2025_Q1" ], + "global_input_component_or_scalar": [ + "coefficient", + "coefficient_cq", + "coefficient_inv", + "coefficient_l", + "coefficient_lc", + "coefficient_lcq", + "coefficient_lq", + "coefficient_q", + "county", + "divisor", + "estado", + "income", + "period_label", + "regiao", + "value", + "var", + "year_str" + ], "persistent": [ "output_generic", "output_generic_eda", diff --git a/tests/DAG/data/vtl/11.vtl b/tests/DAG/data/vtl/11.vtl new file mode 100644 index 000000000..d666ead62 --- /dev/null +++ b/tests/DAG/data/vtl/11.vtl @@ -0,0 +1,3 @@ +a := 1; +b := a + 2; +c <- b * 3; diff --git a/tests/DAG/data/vtl/12.vtl b/tests/DAG/data/vtl/12.vtl new file mode 100644 index 000000000..f9ee4c4bd --- /dev/null +++ b/tests/DAG/data/vtl/12.vtl @@ -0,0 +1 @@ +DS_r <- DS_1[calc Me_2 := Me_1 * SC_1]; diff --git a/tests/DAG/data/vtl/13.vtl b/tests/DAG/data/vtl/13.vtl new file mode 100644 index 000000000..298741c6d --- /dev/null +++ b/tests/DAG/data/vtl/13.vtl @@ -0,0 +1,2 @@ +SC_r := SC_1 + SC_2; +DS_r <- DS_1[calc Me_2 := Me_1 + SC_r]; diff --git a/tests/DAG/data/vtl/14.vtl b/tests/DAG/data/vtl/14.vtl new file mode 100644 index 000000000..e19c38657 --- /dev/null +++ b/tests/DAG/data/vtl/14.vtl @@ -0,0 +1 @@ +DS_r <- DS_1 + DS_2; diff --git a/tests/DAG/data/vtl/15.vtl b/tests/DAG/data/vtl/15.vtl new file mode 100644 index 000000000..b17fb76c0 --- /dev/null +++ b/tests/DAG/data/vtl/15.vtl @@ -0,0 +1,2 @@ +SC_r := 10; +DS_r <- DS_1[calc Me_2 := Me_1 + SC_r]; diff --git a/tests/DAG/data/vtl/16.vtl b/tests/DAG/data/vtl/16.vtl new file mode 100644 index 000000000..97392464a --- /dev/null +++ b/tests/DAG/data/vtl/16.vtl @@ -0,0 +1,2 @@ +DS_r <- DS_1 + SC_1; +SC_r := SC_1 * 2; diff --git a/tests/DAG/test_dag.py b/tests/DAG/test_dag.py index f7be96209..7edbeb54c 100644 --- a/tests/DAG/test_dag.py +++ b/tests/DAG/test_dag.py @@ -22,6 +22,14 @@ def _normalize_ds_structure(ds_structure): "insertion": {k: sorted(v) for k, v in ds_structure.insertion.items()}, "deletion": {k: sorted(v) for k, v in ds_structure.deletion.items()}, "global_inputs": sorted(ds_structure.global_inputs), + "global_input_datasets": sorted(ds_structure.global_input_datasets), + "global_input_scalars": sorted(ds_structure.global_input_scalars), + "global_input_dataset_or_scalar": sorted( + ds_structure.global_input_dataset_or_scalar + ), + "global_input_component_or_scalar": sorted( + ds_structure.global_input_component_or_scalar + ), "persistent": sorted(ds_structure.persistent), } ) From 23a2fe670ad1d6d1a8bfa9c74e9bd2ffea5f4579 Mon Sep 17 00:00:00 2001 From: Javier Hernandez Date: Thu, 5 Mar 2026 16:13:06 +0100 Subject: [PATCH 03/11] feat: update consumers to use categorized global input fields --- src/vtlengine/API/__init__.py | 3 ++- src/vtlengine/Interpreter/__init__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/vtlengine/API/__init__.py b/src/vtlengine/API/__init__.py index 33dce44f4..3d1a4679d 100644 --- a/src/vtlengine/API/__init__.py +++ b/src/vtlengine/API/__init__.py @@ -88,7 +88,8 @@ def _extract_input_datasets(script: Union[str, TransformationScheme, Path]) -> L raise TypeError("Unsupported script type.") ast = create_ast(vtl_script) - dag_inputs = DAGAnalyzer.ds_structure(ast).global_inputs + ds = DAGAnalyzer.ds_structure(ast) + dag_inputs = ds.global_input_datasets + ds.global_input_dataset_or_scalar return dag_inputs diff --git a/src/vtlengine/Interpreter/__init__.py b/src/vtlengine/Interpreter/__init__.py index eda8789bc..8cdd19888 100644 --- a/src/vtlengine/Interpreter/__init__.py +++ b/src/vtlengine/Interpreter/__init__.py @@ -195,7 +195,7 @@ def _save_datapoints_efficient(self, statement_num: int) -> None: or self.datasets[ds_name].data is None ): continue - if ds_name in self.ds_analysis.global_inputs: + if ds_name in self.ds_analysis.global_input_datasets: # We do not save global input datasets, only results of transformations self.datasets[ds_name].data = None continue From 5d9405f3c63e48d424e060d51babea785c2d661f Mon Sep 17 00:00:00 2001 From: Javier Hernandez Date: Thu, 5 Mar 2026 16:16:38 +0100 Subject: [PATCH 04/11] refactor: extract helper methods to reduce complexity in DAG analysis --- src/vtlengine/AST/DAG/__init__.py | 143 +++++++++++++++++------------- 1 file changed, 83 insertions(+), 60 deletions(-) diff --git a/src/vtlengine/AST/DAG/__init__.py b/src/vtlengine/AST/DAG/__init__.py index f8f7abda7..7f3cd7808 100644 --- a/src/vtlengine/AST/DAG/__init__.py +++ b/src/vtlengine/AST/DAG/__init__.py @@ -108,27 +108,86 @@ def _ds_usage_analysis(self) -> DatasetSchedule: deletion[last_consumer.get(element, key)].append(element) insertion[key].append(element) - # --- Scalar output propagation --- + classification = self._classify_global_inputs( + all_outputs, global_inputs, global_set + ) + + return DatasetSchedule( + insertion=dict(insertion), + deletion=dict(deletion), + global_inputs=classification["global_inputs"], + global_input_datasets=classification["global_input_datasets"], + global_input_scalars=classification["global_input_scalars"], + global_input_dataset_or_scalar=classification["global_input_dataset_or_scalar"], + global_input_component_or_scalar=classification[ + "global_input_component_or_scalar" + ], + persistent=persistent_datasets, + all_outputs=sorted(all_outputs), + ) + + def _classify_global_inputs( + self, + all_outputs: Set[str], + global_inputs: List[str], + global_set: Set[str], + ) -> Dict[str, List[str]]: + """Classify global inputs into datasets, scalars, and ambiguous categories.""" + scalar_outputs, comp_or_scalar = self._compute_scalar_outputs(all_outputs) + + # Identify definite dataset inputs + definite_dataset_inputs: Set[str] = set() + for statement in self.dependencies.values(): + if statement.has_dataset_op: + for inp in statement.inputs: + definite_dataset_inputs.add(inp) + + # Include component_or_scalar candidates in global_inputs + for name in comp_or_scalar: + if name not in global_set: + global_set.add(name) + global_inputs.append(name) + + # Classify into four categories + result: Dict[str, List[str]] = { + "global_inputs": global_inputs, + "global_input_datasets": [], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [], + } + + for name in global_inputs: + if name in comp_or_scalar: + result["global_input_component_or_scalar"].append(name) + elif name in definite_dataset_inputs: + result["global_input_datasets"].append(name) + elif self._feeds_only_scalar_chains(name, scalar_outputs): + result["global_input_scalars"].append(name) + else: + result["global_input_dataset_or_scalar"].append(name) - # Seed: outputs of statements with no dataset ops and no inputs (constant assignments) - # Also seed outputs resolved from unknown_variables (used in RegularAggregation context) + return result + + def _compute_scalar_outputs( + self, all_outputs: Set[str] + ) -> tuple[Set[str], Set[str]]: + """Compute scalar outputs via propagation and component/scalar candidates.""" scalar_outputs: Set[str] = set() for statement in self.dependencies.values(): reference = statement.outputs + statement.persistent if not reference: continue ds_name = reference[0] - if not statement.has_dataset_op and not statement.inputs: - scalar_outputs.add(ds_name) - elif ds_name in self._resolved_from_unknown: + is_constant_assignment = not statement.has_dataset_op and not statement.inputs + if is_constant_assignment or ds_name in self._resolved_from_unknown: scalar_outputs.add(ds_name) - # Collect component_or_scalar candidates from unknown_variables - component_or_scalar_candidates: Set[str] = set() + comp_or_scalar: Set[str] = set() for statement in self.dependencies.values(): for uv in statement.unknown_variables: if uv not in all_outputs: - component_or_scalar_candidates.add(uv) + comp_or_scalar.add(uv) # Propagate: statements with no dataset ops where all inputs are known scalars changed = True @@ -142,63 +201,27 @@ def _ds_usage_analysis(self) -> DatasetSchedule: if ds_name in scalar_outputs or statement.has_dataset_op: continue if statement.inputs and all( - inp in scalar_outputs or inp in component_or_scalar_candidates + inp in scalar_outputs or inp in comp_or_scalar for inp in statement.inputs ): scalar_outputs.add(ds_name) changed = True - # --- Identify definite dataset inputs --- - definite_dataset_inputs: Set[str] = set() - for statement in self.dependencies.values(): - if statement.has_dataset_op: - for inp in statement.inputs: - definite_dataset_inputs.add(inp) - - # Include component_or_scalar candidates in global_inputs - for name in component_or_scalar_candidates: - if name not in global_set: - global_set.add(name) - global_inputs.append(name) + return scalar_outputs, comp_or_scalar - # --- Classify global inputs into four categories --- - global_input_datasets: List[str] = [] - global_input_scalars: List[str] = [] - global_input_dataset_or_scalar: List[str] = [] - global_input_component_or_scalar: List[str] = [] - - for name in global_inputs: - if name in component_or_scalar_candidates: - global_input_component_or_scalar.append(name) - elif name in definite_dataset_inputs: - global_input_datasets.append(name) - else: - feeds_only_scalars = all( - (stmt.outputs + stmt.persistent)[0] in scalar_outputs - for stmt in self.dependencies.values() - if name in stmt.inputs and (stmt.outputs + stmt.persistent) - ) - no_dataset_ops = not any( - stmt.has_dataset_op - for stmt in self.dependencies.values() - if name in stmt.inputs - ) - if feeds_only_scalars and no_dataset_ops: - global_input_scalars.append(name) - else: - global_input_dataset_or_scalar.append(name) - - return DatasetSchedule( - insertion=dict(insertion), - deletion=dict(deletion), - global_inputs=global_inputs, - global_input_datasets=global_input_datasets, - global_input_scalars=global_input_scalars, - global_input_dataset_or_scalar=global_input_dataset_or_scalar, - global_input_component_or_scalar=global_input_component_or_scalar, - persistent=persistent_datasets, - all_outputs=sorted(all_outputs), - ) + def _feeds_only_scalar_chains(self, name: str, scalar_outputs: Set[str]) -> bool: + """Check if a global input feeds only into scalar-output statements.""" + has_consumers = False + for stmt in self.dependencies.values(): + if name not in stmt.inputs: + continue + has_consumers = True + if stmt.has_dataset_op: + return False + reference = stmt.outputs + stmt.persistent + if reference and reference[0] not in scalar_outputs: + return False + return has_consumers @classmethod def create_dag(cls, ast: Start) -> "DAGAnalyzer": From 5aca4adf18bdeaf51d02ffa8b8336458ee330e53 Mon Sep 17 00:00:00 2001 From: Javier Hernandez Date: Thu, 5 Mar 2026 16:33:48 +0100 Subject: [PATCH 05/11] feat: fix dataset classification for UDO calls and MEMBERSHIP operator Set _current_has_dataset_op in visit_ParamOp, visit_UDOCall, and visit_BinOp (MEMBERSHIP) so UDO dataset parameters and membership operands are correctly classified as global_input_datasets. --- src/vtlengine/AST/DAG/__init__.py | 28 +++++++++++++--------------- src/vtlengine/AST/DAG/_models.py | 1 + tests/DAG/data/references/17.json | 23 +++++++++++++++++++++++ tests/DAG/data/references/4.json | 8 ++++---- tests/DAG/data/references/7.json | 7 +++---- tests/DAG/data/references/9.json | 7 +++---- tests/DAG/data/vtl/17.vtl | 1 + 7 files changed, 48 insertions(+), 27 deletions(-) create mode 100644 tests/DAG/data/references/17.json create mode 100644 tests/DAG/data/vtl/17.vtl diff --git a/src/vtlengine/AST/DAG/__init__.py b/src/vtlengine/AST/DAG/__init__.py index 7f3cd7808..9ee185272 100644 --- a/src/vtlengine/AST/DAG/__init__.py +++ b/src/vtlengine/AST/DAG/__init__.py @@ -40,7 +40,7 @@ from vtlengine.AST.DAG._models import DatasetSchedule, StatementDeps from vtlengine.AST.Grammar.tokens import AS, DROP, KEEP, MEMBERSHIP, RENAME, TO from vtlengine.Exceptions import SemanticError -from vtlengine.Model import Component +from vtlengine.Model import Component, Dataset @dataclass @@ -108,9 +108,7 @@ def _ds_usage_analysis(self) -> DatasetSchedule: deletion[last_consumer.get(element, key)].append(element) insertion[key].append(element) - classification = self._classify_global_inputs( - all_outputs, global_inputs, global_set - ) + classification = self._classify_global_inputs(all_outputs, global_inputs, global_set) return DatasetSchedule( insertion=dict(insertion), @@ -119,9 +117,7 @@ def _ds_usage_analysis(self) -> DatasetSchedule: global_input_datasets=classification["global_input_datasets"], global_input_scalars=classification["global_input_scalars"], global_input_dataset_or_scalar=classification["global_input_dataset_or_scalar"], - global_input_component_or_scalar=classification[ - "global_input_component_or_scalar" - ], + global_input_component_or_scalar=classification["global_input_component_or_scalar"], persistent=persistent_datasets, all_outputs=sorted(all_outputs), ) @@ -169,9 +165,7 @@ def _classify_global_inputs( return result - def _compute_scalar_outputs( - self, all_outputs: Set[str] - ) -> tuple[Set[str], Set[str]]: + def _compute_scalar_outputs(self, all_outputs: Set[str]) -> tuple[Set[str], Set[str]]: """Compute scalar outputs via propagation and component/scalar candidates.""" scalar_outputs: Set[str] = set() for statement in self.dependencies.values(): @@ -201,8 +195,7 @@ def _compute_scalar_outputs( if ds_name in scalar_outputs or statement.has_dataset_op: continue if statement.inputs and all( - inp in scalar_outputs or inp in comp_or_scalar - for inp in statement.inputs + inp in scalar_outputs or inp in comp_or_scalar for inp in statement.inputs ): scalar_outputs.add(ds_name) changed = True @@ -397,6 +390,7 @@ def visit_RegularAggregation(self, node: RegularAggregation) -> None: def visit_BinOp(self, node: BinOp) -> None: if node.op == MEMBERSHIP: + self._current_has_dataset_op = True self.is_dataset = True self.visit(node.left) self.is_dataset = False @@ -438,6 +432,7 @@ def visit_ParamOp(self, node: ParamOp) -> None: for arg in node.params: index_arg = node.params.index(arg) if do_ast.parameters[index_arg].type_.kind == "DataSet": + self._current_has_dataset_op = True self.visit(arg) else: super(DAGAnalyzer, self).visit_ParamOp(node) @@ -462,9 +457,12 @@ def visit_UDOCall(self, node: UDOCall) -> None: if not node_args: super().visit_UDOCall(node) else: - node_sig = [type(p.type_) for p in node_args.parameters] - for sig, param in zip(node_sig, node.params): - if not isinstance(param, Constant) and sig is not Component: + for p, param in zip(node_args.parameters, node.params): + if isinstance(param, Constant): + continue + if type(p.type_) is not Component: + if isinstance(p.type_, Dataset): + self._current_has_dataset_op = True self.visit(param) def visit_HROperation(self, node: HROperation) -> None: diff --git a/src/vtlengine/AST/DAG/_models.py b/src/vtlengine/AST/DAG/_models.py index 92d7d4f9f..1e41b62ad 100644 --- a/src/vtlengine/AST/DAG/_models.py +++ b/src/vtlengine/AST/DAG/_models.py @@ -28,3 +28,4 @@ class DatasetSchedule: global_input_dataset_or_scalar: List[str] = field(default_factory=list) global_input_component_or_scalar: List[str] = field(default_factory=list) persistent: List[str] = field(default_factory=list) + all_outputs: List[str] = field(default_factory=list) diff --git a/tests/DAG/data/references/17.json b/tests/DAG/data/references/17.json new file mode 100644 index 000000000..9696fa802 --- /dev/null +++ b/tests/DAG/data/references/17.json @@ -0,0 +1,23 @@ +{ + "insertion": { + "1": [ + "DS_1" + ] + }, + "deletion": { + "1": [ + "DS_1", + "DS_r" + ] + }, + "global_inputs": [ + "DS_1" + ], + "global_input_datasets": [ + "DS_1" + ], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], + "global_input_component_or_scalar": [], + "persistent": [] +} \ No newline at end of file diff --git a/tests/DAG/data/references/4.json b/tests/DAG/data/references/4.json index 8c1ba0627..e84302eaa 100644 --- a/tests/DAG/data/references/4.json +++ b/tests/DAG/data/references/4.json @@ -16,12 +16,12 @@ "DS_1", "DS_2" ], - "global_input_datasets": [], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [ + "global_input_datasets": [ "DS_1", "DS_2" ], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], "global_input_component_or_scalar": [], "persistent": [] -} \ No newline at end of file +} diff --git a/tests/DAG/data/references/7.json b/tests/DAG/data/references/7.json index e6ccc00ae..757fa6dac 100644 --- a/tests/DAG/data/references/7.json +++ b/tests/DAG/data/references/7.json @@ -576,17 +576,16 @@ "ANCRDT_ENTTY_INSTRMNT_C", "ANCRDT_ENTTY_RSK_C", "ANCRDT_FNNCL_C", + "ANCRDT_FNNCL_C_T1", "ANCRDT_INSTRMNT_C", + "ANCRDT_INSTRMNT_C_T1", "ANCRDT_INSTRMNT_PRTCTN_RCVD_C", "ANCRDT_JNT_LBLTS_C", "ANCRDT_PRTCTN_RCVD_C", "ANCRDT_PRTCTN_RCVD_C_T1" ], "global_input_scalars": [], - "global_input_dataset_or_scalar": [ - "ANCRDT_FNNCL_C_T1", - "ANCRDT_INSTRMNT_C_T1" - ], + "global_input_dataset_or_scalar": [], "global_input_component_or_scalar": [ "ACCMLTD_WRTFFS", "ANCRDT_DRGTN_QRTR_CR_OA", diff --git a/tests/DAG/data/references/9.json b/tests/DAG/data/references/9.json index c90da87a3..e92a92fc4 100644 --- a/tests/DAG/data/references/9.json +++ b/tests/DAG/data/references/9.json @@ -155,13 +155,12 @@ "global_input_datasets": [ "Income_PT", "Inflation_PT", - "Inflation_divisors_Q" - ], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [ + "Inflation_divisors_Q", "Oferta_PT_2025_Q1", "Vendas_PT_2025_Q1" ], + "global_input_scalars": [], + "global_input_dataset_or_scalar": [], "global_input_component_or_scalar": [ "coefficient", "coefficient_cq", diff --git a/tests/DAG/data/vtl/17.vtl b/tests/DAG/data/vtl/17.vtl new file mode 100644 index 000000000..2e9470e04 --- /dev/null +++ b/tests/DAG/data/vtl/17.vtl @@ -0,0 +1 @@ +DS_r := DS_1#Me_1; From 978f2206f8af1328f2c0f66fd416d950c1672923 Mon Sep 17 00:00:00 2001 From: Javier Hernandez Date: Thu, 5 Mar 2026 16:33:57 +0100 Subject: [PATCH 06/11] chore: add types-networkx to dev dependencies --- poetry.lock | 56 ++++++++++++++++++++++++++++++++------------------ pyproject.toml | 1 + 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/poetry.lock b/poetry.lock index f695794a1..50f1c7744 100644 --- a/poetry.lock +++ b/poetry.lock @@ -7,7 +7,7 @@ description = "Async client for aws services using botocore and aiohttp" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "aiobotocore-2.26.0-py3-none-any.whl", hash = "sha256:a793db51c07930513b74ea7a95bd79aaa42f545bdb0f011779646eafa216abec"}, {file = "aiobotocore-2.26.0.tar.gz", hash = "sha256:50567feaf8dfe2b653570b4491f5bc8c6e7fb9622479d66442462c021db4fadc"}, @@ -34,7 +34,7 @@ description = "Happy Eyeballs for asyncio" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8"}, {file = "aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558"}, @@ -47,7 +47,7 @@ description = "Async http client/server framework (asyncio)" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "aiohttp-3.13.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d5a372fd5afd301b3a89582817fdcdb6c34124787c70dbcc616f259013e7eef7"}, {file = "aiohttp-3.13.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:147e422fd1223005c22b4fe080f5d93ced44460f5f9c105406b753612b587821"}, @@ -191,7 +191,7 @@ description = "itertools and builtins for AsyncIO and mixed iterables" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "aioitertools-0.13.0-py3-none-any.whl", hash = "sha256:0be0292b856f08dfac90e31f4739432f4cb6d7520ab9eb73e143f4f2fa5259be"}, {file = "aioitertools-0.13.0.tar.gz", hash = "sha256:620bd241acc0bbb9ec819f1ab215866871b4bbd1f73836a55f799200ee86950c"}, @@ -207,7 +207,7 @@ description = "aiosignal: a list of registered asynchronous callbacks" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e"}, {file = "aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7"}, @@ -267,7 +267,7 @@ description = "Timeout context manager for asyncio programs" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "(extra == \"all\" or extra == \"s3\") and python_version < \"3.11\"" +markers = "(extra == \"s3\" or extra == \"all\") and python_version < \"3.11\"" files = [ {file = "async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c"}, {file = "async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"}, @@ -307,7 +307,7 @@ description = "Low-level, data-driven core of boto 3." optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "botocore-1.41.5-py3-none-any.whl", hash = "sha256:3fef7fcda30c82c27202d232cfdbd6782cb27f20f8e7e21b20606483e66ee73a"}, {file = "botocore-1.41.5.tar.gz", hash = "sha256:0367622b811597d183bfcaab4a350f0d3ede712031ce792ef183cabdee80d3bf"}, @@ -699,7 +699,7 @@ description = "A list-like structure which implements collections.abc.MutableSeq optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "frozenlist-1.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b37f6d31b3dcea7deb5e9696e529a6aa4a898adc33db82da12e4c60a7c4d2011"}, {file = "frozenlist-1.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef2b7b394f208233e471abc541cc6991f907ffd47dc72584acee3147899d6565"}, @@ -840,7 +840,7 @@ description = "File-system specification" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "python_version == \"3.9\" and (extra == \"all\" or extra == \"s3\")" +markers = "python_version == \"3.9\" and (extra == \"s3\" or extra == \"all\")" files = [ {file = "fsspec-2025.10.0-py3-none-any.whl", hash = "sha256:7c7712353ae7d875407f97715f0e1ffcc21e33d5b24556cb1e090ae9409ec61d"}, {file = "fsspec-2025.10.0.tar.gz", hash = "sha256:b6789427626f068f9a83ca4e8a3cc050850b6c0f71f99ddb4f542b8266a26a59"}, @@ -881,7 +881,7 @@ description = "File-system specification" optional = true python-versions = ">=3.10" groups = ["main"] -markers = "python_version >= \"3.10\" and (extra == \"all\" or extra == \"s3\")" +markers = "python_version >= \"3.10\" and (extra == \"s3\" or extra == \"all\")" files = [ {file = "fsspec-2025.12.0-py3-none-any.whl", hash = "sha256:8bf1fe301b7d8acfa6e8571e3b1c3d158f909666642431cc78a1b7b4dbc5ec5b"}, {file = "fsspec-2025.12.0.tar.gz", hash = "sha256:c505de011584597b1060ff778bb664c1bc022e87921b0e4f10cc9c44f9635973"}, @@ -1118,7 +1118,7 @@ description = "JSON Matching Expressions" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, @@ -1668,7 +1668,7 @@ description = "multidict implementation" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "multidict-6.7.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:9f474ad5acda359c8758c8accc22032c6abe6dc87a8be2440d097785e27a9349"}, {file = "multidict-6.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b7a9db5a870f780220e931d0002bbfd88fb53aceb6293251e2c839415c1b20e"}, @@ -2229,7 +2229,7 @@ description = "Accelerated property cache" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "propcache-0.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c2d1fa3201efaf55d730400d945b5b3ab6e672e100ba0f9a409d950ab25d7db"}, {file = "propcache-0.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1eb2994229cc8ce7fe9b3db88f5465f5fd8651672840b2e426b88cdb1a30aac8"}, @@ -2945,7 +2945,7 @@ description = "Convenient Filesystem interface over S3" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "python_version == \"3.9\" and (extra == \"all\" or extra == \"s3\")" +markers = "python_version == \"3.9\" and (extra == \"s3\" or extra == \"all\")" files = [ {file = "s3fs-2025.10.0-py3-none-any.whl", hash = "sha256:da7ef25efc1541f5fca8e1116361e49ea1081f83f4e8001fbd77347c625da28a"}, {file = "s3fs-2025.10.0.tar.gz", hash = "sha256:e8be6cddc77aceea1681ece0f472c3a7f8ef71a0d2acddb1cc92bb6afa3e9e4f"}, @@ -2967,7 +2967,7 @@ description = "Convenient Filesystem interface over S3" optional = true python-versions = ">=3.10" groups = ["main"] -markers = "python_version >= \"3.10\" and (extra == \"all\" or extra == \"s3\")" +markers = "python_version >= \"3.10\" and (extra == \"s3\" or extra == \"all\")" files = [ {file = "s3fs-2025.12.0-py3-none-any.whl", hash = "sha256:89d51e0744256baad7ae5410304a368ca195affd93a07795bc8ba9c00c9effbb"}, {file = "s3fs-2025.12.0.tar.gz", hash = "sha256:8612885105ce14d609c5b807553f9f9956b45541576a17ff337d9435ed3eb01f"}, @@ -3298,6 +3298,22 @@ files = [ [package.dependencies] referencing = "*" +[[package]] +name = "types-networkx" +version = "3.6.1.20260303" +description = "Typing stubs for networkx" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +markers = "python_version >= \"3.10\"" +files = [ + {file = "types_networkx-3.6.1.20260303-py3-none-any.whl", hash = "sha256:754c7c7bcaab3c317b0b86441240c0a5bd0d2f419aba80a88e9718248a5c89af"}, + {file = "types_networkx-3.6.1.20260303.tar.gz", hash = "sha256:8248aa6fcadc08bd7992af6e412bfc5cfa043bda5ce7ab407fa591c808ce8557"}, +] + +[package.dependencies] +numpy = ">=1.20" + [[package]] name = "types-pytz" version = "2025.2.0.20251108" @@ -3346,7 +3362,7 @@ files = [ {file = "urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e"}, {file = "urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32"}, ] -markers = {main = "python_version == \"3.9\" and (extra == \"all\" or extra == \"s3\")", docs = "python_version == \"3.9\""} +markers = {main = "python_version == \"3.9\" and (extra == \"s3\" or extra == \"all\")", docs = "python_version == \"3.9\""} [package.extras] brotli = ["brotli (==1.0.9) ; os_name != \"nt\" and python_version < \"3\" and platform_python_implementation == \"CPython\"", "brotli (>=1.0.9) ; python_version >= \"3\" and platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; (os_name != \"nt\" or python_version >= \"3\") and platform_python_implementation != \"CPython\"", "brotlipy (>=0.6.0) ; os_name == \"nt\" and python_version < \"3\""] @@ -3364,7 +3380,7 @@ files = [ {file = "urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd"}, {file = "urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797"}, ] -markers = {main = "python_version >= \"3.10\" and (extra == \"all\" or extra == \"s3\")"} +markers = {main = "python_version >= \"3.10\" and (extra == \"s3\" or extra == \"all\")"} [package.extras] brotli = ["brotli (>=1.2.0) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=1.2.0.0) ; platform_python_implementation != \"CPython\""] @@ -3379,7 +3395,7 @@ description = "Module for decorators, wrappers and monkey patching." optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "wrapt-1.17.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88bbae4d40d5a46142e70d58bf664a89b6b4befaea7b2ecc14e03cedb8e06c04"}, {file = "wrapt-1.17.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6b13af258d6a9ad602d57d889f83b9d5543acd471eee12eb51f5b01f8eb1bc2"}, @@ -3486,7 +3502,7 @@ description = "Yet another URL library" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"all\" or extra == \"s3\"" +markers = "extra == \"s3\" or extra == \"all\"" files = [ {file = "yarl-1.22.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c7bd6683587567e5a49ee6e336e0612bec8329be1b7d4c8af5687dcdeb67ee1e"}, {file = "yarl-1.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5cdac20da754f3a723cceea5b3448e1a2074866406adeb4ef35b469d089adb8f"}, @@ -3653,4 +3669,4 @@ s3 = ["s3fs"] [metadata] lock-version = "2.1" python-versions = ">=3.9,<4.0" -content-hash = "44f127ea06ae4ebdcd56b245a4f9886dd3cedd5ffd64a38a110e4acc8fd4cc19" +content-hash = "ee401a8363109109d158a23c5dc04aad4198208c7287710111f40756e21e33c6" diff --git a/pyproject.toml b/pyproject.toml index a75fc10d7..d5c2855f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ pandas-stubs = ">=2.2.2,<3.0" pyarrow-stubs = ">=19.0,<21.0" ruff = ">=0.14,<1.0.0" types-jsonschema = ">=4.25.1,<5.0" +types-networkx = {markers = "python_version >= \"3.10\"", version = "^3.6.1.20260303"} [tool.poetry.group.docs.dependencies] sphinx = ">=7.4.7,<8.0" From a56f90f7a9f426c3d84584d481cf722e55fef267 Mon Sep 17 00:00:00 2001 From: Javier Hernandez Date: Thu, 5 Mar 2026 16:38:25 +0100 Subject: [PATCH 07/11] docs: add docstrings to DAG models and filter global inputs by all_outputs --- src/vtlengine/AST/DAG/__init__.py | 2 +- src/vtlengine/AST/DAG/_models.py | 39 +++++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/vtlengine/AST/DAG/__init__.py b/src/vtlengine/AST/DAG/__init__.py index 9ee185272..0905b1954 100644 --- a/src/vtlengine/AST/DAG/__init__.py +++ b/src/vtlengine/AST/DAG/__init__.py @@ -140,7 +140,7 @@ def _classify_global_inputs( # Include component_or_scalar candidates in global_inputs for name in comp_or_scalar: - if name not in global_set: + if name not in global_set and name not in all_outputs: global_set.add(name) global_inputs.append(name) diff --git a/src/vtlengine/AST/DAG/_models.py b/src/vtlengine/AST/DAG/_models.py index 1e41b62ad..17a31f180 100644 --- a/src/vtlengine/AST/DAG/_models.py +++ b/src/vtlengine/AST/DAG/_models.py @@ -4,7 +4,18 @@ @dataclass class StatementDeps: - """Per-statement dependency info tracked during AST visiting.""" + """Per-statement dependency info tracked during AST visiting. + + Attributes: + inputs: Variables consumed by this statement (excluding its own outputs). + outputs: Variables produced by this statement via ``:=`` assignment. + persistent: Variables produced by this statement via ``<-`` assignment. + unknown_variables: Variables inside RegularAggregation context that could + be either components of the dataset or external scalars. + has_dataset_op: Whether this statement involves a dataset operation + (RegularAggregation, JoinOp, Aggregation, Analytic, MEMBERSHIP, + UDO with dataset params, etc.). + """ inputs: List[str] = field(default_factory=list) outputs: List[str] = field(default_factory=list) @@ -17,7 +28,31 @@ class StatementDeps: class DatasetSchedule: """Typed result of DAG dataset usage analysis. - Tracks when datasets should be loaded/unloaded for memory-efficient execution. + Tracks when datasets should be loaded/unloaded for memory-efficient execution, + and classifies global inputs into four categories based on AST context. + + Attributes: + insertion: Statement index to list of datasets to load at that point + (first use). + deletion: Statement index to list of datasets to unload at that point + (last use). + global_inputs: All external dependencies not produced by the script. + Union of the four ``global_input_*`` categories below (no duplicates). + global_input_datasets: Definite datasets — used in dataset operations + (RegularAggregation operand, Identifier with kind="DatasetID", + UDO dataset params, MEMBERSHIP left operand, JoinOp, etc.). + global_input_scalars: Definite scalars — feed exclusively into scalar + chains propagated from constant assignments with no dataset ops. + global_input_dataset_or_scalar: Ambiguous at top level (e.g., + ``DS_r <- X + 2`` where X could be a dataset or scalar). + The caller may provide either. + global_input_component_or_scalar: Ambiguous inside RegularAggregation + (e.g., ``DS_1[calc Me_2 := Me_1 + X]`` where X could be a component + of DS_1 or an external scalar). Semantic error 1-1-6-11 is raised + at runtime if it collides with a component name. + persistent: Outputs written with ``<-`` (persistent assignment). + all_outputs: All variables produced by the script (both ``:=`` and + ``<-`` assignments). """ insertion: Dict[int, List[str]] = field(default_factory=dict) From d2f67a1ec5da3ddab4ebb6803942eccaf3624668 Mon Sep 17 00:00:00 2001 From: Javier Hernandez Date: Thu, 5 Mar 2026 16:40:26 +0100 Subject: [PATCH 08/11] refactor: rename DatasetSchedule to Schedule --- src/vtlengine/AST/DAG/__init__.py | 8 ++++---- src/vtlengine/AST/DAG/_models.py | 2 +- src/vtlengine/Interpreter/__init__.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/vtlengine/AST/DAG/__init__.py b/src/vtlengine/AST/DAG/__init__.py index 0905b1954..f5a4b3d6a 100644 --- a/src/vtlengine/AST/DAG/__init__.py +++ b/src/vtlengine/AST/DAG/__init__.py @@ -37,7 +37,7 @@ VarID, ) from vtlengine.AST.ASTTemplate import ASTTemplate -from vtlengine.AST.DAG._models import DatasetSchedule, StatementDeps +from vtlengine.AST.DAG._models import Schedule, StatementDeps from vtlengine.AST.Grammar.tokens import AS, DROP, KEEP, MEMBERSHIP, RENAME, TO from vtlengine.Exceptions import SemanticError from vtlengine.Model import Component, Dataset @@ -67,12 +67,12 @@ class DAGAnalyzer(ASTTemplate): _resolved_from_unknown: Set[str] = field(default_factory=set) @classmethod - def ds_structure(cls, ast: AST) -> DatasetSchedule: + def ds_structure(cls, ast: AST) -> Schedule: dag = cls() dag.visit(ast) return dag._ds_usage_analysis() - def _ds_usage_analysis(self) -> DatasetSchedule: + def _ds_usage_analysis(self) -> Schedule: """Analyze dataset dependencies to build insertion/deletion schedules.""" deletion: Dict[int, List[str]] = defaultdict(list) insertion: Dict[int, List[str]] = defaultdict(list) @@ -110,7 +110,7 @@ def _ds_usage_analysis(self) -> DatasetSchedule: classification = self._classify_global_inputs(all_outputs, global_inputs, global_set) - return DatasetSchedule( + return Schedule( insertion=dict(insertion), deletion=dict(deletion), global_inputs=classification["global_inputs"], diff --git a/src/vtlengine/AST/DAG/_models.py b/src/vtlengine/AST/DAG/_models.py index 17a31f180..4c196006f 100644 --- a/src/vtlengine/AST/DAG/_models.py +++ b/src/vtlengine/AST/DAG/_models.py @@ -25,7 +25,7 @@ class StatementDeps: @dataclass -class DatasetSchedule: +class Schedule: """Typed result of DAG dataset usage analysis. Tracks when datasets should be loaded/unloaded for memory-efficient execution, diff --git a/src/vtlengine/Interpreter/__init__.py b/src/vtlengine/Interpreter/__init__.py index 8cdd19888..7381d2a98 100644 --- a/src/vtlengine/Interpreter/__init__.py +++ b/src/vtlengine/Interpreter/__init__.py @@ -11,7 +11,7 @@ import vtlengine.Operators as Operators from vtlengine.AST.ASTTemplate import ASTTemplate from vtlengine.AST.DAG import HRDAGAnalyzer -from vtlengine.AST.DAG._models import DatasetSchedule +from vtlengine.AST.DAG._models import Schedule from vtlengine.AST.Grammar.tokens import ( AGGREGATE, ALL, @@ -115,7 +115,7 @@ class InterpreterAnalyzer(ASTTemplate): # Analysis mode only_semantic: bool = False # Memory efficient - ds_analysis: Optional[DatasetSchedule] = None + ds_analysis: Optional[Schedule] = None datapoints_paths: Optional[Dict[str, Path]] = None output_path: Optional[Union[str, Path]] = None # Time Period Representation From 1b7f94e6b59e69997e500d28fc665d0c424512aa Mon Sep 17 00:00:00 2001 From: Javier Hernandez Date: Thu, 5 Mar 2026 18:27:51 +0100 Subject: [PATCH 09/11] feat: classify dataset vs scalar inputs in DAG Schedule (#571) Extend DAG analysis to discriminate between datasets, scalars, dataset-or-scalar, and component-or-scalar global inputs. Each dataset-only operator now explicitly marks its direct VarID operands as dataset_inputs, so sub-expression operands (e.g., SC_1 in count(DS_1 + SC_1)) are correctly classified as ambiguous rather than forced to datasets. Split test_dag.py into test_classification.py (154 cases) and test_scheduling.py (13 non-trivial cases), inline VTL for simple tests, and delete unused reference/VTL files. --- src/vtlengine/AST/DAG/__init__.py | 141 +++- src/vtlengine/AST/DAG/_models.py | 7 + tests/DAG/data/references/1.json | 18 - tests/DAG/data/references/12.json | 30 - tests/DAG/data/references/14.json | 29 - tests/DAG/data/references/15.json | 29 - tests/DAG/data/references/17.json | 23 - tests/DAG/data/references/4.json | 27 - .../data/references/{ => scheduling}/10.json | 38 - .../data/references/{ => scheduling}/11.json | 5 - .../data/references/{ => scheduling}/13.json | 17 - .../data/references/{ => scheduling}/16.json | 11 - .../data/references/{ => scheduling}/2.json | 9 - .../data/references/{ => scheduling}/3.json | 22 - tests/DAG/data/references/scheduling/35.json | 26 + tests/DAG/data/references/scheduling/36.json | 24 + .../data/references/{ => scheduling}/5.json | 22 - .../data/references/{ => scheduling}/6.json | 30 - .../data/references/{ => scheduling}/7.json | 126 ---- .../data/references/{ => scheduling}/8.json | 38 - .../data/references/{ => scheduling}/9.json | 52 -- tests/DAG/data/vtl/12.vtl | 1 - tests/DAG/data/vtl/14.vtl | 1 - tests/DAG/data/vtl/15.vtl | 2 - tests/DAG/data/vtl/17.vtl | 1 - tests/DAG/data/vtl/35.vtl | 3 + tests/DAG/data/vtl/36.vtl | 3 + tests/DAG/test_classification.py | 657 ++++++++++++++++++ tests/DAG/test_dag.py | 57 -- tests/DAG/test_scheduling.py | 46 ++ 30 files changed, 893 insertions(+), 602 deletions(-) delete mode 100644 tests/DAG/data/references/1.json delete mode 100644 tests/DAG/data/references/12.json delete mode 100644 tests/DAG/data/references/14.json delete mode 100644 tests/DAG/data/references/15.json delete mode 100644 tests/DAG/data/references/17.json delete mode 100644 tests/DAG/data/references/4.json rename tests/DAG/data/references/{ => scheduling}/10.json (63%) rename tests/DAG/data/references/{ => scheduling}/11.json (51%) rename tests/DAG/data/references/{ => scheduling}/13.json (52%) rename tests/DAG/data/references/{ => scheduling}/16.json (53%) rename tests/DAG/data/references/{ => scheduling}/2.json (66%) rename tests/DAG/data/references/{ => scheduling}/3.json (54%) create mode 100644 tests/DAG/data/references/scheduling/35.json create mode 100644 tests/DAG/data/references/scheduling/36.json rename tests/DAG/data/references/{ => scheduling}/5.json (59%) rename tests/DAG/data/references/{ => scheduling}/6.json (63%) rename tests/DAG/data/references/{ => scheduling}/7.json (79%) rename tests/DAG/data/references/{ => scheduling}/8.json (72%) rename tests/DAG/data/references/{ => scheduling}/9.json (71%) delete mode 100644 tests/DAG/data/vtl/12.vtl delete mode 100644 tests/DAG/data/vtl/14.vtl delete mode 100644 tests/DAG/data/vtl/15.vtl delete mode 100644 tests/DAG/data/vtl/17.vtl create mode 100644 tests/DAG/data/vtl/35.vtl create mode 100644 tests/DAG/data/vtl/36.vtl create mode 100644 tests/DAG/test_classification.py delete mode 100644 tests/DAG/test_dag.py create mode 100644 tests/DAG/test_scheduling.py diff --git a/src/vtlengine/AST/DAG/__init__.py b/src/vtlengine/AST/DAG/__init__.py index f5a4b3d6a..1d49a4ecf 100644 --- a/src/vtlengine/AST/DAG/__init__.py +++ b/src/vtlengine/AST/DAG/__init__.py @@ -28,20 +28,52 @@ HRuleset, Identifier, JoinOp, + MulOp, Operator, ParamOp, PersistentAssignment, RegularAggregation, Start, + TimeAggregation, UDOCall, + UnaryOp, + Validation, VarID, ) from vtlengine.AST.ASTTemplate import ASTTemplate from vtlengine.AST.DAG._models import Schedule, StatementDeps -from vtlengine.AST.Grammar.tokens import AS, DROP, KEEP, MEMBERSHIP, RENAME, TO +from vtlengine.AST.Grammar.tokens import ( + AS, + DROP, + EXISTS_IN, + FILL_TIME_SERIES, + FLOW_TO_STOCK, + INTERSECT, + KEEP, + MEMBERSHIP, + PERIOD_INDICATOR, + RENAME, + SETDIFF, + STOCK_TO_FLOW, + SYMDIFF, + TIMESHIFT, + TO, + UNION, +) +from vtlengine.DataTypes import COMP_NAME_MAPPING from vtlengine.Exceptions import SemanticError from vtlengine.Model import Component, Dataset +# Operators that only accept datasets as input (never scalars). +# Grouped by AST node type for clarity. +DATASET_ONLY_MULOP = frozenset({UNION, INTERSECT, SETDIFF, SYMDIFF, EXISTS_IN}) +DATASET_ONLY_UNARYOP = frozenset({FLOW_TO_STOCK, STOCK_TO_FLOW, PERIOD_INDICATOR}) +DATASET_ONLY_BINOP = frozenset({MEMBERSHIP, TIMESHIFT}) + +# Reserved component names generated by the engine (e.g., count() → int_var, +# check() → bool_var). These are never external inputs. +RESERVED_COMPONENT_NAMES = frozenset(COMP_NAME_MAPPING.values()) + @dataclass class DAGAnalyzer(ASTTemplate): @@ -65,6 +97,8 @@ class DAGAnalyzer(ASTTemplate): unknown_variables: Set[str] = field(default_factory=set) # Outputs that were consumed via unknown_variables (RegularAggregation context) _resolved_from_unknown: Set[str] = field(default_factory=set) + # UDO names that have at least one dataset-typed parameter + _udos_with_dataset_params: Set[str] = field(default_factory=set) @classmethod def ds_structure(cls, ast: AST) -> Schedule: @@ -131,11 +165,15 @@ def _classify_global_inputs( """Classify global inputs into datasets, scalars, and ambiguous categories.""" scalar_outputs, comp_or_scalar = self._compute_scalar_outputs(all_outputs) - # Identify definite dataset inputs + # Identify definite dataset and scalar inputs definite_dataset_inputs: Set[str] = set() + definite_scalar_inputs: Set[str] = set() for statement in self.dependencies.values(): + # Collect explicitly typed scalar inputs (e.g., UDO scalar params) + for inp in statement.scalar_inputs: + definite_scalar_inputs.add(inp) if statement.has_dataset_op: - for inp in statement.inputs: + for inp in statement.dataset_inputs: definite_dataset_inputs.add(inp) # Include component_or_scalar candidates in global_inputs @@ -158,7 +196,9 @@ def _classify_global_inputs( result["global_input_component_or_scalar"].append(name) elif name in definite_dataset_inputs: result["global_input_datasets"].append(name) - elif self._feeds_only_scalar_chains(name, scalar_outputs): + elif name in definite_scalar_inputs or self._feeds_only_scalar_chains( + name, scalar_outputs + ): result["global_input_scalars"].append(name) else: result["global_input_dataset_or_scalar"].append(name) @@ -317,6 +357,8 @@ def statement_structure(self) -> StatementDeps: persistent=list(self.current_deps.persistent), unknown_variables=list(self.current_deps.unknown_variables), has_dataset_op=self._current_has_dataset_op, + dataset_inputs=list(self.current_deps.dataset_inputs), + scalar_inputs=list(self.current_deps.scalar_inputs), ) self.unknown_variables.update(self.current_deps.unknown_variables) return result @@ -338,10 +380,16 @@ def visit_Start(self, node: Start) -> None: self.visit(child) """ udos = {} + udos_with_ds: Set[str] = set() for ast_element in node.children: if isinstance(ast_element, Operator): udos[ast_element.op] = ast_element + for p in ast_element.parameters: + if isinstance(p.type_, Dataset): + udos_with_ds.add(ast_element.op) + break self.udos = udos + self._udos_with_dataset_params = udos_with_ds for child in node.children: if isinstance(child, (Assignment, PersistentAssignment)): self.is_first_assignment = True @@ -380,6 +428,8 @@ def visit_PersistentAssignment(self, node: PersistentAssignment) -> None: def visit_RegularAggregation(self, node: RegularAggregation) -> None: self._current_has_dataset_op = True + if isinstance(node.dataset, VarID) and node.dataset.value not in self.alias: + self.current_deps.dataset_inputs.append(node.dataset.value) self.visit(node.dataset) if node.op in [KEEP, DROP, RENAME]: return @@ -389,8 +439,10 @@ def visit_RegularAggregation(self, node: RegularAggregation) -> None: self.is_from_regular_aggregation = False def visit_BinOp(self, node: BinOp) -> None: - if node.op == MEMBERSHIP: + if node.op in DATASET_ONLY_BINOP: self._current_has_dataset_op = True + if isinstance(node.left, VarID) and node.left.value not in self.alias: + self.current_deps.dataset_inputs.append(node.left.value) self.is_dataset = True self.visit(node.left) self.is_dataset = False @@ -402,6 +454,22 @@ def visit_BinOp(self, node: BinOp) -> None: self.visit(node.left) self.visit(node.right) + def visit_MulOp(self, node: MulOp) -> None: + if node.op in DATASET_ONLY_MULOP: + self._current_has_dataset_op = True + for child in node.children: + if isinstance(child, VarID) and child.value not in self.alias: + self.current_deps.dataset_inputs.append(child.value) + for child in node.children: + self.visit(child) + + def visit_UnaryOp(self, node: UnaryOp) -> None: + if node.op in DATASET_ONLY_UNARYOP: + self._current_has_dataset_op = True + if isinstance(node.operand, VarID) and node.operand.value not in self.alias: + self.current_deps.dataset_inputs.append(node.operand.value) + self.visit(node.operand) + def visit_VarID(self, node: VarID) -> None: if ( not self.is_from_regular_aggregation or self.is_dataset @@ -412,6 +480,7 @@ def visit_VarID(self, node: VarID) -> None: self.is_from_regular_aggregation and node.value not in self.alias and not self.is_dataset + and node.value not in RESERVED_COMPONENT_NAMES and node.value not in self.current_deps.unknown_variables ): self.current_deps.unknown_variables.append(node.value) @@ -424,32 +493,56 @@ def visit_Identifier(self, node: Identifier) -> None: ): self._current_has_dataset_op = True self.current_deps.inputs.append(node.value) + self.current_deps.dataset_inputs.append(node.value) def visit_ParamOp(self, node: ParamOp) -> None: if self.udos and node.op in self.udos: + if node.op in self._udos_with_dataset_params: + self._current_has_dataset_op = True do_ast: Operator = self.udos[node.op] - - for arg in node.params: - index_arg = node.params.index(arg) - if do_ast.parameters[index_arg].type_.kind == "DataSet": - self._current_has_dataset_op = True - self.visit(arg) + for i, arg in enumerate(node.params): + if isinstance(arg, Constant): + continue + param_type = do_ast.parameters[i].type_ + if type(param_type) is not Component: + is_ds = isinstance(param_type, Dataset) + if is_ds and isinstance(arg, VarID): + self.current_deps.dataset_inputs.append(arg.value) + elif not is_ds and isinstance(arg, VarID): + self.current_deps.scalar_inputs.append(arg.value) + self.visit(arg) else: + if node.op == FILL_TIME_SERIES: + self._current_has_dataset_op = True + for child in node.children: + if isinstance(child, VarID) and child.value not in self.alias: + self.current_deps.dataset_inputs.append(child.value) super(DAGAnalyzer, self).visit_ParamOp(node) def visit_Aggregation(self, node: Aggregation) -> None: self._current_has_dataset_op = True if node.operand is not None: + if isinstance(node.operand, VarID) and node.operand.value not in self.alias: + self.current_deps.dataset_inputs.append(node.operand.value) self.visit(node.operand) def visit_Analytic(self, node: Analytic) -> None: self._current_has_dataset_op = True - if node.operand is not None: + # Inside RegularAggregation (calc/filter/etc.), analytic operands are always + # component references — they cannot be external scalars, so skip them. + if node.operand is not None and not self.is_from_regular_aggregation: self.visit(node.operand) def visit_JoinOp(self, node: JoinOp) -> None: self._current_has_dataset_op = True + # Join clauses contain BinOp(AS) nodes; dataset aliases are handled via visit_BinOp. + # Direct VarID children in clauses are dataset references. for clause in node.clauses: + if isinstance(clause, BinOp) and clause.op == AS: + if isinstance(clause.left, VarID) and clause.left.value not in self.alias: + self.current_deps.dataset_inputs.append(clause.left.value) + elif isinstance(clause, VarID) and clause.value not in self.alias: + self.current_deps.dataset_inputs.append(clause.value) self.visit(clause) def visit_UDOCall(self, node: UDOCall) -> None: @@ -457,24 +550,44 @@ def visit_UDOCall(self, node: UDOCall) -> None: if not node_args: super().visit_UDOCall(node) else: + if node.op in self._udos_with_dataset_params: + self._current_has_dataset_op = True for p, param in zip(node_args.parameters, node.params): if isinstance(param, Constant): continue if type(p.type_) is not Component: - if isinstance(p.type_, Dataset): - self._current_has_dataset_op = True + is_ds = isinstance(p.type_, Dataset) + if is_ds and isinstance(param, VarID): + self.current_deps.dataset_inputs.append(param.value) + elif not is_ds and isinstance(param, VarID): + self.current_deps.scalar_inputs.append(param.value) self.visit(param) def visit_HROperation(self, node: HROperation) -> None: """Visit HROperation node for dependency analysis.""" self._current_has_dataset_op = True + if isinstance(node.dataset, VarID) and node.dataset.value not in self.alias: + self.current_deps.dataset_inputs.append(node.dataset.value) self.visit(node.dataset) def visit_DPValidation(self, node: DPValidation) -> None: """Visit DPValidation node for dependency analysis.""" self._current_has_dataset_op = True + if isinstance(node.dataset, VarID) and node.dataset.value not in self.alias: + self.current_deps.dataset_inputs.append(node.dataset.value) self.visit(node.dataset) + def visit_TimeAggregation(self, node: TimeAggregation) -> None: + if node.operand is not None: + self.visit(node.operand) + + def visit_Validation(self, node: Validation) -> None: + # Don't force has_dataset_op here — let the inner expression determine it. + # Dataset-only operators (exists_in, membership, etc.) set it themselves. + self.visit(node.validation) + if node.imbalance is not None: + self.visit(node.imbalance) + class HRDAGAnalyzer(DAGAnalyzer): def visit_HRuleset(self, node: HRuleset) -> None: diff --git a/src/vtlengine/AST/DAG/_models.py b/src/vtlengine/AST/DAG/_models.py index 4c196006f..bd58f00d3 100644 --- a/src/vtlengine/AST/DAG/_models.py +++ b/src/vtlengine/AST/DAG/_models.py @@ -15,6 +15,11 @@ class StatementDeps: has_dataset_op: Whether this statement involves a dataset operation (RegularAggregation, JoinOp, Aggregation, Analytic, MEMBERSHIP, UDO with dataset params, etc.). + dataset_inputs: Subset of inputs that are definitively datasets + (e.g., UDO params typed as dataset). Empty means all inputs in a + ``has_dataset_op`` statement are considered dataset inputs. + scalar_inputs: Subset of inputs that are definitively scalars + (e.g., UDO params typed as a scalar type like number, string, etc.). """ inputs: List[str] = field(default_factory=list) @@ -22,6 +27,8 @@ class StatementDeps: persistent: List[str] = field(default_factory=list) unknown_variables: List[str] = field(default_factory=list) has_dataset_op: bool = False + dataset_inputs: List[str] = field(default_factory=list) + scalar_inputs: List[str] = field(default_factory=list) @dataclass diff --git a/tests/DAG/data/references/1.json b/tests/DAG/data/references/1.json deleted file mode 100644 index 42a811fce..000000000 --- a/tests/DAG/data/references/1.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "insertion": {}, - "deletion": { - "3": [ - "a", - "b", - "c" - ] - }, - "global_inputs": [], - "global_input_datasets": [], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [], - "persistent": [ - "c" - ] -} \ No newline at end of file diff --git a/tests/DAG/data/references/12.json b/tests/DAG/data/references/12.json deleted file mode 100644 index 53d80fe5f..000000000 --- a/tests/DAG/data/references/12.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "insertion": { - "1": [ - "DS_1" - ] - }, - "deletion": { - "1": [ - "DS_1", - "DS_r" - ] - }, - "global_inputs": [ - "DS_1", - "Me_1", - "SC_1" - ], - "global_input_datasets": [ - "DS_1" - ], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [ - "Me_1", - "SC_1" - ], - "persistent": [ - "DS_r" - ] -} \ No newline at end of file diff --git a/tests/DAG/data/references/14.json b/tests/DAG/data/references/14.json deleted file mode 100644 index 34f8edeee..000000000 --- a/tests/DAG/data/references/14.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "insertion": { - "1": [ - "DS_1", - "DS_2" - ] - }, - "deletion": { - "1": [ - "DS_1", - "DS_2", - "DS_r" - ] - }, - "global_inputs": [ - "DS_1", - "DS_2" - ], - "global_input_datasets": [], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [ - "DS_1", - "DS_2" - ], - "global_input_component_or_scalar": [], - "persistent": [ - "DS_r" - ] -} \ No newline at end of file diff --git a/tests/DAG/data/references/15.json b/tests/DAG/data/references/15.json deleted file mode 100644 index f7cb442d6..000000000 --- a/tests/DAG/data/references/15.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "insertion": { - "2": [ - "DS_1" - ] - }, - "deletion": { - "2": [ - "DS_1", - "DS_r", - "SC_r" - ] - }, - "global_inputs": [ - "DS_1", - "Me_1" - ], - "global_input_datasets": [ - "DS_1" - ], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [ - "Me_1" - ], - "persistent": [ - "DS_r" - ] -} \ No newline at end of file diff --git a/tests/DAG/data/references/17.json b/tests/DAG/data/references/17.json deleted file mode 100644 index 9696fa802..000000000 --- a/tests/DAG/data/references/17.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "insertion": { - "1": [ - "DS_1" - ] - }, - "deletion": { - "1": [ - "DS_1", - "DS_r" - ] - }, - "global_inputs": [ - "DS_1" - ], - "global_input_datasets": [ - "DS_1" - ], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [], - "persistent": [] -} \ No newline at end of file diff --git a/tests/DAG/data/references/4.json b/tests/DAG/data/references/4.json deleted file mode 100644 index e84302eaa..000000000 --- a/tests/DAG/data/references/4.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "insertion": { - "1": [ - "DS_1", - "DS_2" - ] - }, - "deletion": { - "1": [ - "DS_1", - "DS_2", - "DS_r" - ] - }, - "global_inputs": [ - "DS_1", - "DS_2" - ], - "global_input_datasets": [ - "DS_1", - "DS_2" - ], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [], - "persistent": [] -} diff --git a/tests/DAG/data/references/10.json b/tests/DAG/data/references/scheduling/10.json similarity index 63% rename from tests/DAG/data/references/10.json rename to tests/DAG/data/references/scheduling/10.json index 74b3870eb..bf970b66c 100644 --- a/tests/DAG/data/references/10.json +++ b/tests/DAG/data/references/scheduling/10.json @@ -64,43 +64,5 @@ "BOP" ] }, - "global_inputs": [ - "ACCOUNTING_ENTRY", - "ADJUSTMENT", - "BOP", - "COMP_METHOD", - "COUNTERPART_SECTOR", - "CURRENCY_DENOM", - "FLOW_STOCK_ENTRY", - "FREQ", - "FUNCTIONAL_CAT", - "INSTR_ASSET", - "INT_ACC_ITEM", - "MATURITY", - "REF_SECTOR", - "VALUATION", - "imbalance" - ], - "global_input_datasets": [ - "BOP" - ], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [ - "ACCOUNTING_ENTRY", - "ADJUSTMENT", - "COMP_METHOD", - "COUNTERPART_SECTOR", - "CURRENCY_DENOM", - "FLOW_STOCK_ENTRY", - "FREQ", - "FUNCTIONAL_CAT", - "INSTR_ASSET", - "INT_ACC_ITEM", - "MATURITY", - "REF_SECTOR", - "VALUATION", - "imbalance" - ], "persistent": [] } \ No newline at end of file diff --git a/tests/DAG/data/references/11.json b/tests/DAG/data/references/scheduling/11.json similarity index 51% rename from tests/DAG/data/references/11.json rename to tests/DAG/data/references/scheduling/11.json index 53e6ccfef..76743b78b 100644 --- a/tests/DAG/data/references/11.json +++ b/tests/DAG/data/references/scheduling/11.json @@ -9,11 +9,6 @@ "c" ] }, - "global_inputs": [], - "global_input_datasets": [], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [], "persistent": [ "c" ] diff --git a/tests/DAG/data/references/13.json b/tests/DAG/data/references/scheduling/13.json similarity index 52% rename from tests/DAG/data/references/13.json rename to tests/DAG/data/references/scheduling/13.json index 12ee853e3..082d06b0d 100644 --- a/tests/DAG/data/references/13.json +++ b/tests/DAG/data/references/scheduling/13.json @@ -19,23 +19,6 @@ "SC_2" ] }, - "global_inputs": [ - "DS_1", - "Me_1", - "SC_1", - "SC_2" - ], - "global_input_datasets": [ - "DS_1" - ], - "global_input_scalars": [ - "SC_1", - "SC_2" - ], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [ - "Me_1" - ], "persistent": [ "DS_r" ] diff --git a/tests/DAG/data/references/16.json b/tests/DAG/data/references/scheduling/16.json similarity index 53% rename from tests/DAG/data/references/16.json rename to tests/DAG/data/references/scheduling/16.json index c94e32353..741f492cb 100644 --- a/tests/DAG/data/references/16.json +++ b/tests/DAG/data/references/scheduling/16.json @@ -15,17 +15,6 @@ "SC_r" ] }, - "global_inputs": [ - "DS_1", - "SC_1" - ], - "global_input_datasets": [], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [ - "DS_1", - "SC_1" - ], - "global_input_component_or_scalar": [], "persistent": [ "DS_r" ] diff --git a/tests/DAG/data/references/2.json b/tests/DAG/data/references/scheduling/2.json similarity index 66% rename from tests/DAG/data/references/2.json rename to tests/DAG/data/references/scheduling/2.json index 0e8dae7c7..902fd2827 100644 --- a/tests/DAG/data/references/2.json +++ b/tests/DAG/data/references/scheduling/2.json @@ -22,15 +22,6 @@ "A" ] }, - "global_inputs": [ - "A" - ], - "global_input_datasets": [], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [ - "A" - ], - "global_input_component_or_scalar": [], "persistent": [ "a", "b", diff --git a/tests/DAG/data/references/3.json b/tests/DAG/data/references/scheduling/3.json similarity index 54% rename from tests/DAG/data/references/3.json rename to tests/DAG/data/references/scheduling/3.json index 63e131dec..2099a26ea 100644 --- a/tests/DAG/data/references/3.json +++ b/tests/DAG/data/references/scheduling/3.json @@ -29,28 +29,6 @@ "A" ] }, - "global_inputs": [ - "A", - "A2", - "var1", - "var3", - "varF", - "varRel", - "varRel2" - ], - "global_input_datasets": [ - "A", - "A2" - ], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [ - "var1", - "var3", - "varF", - "varRel", - "varRel2" - ], "persistent": [ "F" ] diff --git a/tests/DAG/data/references/scheduling/35.json b/tests/DAG/data/references/scheduling/35.json new file mode 100644 index 000000000..931078719 --- /dev/null +++ b/tests/DAG/data/references/scheduling/35.json @@ -0,0 +1,26 @@ +{ + "insertion": { + "1": [ + "DS_1", + "DS_2" + ], + "2": [ + "DS_3" + ] + }, + "deletion": { + "3": [ + "DS_A", + "DS_B", + "DS_r" + ], + "2": [ + "DS_1", + "DS_3" + ], + "1": [ + "DS_2" + ] + }, + "persistent": [] +} \ No newline at end of file diff --git a/tests/DAG/data/references/scheduling/36.json b/tests/DAG/data/references/scheduling/36.json new file mode 100644 index 000000000..4df60c70b --- /dev/null +++ b/tests/DAG/data/references/scheduling/36.json @@ -0,0 +1,24 @@ +{ + "insertion": { + "2": [ + "SC_1" + ], + "3": [ + "DS_1" + ] + }, + "deletion": { + "2": [ + "SC_1", + "SC_a" + ], + "3": [ + "DS_1", + "DS_r", + "SC_b" + ] + }, + "persistent": [ + "DS_r" + ] +} \ No newline at end of file diff --git a/tests/DAG/data/references/5.json b/tests/DAG/data/references/scheduling/5.json similarity index 59% rename from tests/DAG/data/references/5.json rename to tests/DAG/data/references/scheduling/5.json index 0fad1b8ad..6e3d061ee 100644 --- a/tests/DAG/data/references/5.json +++ b/tests/DAG/data/references/scheduling/5.json @@ -29,28 +29,6 @@ "DSD_POP" ] }, - "global_inputs": [ - "AGE", - "DSD_AGR", - "DSD_POP", - "MEASURE", - "SEX", - "TIME_HORIZ", - "UNIT_MEASURE" - ], - "global_input_datasets": [ - "DSD_AGR", - "DSD_POP" - ], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [ - "AGE", - "MEASURE", - "SEX", - "TIME_HORIZ", - "UNIT_MEASURE" - ], "persistent": [ "DS_check_agr", "DS_check_countries", diff --git a/tests/DAG/data/references/6.json b/tests/DAG/data/references/scheduling/6.json similarity index 63% rename from tests/DAG/data/references/6.json rename to tests/DAG/data/references/scheduling/6.json index aac54e973..b8672add0 100644 --- a/tests/DAG/data/references/6.json +++ b/tests/DAG/data/references/scheduling/6.json @@ -54,36 +54,6 @@ "DS2" ] }, - "global_inputs": [ - "BIS_LOC_STATS", - "CURRENCY", - "CURRENCY_DENOM", - "DS1", - "DS2", - "DS3", - "EXCHANGE_RATE", - "EXR_SUFFIX", - "EXR_TYPE", - "FREQ", - "OBS_VALUE" - ], - "global_input_datasets": [ - "BIS_LOC_STATS", - "DS1", - "DS2", - "DS3" - ], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [ - "CURRENCY", - "CURRENCY_DENOM", - "EXCHANGE_RATE", - "EXR_SUFFIX", - "EXR_TYPE", - "FREQ", - "OBS_VALUE" - ], "persistent": [ "numCouYear", "numYearCou", diff --git a/tests/DAG/data/references/7.json b/tests/DAG/data/references/scheduling/7.json similarity index 79% rename from tests/DAG/data/references/7.json rename to tests/DAG/data/references/scheduling/7.json index 757fa6dac..075c98c33 100644 --- a/tests/DAG/data/references/7.json +++ b/tests/DAG/data/references/scheduling/7.json @@ -506,132 +506,6 @@ "ANCRDT_ACCNTNG_C" ] }, - "global_inputs": [ - "ACCMLTD_WRTFFS", - "ANCRDT_ACCNTNG_C", - "ANCRDT_ACCNTNG_C_Z", - "ANCRDT_DRGTN_QRTR_CR_OA", - "ANCRDT_ENTTY", - "ANCRDT_ENTTY_DFLT_C", - "ANCRDT_ENTTY_DFLT_C_T1", - "ANCRDT_ENTTY_INSTRMNT_C", - "ANCRDT_ENTTY_RSK_C", - "ANCRDT_FNNCL_C", - "ANCRDT_FNNCL_C_T1", - "ANCRDT_INSTRMNT_C", - "ANCRDT_INSTRMNT_C_T1", - "ANCRDT_INSTRMNT_PRTCTN_RCVD_C", - "ANCRDT_JNT_LBLTS_C", - "ANCRDT_PRTCTN_RCVD_C", - "ANCRDT_PRTCTN_RCVD_C_T1", - "CC0010", - "CNTRY", - "CRDTR", - "CRDTR_CD", - "DBTR_CD", - "DFLT_STTS", - "DT_BRTH", - "DT_INCPTN", - "DT_RFRNC", - "ENTTY_RIAD_CD", - "ENTTY_RL", - "FRGN_BRNCH", - "HD_OFFC_UNDRT_CD", - "HD_OFFC_UNDRT_CNTRY", - "HD_QRTR_CD_CRDTR", - "HD_QRTR_CD_DBTR", - "IMMDT_PRNT_UNDRT_CD", - "INSTTTNL_SCTR", - "INSTTTNL_SCTR_DTL", - "IS_PRTCTN_PRVDR", - "LGL_FRM", - "OBSRVD_AGNT_CD", - "OFF_BLNC_SHT_AMNT", - "OTHR_TYP_ENTTY", - "OTSTNDNG_NMNL_AMNT", - "PRTCTN_ALLCTD_VL", - "PRTCTN_PRVDR_CD", - "RCGNTN_STTS", - "RCRS", - "SPFUND", - "SRVCR", - "SSMSIGNIFICANCE", - "THRD_PRTY_PRRTY_CLMS", - "TRD_RCVBL_NN_RCRS", - "TTL_NMBR_DBTRS", - "TTL_NMBR_DFLT_DBTRS", - "TYP_INSTRMNT", - "TYP_PRTCTN", - "TYP_SCRTSTN", - "ULTMT_PRNT_UNDRT_CD", - "bool_var", - "int_var" - ], - "global_input_datasets": [ - "ANCRDT_ACCNTNG_C", - "ANCRDT_ACCNTNG_C_Z", - "ANCRDT_ENTTY", - "ANCRDT_ENTTY_DFLT_C", - "ANCRDT_ENTTY_DFLT_C_T1", - "ANCRDT_ENTTY_INSTRMNT_C", - "ANCRDT_ENTTY_RSK_C", - "ANCRDT_FNNCL_C", - "ANCRDT_FNNCL_C_T1", - "ANCRDT_INSTRMNT_C", - "ANCRDT_INSTRMNT_C_T1", - "ANCRDT_INSTRMNT_PRTCTN_RCVD_C", - "ANCRDT_JNT_LBLTS_C", - "ANCRDT_PRTCTN_RCVD_C", - "ANCRDT_PRTCTN_RCVD_C_T1" - ], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [ - "ACCMLTD_WRTFFS", - "ANCRDT_DRGTN_QRTR_CR_OA", - "CC0010", - "CNTRY", - "CRDTR", - "CRDTR_CD", - "DBTR_CD", - "DFLT_STTS", - "DT_BRTH", - "DT_INCPTN", - "DT_RFRNC", - "ENTTY_RIAD_CD", - "ENTTY_RL", - "FRGN_BRNCH", - "HD_OFFC_UNDRT_CD", - "HD_OFFC_UNDRT_CNTRY", - "HD_QRTR_CD_CRDTR", - "HD_QRTR_CD_DBTR", - "IMMDT_PRNT_UNDRT_CD", - "INSTTTNL_SCTR", - "INSTTTNL_SCTR_DTL", - "IS_PRTCTN_PRVDR", - "LGL_FRM", - "OBSRVD_AGNT_CD", - "OFF_BLNC_SHT_AMNT", - "OTHR_TYP_ENTTY", - "OTSTNDNG_NMNL_AMNT", - "PRTCTN_ALLCTD_VL", - "PRTCTN_PRVDR_CD", - "RCGNTN_STTS", - "RCRS", - "SPFUND", - "SRVCR", - "SSMSIGNIFICANCE", - "THRD_PRTY_PRRTY_CLMS", - "TRD_RCVBL_NN_RCRS", - "TTL_NMBR_DBTRS", - "TTL_NMBR_DFLT_DBTRS", - "TYP_INSTRMNT", - "TYP_PRTCTN", - "TYP_SCRTSTN", - "ULTMT_PRNT_UNDRT_CD", - "bool_var", - "int_var" - ], "persistent": [ "ACCNTNG_CMPLTNSS", "CN0230", diff --git a/tests/DAG/data/references/8.json b/tests/DAG/data/references/scheduling/8.json similarity index 72% rename from tests/DAG/data/references/8.json rename to tests/DAG/data/references/scheduling/8.json index dacbdc15f..5f728af27 100644 --- a/tests/DAG/data/references/8.json +++ b/tests/DAG/data/references/scheduling/8.json @@ -98,44 +98,6 @@ "ANCRDT_INSTRMNT_C_T3" ] }, - "global_inputs": [ - "ANCRDT_ACCNTNG_C", - "ANCRDT_ACCNTNG_C_T3", - "ANCRDT_ENTTY", - "ANCRDT_ENTTY_DFLT_C", - "ANCRDT_ENTTY_INSTRMNT_C", - "ANCRDT_FNNCL_C", - "ANCRDT_INSTRMNT_C", - "ANCRDT_INSTRMNT_C_T1", - "ANCRDT_INSTRMNT_C_T2", - "ANCRDT_INSTRMNT_C_T3", - "ENTTY_RIAD_CD", - "ENTTY_RL", - "HD_OFFC_UNDRT_CD", - "LGL_ENTTY_CD", - "OBSRVD_AGNT_CD" - ], - "global_input_datasets": [ - "ANCRDT_ACCNTNG_C", - "ANCRDT_ACCNTNG_C_T3", - "ANCRDT_ENTTY", - "ANCRDT_ENTTY_DFLT_C", - "ANCRDT_ENTTY_INSTRMNT_C", - "ANCRDT_FNNCL_C", - "ANCRDT_INSTRMNT_C", - "ANCRDT_INSTRMNT_C_T1", - "ANCRDT_INSTRMNT_C_T2", - "ANCRDT_INSTRMNT_C_T3" - ], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [ - "ENTTY_RIAD_CD", - "ENTTY_RL", - "HD_OFFC_UNDRT_CD", - "LGL_ENTTY_CD", - "OBSRVD_AGNT_CD" - ], "persistent": [ "DP_RLST_ACCNTNG_FRMWRK_RSLT", "DP_RLST_ACCNTNG_INTRCMPNY", diff --git a/tests/DAG/data/references/9.json b/tests/DAG/data/references/scheduling/9.json similarity index 71% rename from tests/DAG/data/references/9.json rename to tests/DAG/data/references/scheduling/9.json index e92a92fc4..b487827b6 100644 --- a/tests/DAG/data/references/9.json +++ b/tests/DAG/data/references/scheduling/9.json @@ -128,58 +128,6 @@ "Oferta_PT_2025_Q1" ] }, - "global_inputs": [ - "Income_PT", - "Inflation_PT", - "Inflation_divisors_Q", - "Oferta_PT_2025_Q1", - "Vendas_PT_2025_Q1", - "coefficient", - "coefficient_cq", - "coefficient_inv", - "coefficient_l", - "coefficient_lc", - "coefficient_lcq", - "coefficient_lq", - "coefficient_q", - "county", - "divisor", - "estado", - "income", - "period_label", - "regiao", - "value", - "var", - "year_str" - ], - "global_input_datasets": [ - "Income_PT", - "Inflation_PT", - "Inflation_divisors_Q", - "Oferta_PT_2025_Q1", - "Vendas_PT_2025_Q1" - ], - "global_input_scalars": [], - "global_input_dataset_or_scalar": [], - "global_input_component_or_scalar": [ - "coefficient", - "coefficient_cq", - "coefficient_inv", - "coefficient_l", - "coefficient_lc", - "coefficient_lcq", - "coefficient_lq", - "coefficient_q", - "county", - "divisor", - "estado", - "income", - "period_label", - "regiao", - "value", - "var", - "year_str" - ], "persistent": [ "output_generic", "output_generic_eda", diff --git a/tests/DAG/data/vtl/12.vtl b/tests/DAG/data/vtl/12.vtl deleted file mode 100644 index f9ee4c4bd..000000000 --- a/tests/DAG/data/vtl/12.vtl +++ /dev/null @@ -1 +0,0 @@ -DS_r <- DS_1[calc Me_2 := Me_1 * SC_1]; diff --git a/tests/DAG/data/vtl/14.vtl b/tests/DAG/data/vtl/14.vtl deleted file mode 100644 index e19c38657..000000000 --- a/tests/DAG/data/vtl/14.vtl +++ /dev/null @@ -1 +0,0 @@ -DS_r <- DS_1 + DS_2; diff --git a/tests/DAG/data/vtl/15.vtl b/tests/DAG/data/vtl/15.vtl deleted file mode 100644 index b17fb76c0..000000000 --- a/tests/DAG/data/vtl/15.vtl +++ /dev/null @@ -1,2 +0,0 @@ -SC_r := 10; -DS_r <- DS_1[calc Me_2 := Me_1 + SC_r]; diff --git a/tests/DAG/data/vtl/17.vtl b/tests/DAG/data/vtl/17.vtl deleted file mode 100644 index 2e9470e04..000000000 --- a/tests/DAG/data/vtl/17.vtl +++ /dev/null @@ -1 +0,0 @@ -DS_r := DS_1#Me_1; diff --git a/tests/DAG/data/vtl/35.vtl b/tests/DAG/data/vtl/35.vtl new file mode 100644 index 000000000..d5a7f8116 --- /dev/null +++ b/tests/DAG/data/vtl/35.vtl @@ -0,0 +1,3 @@ +DS_A := DS_1 + DS_2; +DS_B := DS_1 * DS_3; +DS_r := DS_A + DS_B; diff --git a/tests/DAG/data/vtl/36.vtl b/tests/DAG/data/vtl/36.vtl new file mode 100644 index 000000000..2d6cb0181 --- /dev/null +++ b/tests/DAG/data/vtl/36.vtl @@ -0,0 +1,3 @@ +SC_a := 10; +SC_b := SC_a + SC_1; +DS_r <- DS_1[calc Me_2 := Me_1 + SC_b]; diff --git a/tests/DAG/test_classification.py b/tests/DAG/test_classification.py new file mode 100644 index 000000000..a31477db8 --- /dev/null +++ b/tests/DAG/test_classification.py @@ -0,0 +1,657 @@ +from dataclasses import dataclass, field +from pathlib import Path +from typing import List, Optional + +import pytest + +from vtlengine.API import create_ast +from vtlengine.AST.DAG import DAGAnalyzer + +data_path = Path(__file__).parent / "data" + + +@dataclass +class Classification: + datasets: List[str] = field(default_factory=list) + scalars: List[str] = field(default_factory=list) + dataset_or_scalar: List[str] = field(default_factory=list) + component_or_scalar: List[str] = field(default_factory=list) + vtl: Optional[str] = None + + +# Tests 1-11: vtl=None → read from VTL files (complex multi-statement scripts). +# Tests 12+: vtl is inline. +CASES: dict[str, Classification] = { + # --- File-based tests (complex multi-statement scripts) --- + "1": Classification(), + "2": Classification(dataset_or_scalar=["A"]), + "3": Classification( + datasets=["A", "A2"], + component_or_scalar=["var1", "var3", "varF", "varRel", "varRel2"], + ), + "4": Classification(datasets=["DS_1", "DS_2"]), + "5": Classification( + datasets=["DSD_AGR", "DSD_POP"], + component_or_scalar=["AGE", "MEASURE", "SEX", "TIME_HORIZ", "UNIT_MEASURE"], + ), + "6": Classification( + datasets=["BIS_LOC_STATS", "DS1", "DS2", "DS3"], + component_or_scalar=[ + "CURRENCY", + "CURRENCY_DENOM", + "EXCHANGE_RATE", + "EXR_SUFFIX", + "EXR_TYPE", + "FREQ", + "OBS_VALUE", + ], + ), + "7": Classification( + datasets=[ + "ANCRDT_ACCNTNG_C", + "ANCRDT_ACCNTNG_C_Z", + "ANCRDT_ENTTY", + "ANCRDT_ENTTY_DFLT_C", + "ANCRDT_ENTTY_DFLT_C_T1", + "ANCRDT_ENTTY_INSTRMNT_C", + "ANCRDT_ENTTY_RSK_C", + "ANCRDT_FNNCL_C", + "ANCRDT_FNNCL_C_T1", + "ANCRDT_INSTRMNT_C", + "ANCRDT_INSTRMNT_C_T1", + "ANCRDT_INSTRMNT_PRTCTN_RCVD_C", + "ANCRDT_JNT_LBLTS_C", + "ANCRDT_PRTCTN_RCVD_C", + "ANCRDT_PRTCTN_RCVD_C_T1", + ], + component_or_scalar=[ + "ACCMLTD_WRTFFS", + "ANCRDT_DRGTN_QRTR_CR_OA", + "CC0010", + "CNTRY", + "CRDTR", + "CRDTR_CD", + "DBTR_CD", + "DFLT_STTS", + "DT_BRTH", + "DT_INCPTN", + "DT_RFRNC", + "ENTTY_RIAD_CD", + "ENTTY_RL", + "FRGN_BRNCH", + "HD_OFFC_UNDRT_CD", + "HD_OFFC_UNDRT_CNTRY", + "HD_QRTR_CD_CRDTR", + "HD_QRTR_CD_DBTR", + "IMMDT_PRNT_UNDRT_CD", + "INSTTTNL_SCTR", + "INSTTTNL_SCTR_DTL", + "IS_PRTCTN_PRVDR", + "LGL_FRM", + "OBSRVD_AGNT_CD", + "OFF_BLNC_SHT_AMNT", + "OTHR_TYP_ENTTY", + "OTSTNDNG_NMNL_AMNT", + "PRTCTN_ALLCTD_VL", + "PRTCTN_PRVDR_CD", + "RCGNTN_STTS", + "RCRS", + "SPFUND", + "SRVCR", + "SSMSIGNIFICANCE", + "THRD_PRTY_PRRTY_CLMS", + "TRD_RCVBL_NN_RCRS", + "TTL_NMBR_DBTRS", + "TTL_NMBR_DFLT_DBTRS", + "TYP_INSTRMNT", + "TYP_PRTCTN", + "TYP_SCRTSTN", + "ULTMT_PRNT_UNDRT_CD", + ], + ), + "8": Classification( + datasets=[ + "ANCRDT_ACCNTNG_C", + "ANCRDT_ACCNTNG_C_T3", + "ANCRDT_ENTTY", + "ANCRDT_ENTTY_DFLT_C", + "ANCRDT_ENTTY_INSTRMNT_C", + "ANCRDT_FNNCL_C", + "ANCRDT_INSTRMNT_C", + "ANCRDT_INSTRMNT_C_T1", + "ANCRDT_INSTRMNT_C_T2", + "ANCRDT_INSTRMNT_C_T3", + ], + component_or_scalar=[ + "ENTTY_RIAD_CD", + "ENTTY_RL", + "HD_OFFC_UNDRT_CD", + "LGL_ENTTY_CD", + "OBSRVD_AGNT_CD", + ], + ), + "9": Classification( + datasets=[ + "Income_PT", + "Inflation_PT", + "Inflation_divisors_Q", + "Oferta_PT_2025_Q1", + "Vendas_PT_2025_Q1", + ], + component_or_scalar=[ + "coefficient", + "coefficient_cq", + "coefficient_inv", + "coefficient_lc", + "coefficient_lcq", + "coefficient_q", + "county", + "divisor", + "estado", + "income", + "period_label", + "regiao", + "value", + "var", + "year_str", + ], + ), + "10": Classification( + datasets=["BOP"], + component_or_scalar=[ + "ACCOUNTING_ENTRY", + "ADJUSTMENT", + "COMP_METHOD", + "COUNTERPART_SECTOR", + "CURRENCY_DENOM", + "FLOW_STOCK_ENTRY", + "FREQ", + "FUNCTIONAL_CAT", + "INSTR_ASSET", + "INT_ACC_ITEM", + "MATURITY", + "REF_SECTOR", + "VALUATION", + "imbalance", + ], + ), + "11": Classification(), + # --- Inline tests --- + # Calc with external component/scalar + "12": Classification( + vtl="DS_r <- DS_1[calc Me_2 := Me_1 * SC_1];", + datasets=["DS_1"], + component_or_scalar=["Me_1", "SC_1"], + ), + # Scalar chain with UDO + "13": Classification( + vtl="SC_a := SC_1 + SC_2;\nDS_r <- DS_1[calc Me_2 := Me_1 + SC_a];", + datasets=["DS_1"], + scalars=["SC_1", "SC_2"], + component_or_scalar=["Me_1"], + ), + # Dual binary op + "14": Classification( + vtl="DS_r <- DS_1 + DS_2;", + dataset_or_scalar=["DS_1", "DS_2"], + ), + # Scalar chain feeding calc + "15": Classification( + vtl="SC_r := 10;\nDS_r <- DS_1[calc Me_2 := Me_1 + SC_r];", + datasets=["DS_1"], + component_or_scalar=["Me_1"], + ), + # Mixed dataset_or_scalar + scalar chain + "16": Classification( + vtl="DS_r <- DS_1 + SC_1;\nSC_r := SC_1 * 2;", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # Membership + "17": Classification(vtl="DS_r := DS_1#Me_1;", datasets=["DS_1"]), + # Set operators (dataset-only) + "18": Classification(vtl="DS_r <- union(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + "19": Classification(vtl="DS_r <- intersect(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + "20": Classification(vtl="DS_r <- setdiff(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + # If-then-else (dual) + "21": Classification( + vtl="DS_r := if DS_1 then DS_2 else DS_3;", + dataset_or_scalar=["DS_1", "DS_2", "DS_3"], + ), + "22": Classification( + vtl="SC_r := if true then SC_1 else SC_2;", + dataset_or_scalar=["SC_1", "SC_2"], + ), + # Comparison / logical (dual) + "23": Classification(vtl="DS_r := DS_1 > DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "24": Classification(vtl="DS_r := DS_1 and DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "25": Classification(vtl="DS_r := not DS_1;", dataset_or_scalar=["DS_1"]), + # Numeric (dual) + "26": Classification(vtl="DS_r := abs(DS_1);", dataset_or_scalar=["DS_1"]), + "27": Classification(vtl="SC_r := abs(SC_1);", dataset_or_scalar=["SC_1"]), + # Aggregation (dataset-only) + "28": Classification(vtl="DS_r := sum(DS_1);", datasets=["DS_1"]), + # Clause operators + "29": Classification( + vtl="DS_r <- DS_1[filter Me_1 > 10];", + datasets=["DS_1"], + component_or_scalar=["Me_1"], + ), + "30": Classification(vtl="DS_r <- DS_1[keep Me_1];", datasets=["DS_1"]), + "31": Classification(vtl="DS_r <- DS_1[drop Me_1];", datasets=["DS_1"]), + "32": Classification(vtl="DS_r <- DS_1[rename Me_1 to Me_2];", datasets=["DS_1"]), + "33": Classification( + vtl="DS_r := DS_1[sub Id_1 = 1];", + datasets=["DS_1"], + component_or_scalar=["Id_1"], + ), + # Parameterized (dual) + "34": Classification(vtl="DS_r := round(DS_1, 2);", dataset_or_scalar=["DS_1"]), + # Multi-statement with intermediates + "35": Classification( + vtl="DS_A := DS_1 + DS_2;\nDS_B := DS_1 * DS_3;\nDS_r := DS_A + DS_B;", + dataset_or_scalar=["DS_1", "DS_2", "DS_3"], + ), + # Scalar chain propagation + "36": Classification( + vtl="SC_a := 10;\nSC_b := SC_a + SC_1;\nDS_r <- DS_1[calc Me_2 := Me_1 + SC_b];", + datasets=["DS_1"], + scalars=["SC_1"], + component_or_scalar=["Me_1"], + ), + # Join (dataset-only) + "37": Classification(vtl="DS_r := inner_join(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + # Dual unary + "38": Classification(vtl="DS_r := isnull(DS_1);", dataset_or_scalar=["DS_1"]), + "39": Classification(vtl="DS_r := -DS_1;", dataset_or_scalar=["DS_1"]), + # Calc with multiple external refs + "40": Classification( + vtl="DS_r <- DS_1[calc Me_2 := Me_1 + SC_1, Me_3 := Me_1 * SC_2];", + datasets=["DS_1"], + component_or_scalar=["Me_1", "SC_1", "SC_2"], + ), + # UDO with typed parameters + "41": Classification( + vtl=( + "define operator my_op (ds dataset, sc number)\n" + " returns dataset is\n" + " ds * sc\n" + "end operator;\n\n" + "DS_r := my_op(DS_1, SC_1);" + ), + datasets=["DS_1"], + scalars=["SC_1"], + ), + # Time operators (dataset-only) + "42": Classification(vtl="DS_r := flow_to_stock(DS_1);", datasets=["DS_1"]), + "43": Classification(vtl="DS_r := stock_to_flow(DS_1);", datasets=["DS_1"]), + "44": Classification(vtl="DS_r := exists_in(DS_1, DS_2, all);", datasets=["DS_1", "DS_2"]), + "45": Classification(vtl="DS_r := timeshift(DS_1, 1);", datasets=["DS_1"]), + # --- Dual BinOp operators --- + "46": Classification(vtl="DS_r := DS_1 / DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "47": Classification(vtl="DS_r := mod(DS_1, DS_2);", dataset_or_scalar=["DS_1", "DS_2"]), + "48": Classification(vtl="DS_r := DS_1 || DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "49": Classification(vtl="DS_r := DS_1 or DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "50": Classification(vtl="DS_r := DS_1 xor DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "51": Classification(vtl="DS_r := DS_1 = DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "52": Classification(vtl="DS_r := DS_1 <> DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "53": Classification(vtl="DS_r := DS_1 >= DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "54": Classification(vtl="DS_r := DS_1 < DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "55": Classification(vtl="DS_r := DS_1 <= DS_2;", dataset_or_scalar=["DS_1", "DS_2"]), + "56": Classification(vtl="DS_r := DS_1 in {1, 2, 3};", dataset_or_scalar=["DS_1"]), + "57": Classification(vtl="DS_r := DS_1 not_in {1, 2, 3};", dataset_or_scalar=["DS_1"]), + "58": Classification( + vtl='DS_r := match_characters(DS_1, "[a-z]+");', + dataset_or_scalar=["DS_1"], + ), + "59": Classification(vtl="DS_r := nvl(DS_1, 0);", dataset_or_scalar=["DS_1"]), + # --- Dual UnaryOp operators --- + "60": Classification(vtl="DS_r := exp(DS_1);", dataset_or_scalar=["DS_1"]), + "61": Classification(vtl="DS_r := ln(DS_1);", dataset_or_scalar=["DS_1"]), + "62": Classification(vtl="DS_r := sqrt(DS_1);", dataset_or_scalar=["DS_1"]), + "63": Classification(vtl="DS_r := ceil(DS_1);", dataset_or_scalar=["DS_1"]), + "64": Classification(vtl="DS_r := floor(DS_1);", dataset_or_scalar=["DS_1"]), + "65": Classification(vtl="DS_r := +DS_1;", dataset_or_scalar=["DS_1"]), + "66": Classification(vtl="DS_r := length(DS_1);", dataset_or_scalar=["DS_1"]), + "67": Classification(vtl="DS_r := upper(DS_1);", dataset_or_scalar=["DS_1"]), + "68": Classification(vtl="DS_r := lower(DS_1);", dataset_or_scalar=["DS_1"]), + "69": Classification(vtl="DS_r := trim(DS_1);", dataset_or_scalar=["DS_1"]), + "70": Classification(vtl="DS_r := ltrim(DS_1);", dataset_or_scalar=["DS_1"]), + "71": Classification(vtl="DS_r := rtrim(DS_1);", dataset_or_scalar=["DS_1"]), + # --- Dual ParamOp operators --- + "72": Classification(vtl="DS_r := trunc(DS_1, 2);", dataset_or_scalar=["DS_1"]), + "73": Classification(vtl="DS_r := power(DS_1, 2);", dataset_or_scalar=["DS_1"]), + "74": Classification(vtl="DS_r := log(DS_1, 10);", dataset_or_scalar=["DS_1"]), + "75": Classification(vtl="DS_r := substr(DS_1, 1, 3);", dataset_or_scalar=["DS_1"]), + "76": Classification(vtl='DS_r := replace(DS_1, "a", "b");', dataset_or_scalar=["DS_1"]), + "77": Classification(vtl='DS_r := instr(DS_1, "a");', dataset_or_scalar=["DS_1"]), + "78": Classification(vtl="DS_r := cast(DS_1, integer);", dataset_or_scalar=["DS_1"]), + # --- Dual MulOp / conditional --- + "79": Classification(vtl="DS_r := between(DS_1, 1, 10);", dataset_or_scalar=["DS_1"]), + "80": Classification( + vtl="DS_r := case when DS_1 > 0 then DS_2 else DS_3;", + dataset_or_scalar=["DS_1", "DS_2", "DS_3"], + ), + # --- Dual time operators --- + "81": Classification( + vtl="DS_r := datediff(DS_1, DS_2);", + dataset_or_scalar=["DS_1", "DS_2"], + ), + "82": Classification(vtl='DS_r := dateadd(DS_1, 1, "M");', dataset_or_scalar=["DS_1"]), + "83": Classification(vtl="DS_r := getyear(DS_1);", dataset_or_scalar=["DS_1"]), + "84": Classification(vtl="DS_r := getmonth(DS_1);", dataset_or_scalar=["DS_1"]), + "85": Classification(vtl="DS_r := dayofmonth(DS_1);", dataset_or_scalar=["DS_1"]), + "86": Classification(vtl="DS_r := dayofyear(DS_1);", dataset_or_scalar=["DS_1"]), + # --- Dataset-only operators --- + "87": Classification(vtl="DS_r := symdiff(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + "88": Classification(vtl="DS_r := left_join(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + "89": Classification(vtl="DS_r := full_join(DS_1, DS_2);", datasets=["DS_1", "DS_2"]), + "90": Classification( + vtl="DS_r := cross_join(DS_1 as d1, DS_2 as d2);", + datasets=["DS_1", "DS_2"], + ), + "91": Classification(vtl="DS_r := count(DS_1);", datasets=["DS_1"]), + "92": Classification(vtl="DS_r := min(DS_1);", datasets=["DS_1"]), + "93": Classification(vtl="DS_r := max(DS_1);", datasets=["DS_1"]), + "94": Classification(vtl="DS_r := avg(DS_1);", datasets=["DS_1"]), + "95": Classification(vtl="DS_r := median(DS_1);", datasets=["DS_1"]), + "96": Classification(vtl="DS_r := stddev_pop(DS_1);", datasets=["DS_1"]), + "97": Classification(vtl="DS_r := stddev_samp(DS_1);", datasets=["DS_1"]), + "98": Classification(vtl="DS_r := var_pop(DS_1);", datasets=["DS_1"]), + "99": Classification(vtl="DS_r := var_samp(DS_1);", datasets=["DS_1"]), + "100": Classification(vtl="DS_r := fill_time_series(DS_1, all);", datasets=["DS_1"]), + "101": Classification(vtl="DS_r := period_indicator(DS_1);", datasets=["DS_1"]), + # --- Clause operators --- + "102": Classification( + vtl="DS_r := DS_1[aggr Me_1 := sum(Me_2) group by Id_1];", + datasets=["DS_1"], + component_or_scalar=["Me_2"], + ), + "103": Classification(vtl="DS_r := DS_1[pivot Id_1, Me_1];", datasets=["DS_1"]), + "104": Classification(vtl="DS_r := DS_1[unpivot Id_1, Me_1];", datasets=["DS_1"]), + # --- Analytic operators (operands are always components, never scalars) --- + "105": Classification( + vtl="DS_r := DS_1[calc Me_2 := first_value(Me_1 over (order by Id_1))];", + datasets=["DS_1"], + ), + "106": Classification( + vtl="DS_r := DS_1[calc Me_2 := last_value(Me_1 over (order by Id_1))];", + datasets=["DS_1"], + ), + "107": Classification( + vtl="DS_r := DS_1[calc Me_2 := lag(Me_1, 1 over (order by Id_1))];", + datasets=["DS_1"], + ), + "108": Classification( + vtl="DS_r := DS_1[calc Me_2 := lead(Me_1, 1 over (order by Id_1))];", + datasets=["DS_1"], + ), + "109": Classification( + vtl="DS_r := DS_1[calc Me_2 := rank(over (order by Id_1))];", + datasets=["DS_1"], + ), + "110": Classification( + vtl="DS_r := DS_1[calc Me_2 := ratio_to_report(Me_1 over (partition by Id_1))];", + datasets=["DS_1"], + ), + # --- Join clause operators --- + "111": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 calc Me_2 := Me_1 + SC_1);", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_1", "SC_1"], + ), + "112": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 filter Me_1 > 10);", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_1"], + ), + "113": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 keep Me_1);", + datasets=["DS_1", "DS_2"], + ), + "114": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 rename Me_1 to Me_2);", + datasets=["DS_1", "DS_2"], + ), + "115": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 using Id_1);", + datasets=["DS_1", "DS_2"], + ), + "116": Classification( + vtl="DS_r := inner_join(DS_1, DS_2, DS_3);", + datasets=["DS_1", "DS_2", "DS_3"], + ), + "117": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 using Id_1 calc Me_2 := Me_1 + SC_1);", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_1", "SC_1"], + ), + "118": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 aggr Me_1 := sum(Me_2) group by Id_1);", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_2"], + ), + "119": Classification( + vtl="DS_r := left_join(DS_1, DS_2 calc Me_2 := Me_1 * 2);", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_1"], + ), + "120": Classification( + vtl="DS_r := full_join(DS_1, DS_2 filter Me_1 > 0);", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_1"], + ), + # --- Validation operators --- + "121": Classification( + vtl='DS_r := check(DS_1#Me_1 > 0 errorcode "E001" errorlevel 1);', + datasets=["DS_1"], + ), + "122": Classification( + vtl='DS_r := check(not isnull(DS_1#Me_1) errorcode "E002" errorlevel 2 invalid);', + datasets=["DS_1"], + ), + "123": Classification( + vtl='DS_r := check(exists_in(DS_1, DS_2, true) errorcode "E003" errorlevel 3 all);', + datasets=["DS_1", "DS_2"], + ), + "124": Classification( + vtl=( + "define datapoint ruleset dpr1 (variable Me_1 as Number) is\n" + ' rule1: Me_1 > 0 errorcode "E001" errorlevel 1\n' + "end datapoint ruleset;\n\n" + "DS_r := check_datapoint(DS_1, dpr1);" + ), + datasets=["DS_1"], + ), + "125": Classification( + vtl=( + "define datapoint ruleset dpr1 (variable Me_1 as Number) is\n" + ' rule1: Me_1 > 0 errorcode "E001" errorlevel 1\n' + "end datapoint ruleset;\n\n" + "DS_r := check_datapoint(DS_1, dpr1 all);" + ), + datasets=["DS_1"], + ), + "128": Classification( + vtl='DS_r := check(DS_1 + SC_1 > 0 errorcode "E001" errorlevel 1);', + dataset_or_scalar=["DS_1", "SC_1"], + ), + # --- Hierarchical operators --- + "126": Classification( + vtl=( + "define hierarchical ruleset hr1 (variable rule Me_1) is\n" + " A = B + C\n" + "end hierarchical ruleset;\n\n" + "DS_r := hierarchy(DS_1, hr1 rule Me_1 non_null all);" + ), + datasets=["DS_1"], + ), + "127": Classification( + vtl=( + "define hierarchical ruleset hr1 (variable rule Me_1) is\n" + " A = B + C\n" + "end hierarchical ruleset;\n\n" + "DS_r := check_hierarchy(DS_1, hr1 rule Me_1 non_null all);" + ), + datasets=["DS_1"], + ), + # --- Dataset-only operators with mixed sub-expressions --- + # When a dataset-only operator wraps a complex expression (e.g., DS_1 + SC_1), + # the sub-expression operands should NOT all be forced to datasets. + "129": Classification( + vtl="DS_r := count(DS_1 + SC_1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "130": Classification( + vtl="DS_r := sum(DS_1 * SC_1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "131": Classification( + vtl="DS_r := avg(DS_1 + SC_1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "132": Classification( + vtl="DS_r := flow_to_stock(DS_1 + SC_1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "133": Classification( + vtl="DS_r := stock_to_flow(DS_1 - SC_1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "134": Classification( + vtl="DS_r := union(DS_1 + SC_1, DS_2);", + datasets=["DS_2"], + dataset_or_scalar=["DS_1", "SC_1"], + ), + "135": Classification( + vtl="DS_r := intersect(DS_1 + SC_1, DS_2);", + datasets=["DS_2"], + dataset_or_scalar=["DS_1", "SC_1"], + ), + "136": Classification( + vtl="DS_r := union(DS_1 + SC_1, DS_2 + SC_2);", + dataset_or_scalar=["DS_1", "DS_2", "SC_1", "SC_2"], + ), + "137": Classification( + vtl="DS_r := exists_in(DS_1 + SC_1, DS_2, all);", + datasets=["DS_2"], + dataset_or_scalar=["DS_1", "SC_1"], + ), + "138": Classification( + vtl="DS_r := timeshift(DS_1 + SC_1, 1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "139": Classification( + vtl="DS_r := (DS_1 + SC_1)#Me_1;", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # Calc clause with dataset-only sub-expression: dataset is direct VarID, + # but calc body contains sub-expression with external refs + "140": Classification( + vtl="DS_r <- DS_1[calc Me_2 := sum(Me_1 + SC_1)];", + datasets=["DS_1"], + component_or_scalar=["Me_1", "SC_1"], + ), + # --- Multi-statement ambiguity propagation --- + # Intermediate from mixed expr fed to aggregation: ambiguity propagates + "141": Classification( + vtl="DS_A := DS_1 + SC_1;\nDS_r := sum(DS_A);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # Intermediate from union fed to flow_to_stock with scalar + "142": Classification( + vtl="DS_A := union(DS_1, DS_2);\nDS_r := flow_to_stock(DS_A + SC_1);", + datasets=["DS_1", "DS_2"], + dataset_or_scalar=["SC_1"], + ), + # Intermediate from mixed expr fed to timeshift + "143": Classification( + vtl="DS_A := DS_1 + SC_1;\nDS_r := timeshift(DS_A, 1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # --- Nested dataset-only operators --- + # count(union(...)): inner union marks datasets, count wraps it + "144": Classification( + vtl="DS_r := count(union(DS_1, DS_2));", + datasets=["DS_1", "DS_2"], + ), + # union result combined with scalar in outer expression + "145": Classification( + vtl="DS_r := union(DS_1, DS_2) + SC_1;", + datasets=["DS_1", "DS_2"], + dataset_or_scalar=["SC_1"], + ), + # --- fill_time_series / period_indicator with expression --- + "146": Classification( + vtl="DS_r := fill_time_series(DS_1 + SC_1, all);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + "147": Classification( + vtl="DS_r := period_indicator(DS_1 + SC_1);", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # --- setdiff / symdiff with expressions --- + "148": Classification( + vtl="DS_r := setdiff(DS_1 + SC_1, DS_2);", + datasets=["DS_2"], + dataset_or_scalar=["DS_1", "SC_1"], + ), + "149": Classification( + vtl="DS_r := symdiff(DS_1 + SC_1, DS_2 + SC_2);", + dataset_or_scalar=["DS_1", "DS_2", "SC_1", "SC_2"], + ), + # --- check / validation with mixed sub-expressions --- + # check wrapping aggregation: sum forces DS_1 to dataset + "150": Classification( + vtl='DS_r := check(sum(DS_1) > 0 errorcode "E001" errorlevel 1);', + datasets=["DS_1"], + ), + # check with membership + scalar: DS_1 is dataset (membership), SC_1 ambiguous + "151": Classification( + vtl='DS_r := check(DS_1#Me_1 + SC_1 > 0 errorcode "E001" errorlevel 1);', + datasets=["DS_1"], + dataset_or_scalar=["SC_1"], + ), + # --- Clause operators with external scalar in expression --- + # filter with external scalar ref + "152": Classification( + vtl="DS_r <- DS_1[filter Me_1 + SC_1 > 0];", + datasets=["DS_1"], + component_or_scalar=["Me_1", "SC_1"], + ), + # aggr with external scalar in aggregation body + "153": Classification( + vtl="DS_r := DS_1[aggr Me_1 := sum(Me_2 + SC_1) group by Id_1];", + datasets=["DS_1"], + component_or_scalar=["Me_2", "SC_1"], + ), + # --- Membership on expression result --- + "154": Classification( + vtl="DS_r := (DS_1 * SC_1)#Me_1;", + dataset_or_scalar=["DS_1", "SC_1"], + ), +} + + +_SORTED_CODES = sorted(CASES.keys(), key=lambda k: int(k)) + + +@pytest.mark.parametrize("test_code", _SORTED_CODES) +def test_classification(test_code: str) -> None: + case = CASES[test_code] + if case.vtl is not None: + script = case.vtl + else: + with open(data_path / "vtl" / f"{test_code}.vtl") as f: + script = f.read() + + schedule = DAGAnalyzer.ds_structure(create_ast(script)) + + assert sorted(schedule.global_input_datasets) == case.datasets + assert sorted(schedule.global_input_scalars) == case.scalars + assert sorted(schedule.global_input_dataset_or_scalar) == case.dataset_or_scalar + assert sorted(schedule.global_input_component_or_scalar) == case.component_or_scalar + # global_inputs is the union of all four categories + all_classified = sorted( + case.datasets + case.scalars + case.dataset_or_scalar + case.component_or_scalar + ) + assert sorted(schedule.global_inputs) == all_classified diff --git a/tests/DAG/test_dag.py b/tests/DAG/test_dag.py deleted file mode 100644 index 7edbeb54c..000000000 --- a/tests/DAG/test_dag.py +++ /dev/null @@ -1,57 +0,0 @@ -import json -from pathlib import Path -from typing import List - -import pytest - -from vtlengine.API import create_ast -from vtlengine.AST.DAG import DAGAnalyzer - -override = False -data_path = Path(__file__).parent / "data" - - -def _discover_tests(data_root: Path) -> List[str]: - return sorted(p.stem for p in (data_root / "vtl").iterdir() if p.is_file()) - - -def _normalize_ds_structure(ds_structure): - return json.loads( - json.dumps( - { - "insertion": {k: sorted(v) for k, v in ds_structure.insertion.items()}, - "deletion": {k: sorted(v) for k, v in ds_structure.deletion.items()}, - "global_inputs": sorted(ds_structure.global_inputs), - "global_input_datasets": sorted(ds_structure.global_input_datasets), - "global_input_scalars": sorted(ds_structure.global_input_scalars), - "global_input_dataset_or_scalar": sorted( - ds_structure.global_input_dataset_or_scalar - ), - "global_input_component_or_scalar": sorted( - ds_structure.global_input_component_or_scalar - ), - "persistent": sorted(ds_structure.persistent), - } - ) - ) - - -tests = _discover_tests(data_path) - - -@pytest.mark.parametrize("test_code", tests) -def test_ds_structure(test_code): - with open(data_path / "vtl" / f"{test_code}.vtl") as f: - script = f.read() - - ds_structures = DAGAnalyzer.ds_structure(create_ast(script)) - - if override: - with open(data_path / "references" / f"{test_code}.json", "w") as f: - json.dump(_normalize_ds_structure(ds_structures), f, indent=4) - - with open(data_path / "references" / f"{test_code}.json") as f: - reference = json.load(f) - - normalized_ds_structures = _normalize_ds_structure(ds_structures) - assert normalized_ds_structures == reference diff --git a/tests/DAG/test_scheduling.py b/tests/DAG/test_scheduling.py new file mode 100644 index 000000000..ba5552464 --- /dev/null +++ b/tests/DAG/test_scheduling.py @@ -0,0 +1,46 @@ +import json +from pathlib import Path +from typing import Any, Dict + +import pytest + +from vtlengine.API import create_ast +from vtlengine.AST.DAG import DAGAnalyzer + +override = False +data_path = Path(__file__).parent / "data" + + +def _normalize_scheduling(schedule: Any) -> Dict[str, Any]: + return json.loads( + json.dumps( + { + "insertion": {k: sorted(v) for k, v in schedule.insertion.items()}, + "deletion": {k: sorted(v) for k, v in schedule.deletion.items()}, + "persistent": sorted(schedule.persistent), + } + ) + ) + + +# Only keep tests with non-trivial scheduling (multiple insertion/deletion points). +NONTRIVIAL_TESTS = ["2", "3", "5", "6", "7", "8", "9", "10", "11", "13", "16", "35", "36"] + + +@pytest.mark.parametrize("test_code", NONTRIVIAL_TESTS) +def test_scheduling(test_code: str) -> None: + with open(data_path / "vtl" / f"{test_code}.vtl") as f: + script = f.read() + + schedule = DAGAnalyzer.ds_structure(create_ast(script)) + ref_path = data_path / "references" / "scheduling" / f"{test_code}.json" + + if override: + ref_path.parent.mkdir(parents=True, exist_ok=True) + with open(ref_path, "w") as f: + json.dump(_normalize_scheduling(schedule), f, indent=4) + + with open(ref_path) as f: + reference = json.load(f) + + assert _normalize_scheduling(schedule) == reference From d07869a84216c8e3fcb16cc0ac2b2db658bcd45b Mon Sep 17 00:00:00 2001 From: Javier Hernandez Date: Thu, 5 Mar 2026 18:46:31 +0100 Subject: [PATCH 10/11] feat: add eval visitor and scalar-constrained parameter classification - Add visit_EvalOp to mark eval operands as dataset_inputs - Classify scalar-only parameters: random index, round/trunc/substr/ replace/instr params, between from/to bounds - Add tests 161-171 covering eval, random, and scalar-constrained params --- src/vtlengine/AST/DAG/__init__.py | 39 ++++++++++++++++ tests/DAG/test_classification.py | 74 +++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/src/vtlengine/AST/DAG/__init__.py b/src/vtlengine/AST/DAG/__init__.py index 1d49a4ecf..8f2138cb1 100644 --- a/src/vtlengine/AST/DAG/__init__.py +++ b/src/vtlengine/AST/DAG/__init__.py @@ -24,6 +24,7 @@ DefIdentifier, DPRuleset, DPValidation, + EvalOp, HROperation, HRuleset, Identifier, @@ -44,20 +45,27 @@ from vtlengine.AST.DAG._models import Schedule, StatementDeps from vtlengine.AST.Grammar.tokens import ( AS, + BETWEEN, DROP, EXISTS_IN, FILL_TIME_SERIES, FLOW_TO_STOCK, + INSTR, INTERSECT, KEEP, MEMBERSHIP, PERIOD_INDICATOR, + RANDOM, RENAME, + REPLACE, + ROUND, SETDIFF, STOCK_TO_FLOW, + SUBSTR, SYMDIFF, TIMESHIFT, TO, + TRUNC, UNION, ) from vtlengine.DataTypes import COMP_NAME_MAPPING @@ -70,6 +78,15 @@ DATASET_ONLY_UNARYOP = frozenset({FLOW_TO_STOCK, STOCK_TO_FLOW, PERIOD_INDICATOR}) DATASET_ONLY_BINOP = frozenset({MEMBERSHIP, TIMESHIFT}) +# BinOp operators where the right operand must always be a scalar. +SCALAR_RIGHT_BINOP = frozenset({RANDOM}) + +# ParamOp operators where all params (not children) must be scalars. +SCALAR_PARAMS_PARAMOP = frozenset({ROUND, TRUNC, SUBSTR, REPLACE, INSTR}) + +# MulOp operators where children[1:] must be scalars (operand is children[0]). +SCALAR_TAIL_MULOP = frozenset({BETWEEN}) + # Reserved component names generated by the engine (e.g., count() → int_var, # check() → bool_var). These are never external inputs. RESERVED_COMPONENT_NAMES = frozenset(COMP_NAME_MAPPING.values()) @@ -447,6 +464,11 @@ def visit_BinOp(self, node: BinOp) -> None: self.visit(node.left) self.is_dataset = False self.visit(node.right) + elif node.op in SCALAR_RIGHT_BINOP: + self.visit(node.left) + if isinstance(node.right, VarID) and node.right.value not in self.alias: + self.current_deps.scalar_inputs.append(node.right.value) + self.visit(node.right) elif node.op == AS or node.op == TO: self.visit(node.left) self.alias.add(node.right.value) @@ -460,6 +482,11 @@ def visit_MulOp(self, node: MulOp) -> None: for child in node.children: if isinstance(child, VarID) and child.value not in self.alias: self.current_deps.dataset_inputs.append(child.value) + elif node.op in SCALAR_TAIL_MULOP: + # First child is the operand (dual), rest are scalar-only params. + for child in node.children[1:]: + if isinstance(child, VarID) and child.value not in self.alias: + self.current_deps.scalar_inputs.append(child.value) for child in node.children: self.visit(child) @@ -517,6 +544,10 @@ def visit_ParamOp(self, node: ParamOp) -> None: for child in node.children: if isinstance(child, VarID) and child.value not in self.alias: self.current_deps.dataset_inputs.append(child.value) + elif node.op in SCALAR_PARAMS_PARAMOP: + for param in node.params: + if isinstance(param, VarID) and param.value not in self.alias: + self.current_deps.scalar_inputs.append(param.value) super(DAGAnalyzer, self).visit_ParamOp(node) def visit_Aggregation(self, node: Aggregation) -> None: @@ -588,6 +619,14 @@ def visit_Validation(self, node: Validation) -> None: if node.imbalance is not None: self.visit(node.imbalance) + def visit_EvalOp(self, node: EvalOp) -> None: + """Eval operands are always datasets (external SQL routine inputs).""" + self._current_has_dataset_op = True + for operand in node.operands: + if isinstance(operand, VarID) and operand.value not in self.alias: + self.current_deps.dataset_inputs.append(operand.value) + self.visit(operand) + class HRDAGAnalyzer(DAGAnalyzer): def visit_HRuleset(self, node: HRuleset) -> None: diff --git a/tests/DAG/test_classification.py b/tests/DAG/test_classification.py index a31477db8..9f5c159ee 100644 --- a/tests/DAG/test_classification.py +++ b/tests/DAG/test_classification.py @@ -629,6 +629,80 @@ class Classification: vtl="DS_r := (DS_1 * SC_1)#Me_1;", dataset_or_scalar=["DS_1", "SC_1"], ), + # --- Duration conversion operators (dual) --- + "155": Classification(vtl="DS_r := daytoyear(DS_1);", dataset_or_scalar=["DS_1"]), + "156": Classification(vtl="DS_r := daytomonth(DS_1);", dataset_or_scalar=["DS_1"]), + "157": Classification(vtl="DS_r := yeartoday(DS_1);", dataset_or_scalar=["DS_1"]), + "158": Classification(vtl="DS_r := monthtoday(DS_1);", dataset_or_scalar=["DS_1"]), + # --- time_agg (dual) --- + "159": Classification( + vtl='DS_r := time_agg("A", DS_1);', + dataset_or_scalar=["DS_1"], + ), + # --- current_date (no inputs) --- + "160": Classification(vtl="SC_r := current_date();"), + # --- random (seed=dual, index=scalar) --- + "161": Classification(vtl="SC_r := random(42, 1);"), + "162": Classification(vtl="DS_r := random(DS_1, 1);", dataset_or_scalar=["DS_1"]), + "165": Classification( + vtl="DS_r := random(DS_1, SC_1);", + scalars=["SC_1"], + dataset_or_scalar=["DS_1"], + ), + # --- eval (dataset-only: external SQL routine) --- + "163": Classification( + vtl=( + "DS_r := eval(my_routine(DS_1)" + ' language "SQL"' + " returns dataset { identifier Id_1, measure Me_1 });" + ), + datasets=["DS_1"], + ), + "164": Classification( + vtl=( + "DS_r := eval(my_routine(DS_1, DS_2)" + ' language "SQL"' + " returns dataset { identifier Id_1, measure Me_1 });" + ), + datasets=["DS_1", "DS_2"], + ), + # --- Operators with scalar-constrained parameters --- + # round: param is always scalar + "166": Classification( + vtl="DS_r := round(DS_1, SC_1);", + scalars=["SC_1"], + dataset_or_scalar=["DS_1"], + ), + # trunc: param is always scalar + "167": Classification( + vtl="DS_r := trunc(DS_1, SC_1);", + scalars=["SC_1"], + dataset_or_scalar=["DS_1"], + ), + # substr: start and length are always scalars + "168": Classification( + vtl="DS_r := substr(DS_1, SC_1, SC_2);", + scalars=["SC_1", "SC_2"], + dataset_or_scalar=["DS_1"], + ), + # replace: pattern and replacement are always scalars + "169": Classification( + vtl="DS_r := replace(DS_1, SC_1, SC_2);", + scalars=["SC_1", "SC_2"], + dataset_or_scalar=["DS_1"], + ), + # instr: pattern is always scalar + "170": Classification( + vtl="DS_r := instr(DS_1, SC_1);", + scalars=["SC_1"], + dataset_or_scalar=["DS_1"], + ), + # between: from and to are always scalars + "171": Classification( + vtl="DS_r := between(DS_1, SC_1, SC_2);", + scalars=["SC_1", "SC_2"], + dataset_or_scalar=["DS_1"], + ), } From 3d1c5a3c4c5667ad0a8fd74e6367f4ff0b95f335 Mon Sep 17 00:00:00 2001 From: Javier Hernandez Date: Thu, 5 Mar 2026 19:01:47 +0100 Subject: [PATCH 11/11] fix: handle dual-context variables and add classification edge case tests Fix bug where a variable appearing in both dataset_inputs and scalar_inputs across statements was classified as dataset instead of dataset_or_scalar. Add 15 edge case tests (172-186) covering scalar chain propagation, unknown variable resolution, nested operators, deeply nested expressions, and if-then-else with dataset-only branches. --- src/vtlengine/AST/DAG/__init__.py | 3 ++ tests/DAG/test_classification.py | 88 +++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/src/vtlengine/AST/DAG/__init__.py b/src/vtlengine/AST/DAG/__init__.py index 8f2138cb1..37c7f5e56 100644 --- a/src/vtlengine/AST/DAG/__init__.py +++ b/src/vtlengine/AST/DAG/__init__.py @@ -211,6 +211,9 @@ def _classify_global_inputs( for name in global_inputs: if name in comp_or_scalar: result["global_input_component_or_scalar"].append(name) + elif name in definite_dataset_inputs and name in definite_scalar_inputs: + # Used as dataset in one context and scalar in another → ambiguous + result["global_input_dataset_or_scalar"].append(name) elif name in definite_dataset_inputs: result["global_input_datasets"].append(name) elif name in definite_scalar_inputs or self._feeds_only_scalar_chains( diff --git a/tests/DAG/test_classification.py b/tests/DAG/test_classification.py index 9f5c159ee..65c711843 100644 --- a/tests/DAG/test_classification.py +++ b/tests/DAG/test_classification.py @@ -703,6 +703,94 @@ class Classification: scalars=["SC_1", "SC_2"], dataset_or_scalar=["DS_1"], ), + # --- Group A: Classification logic edge cases --- + # A1: Variable in both dataset_inputs (union) and scalar_inputs (round param) + "172": Classification( + vtl="DS_r := union(DS_1, DS_2);\nDS_r2 := round(DS_3, DS_1);", + datasets=["DS_2"], + dataset_or_scalar=["DS_1", "DS_3"], + ), + # A2: Scalar chain from resolved-from-unknown variable + "173": Classification( + vtl="DS_r <- DS_1[calc Me_2 := Me_1 + SC_b];\nSC_b := SC_1 + 10;", + datasets=["DS_1"], + scalars=["SC_1"], + component_or_scalar=["Me_1"], + ), + # A3: Scalar chain broken by dataset-only operator + "174": Classification( + vtl="SC_a := SC_1 + SC_2;\nDS_r := union(SC_a, DS_1);", + datasets=["DS_1"], + dataset_or_scalar=["SC_1", "SC_2"], + ), + # A4: Same variable in ambiguous AND scalar chain contexts + "175": Classification( + vtl="DS_A := DS_1 + SC_1;\nSC_r := SC_1 * 2;", + dataset_or_scalar=["DS_1", "SC_1"], + ), + # A5: Multiple persistent assignments with shared variable + "176": Classification( + vtl="DS_r1 <- DS_1 + SC_1;\nDS_r2 <- DS_2 * SC_1;", + dataset_or_scalar=["DS_1", "DS_2", "SC_1"], + ), + # --- Group B: Component/scalar edge cases --- + # B1: Calc with multiple assignments referencing each other's components + "177": Classification( + vtl="DS_r <- DS_1[calc Me_2 := Me_1 + SC_1, Me_3 := sum(Me_2)];", + datasets=["DS_1"], + component_or_scalar=["Me_1", "Me_2", "SC_1"], + ), + # B2: Unknown variable not resolved (stays as component_or_scalar) + "178": Classification( + vtl="DS_A <- DS_1[calc Me_2 := Me_1 + X];\nDS_B := DS_A + 1;", + datasets=["DS_1"], + component_or_scalar=["Me_1", "X"], + ), + # B3: Unknown variable resolved by later output (not a global input) + "179": Classification( + vtl="DS_r <- DS_1[calc Me_2 := Me_1 + X];\nX := 10;", + datasets=["DS_1"], + component_or_scalar=["Me_1"], + ), + # --- Group C: Operator combination edge cases --- + # C1: Membership on union result + "180": Classification( + vtl="DS_r := (union(DS_1, DS_2))#Me_1;", + datasets=["DS_1", "DS_2"], + ), + # C2: Scalar-constrained param with expression operand + "181": Classification( + vtl="DS_r := round(DS_1 + SC_1, SC_2);", + scalars=["SC_2"], + dataset_or_scalar=["DS_1", "SC_1"], + ), + # C3: time_agg with expression operand + "182": Classification( + vtl='DS_r := time_agg("A", DS_1 + SC_1);', + dataset_or_scalar=["DS_1", "SC_1"], + ), + # C4: Check with two membership refs on same dataset + "183": Classification( + vtl='DS_r := check(DS_1#Me_1 + DS_1#Me_2 > 0 errorcode "E001" errorlevel 1 all);', + datasets=["DS_1"], + ), + # C5: Join with scalar-constrained param in calc + "184": Classification( + vtl="DS_r := inner_join(DS_1, DS_2 calc Me_2 := round(Me_1, SC_1));", + datasets=["DS_1", "DS_2"], + component_or_scalar=["Me_1", "SC_1"], + ), + # C6: Deeply nested dual operators (6 levels) + "185": Classification( + vtl="DS_r := abs(ceil(floor(exp(ln(sqrt(DS_1))))));", + dataset_or_scalar=["DS_1"], + ), + # C7: If-then-else with dataset-only operator in branch + "186": Classification( + vtl="DS_r := if DS_1 then union(DS_2, DS_3) else DS_4;", + datasets=["DS_2", "DS_3"], + dataset_or_scalar=["DS_1", "DS_4"], + ), }