diff --git a/azurelinuxagent/common/osutil/systemd.py b/azurelinuxagent/common/osutil/systemd.py index a70294beb..c400f9f65 100644 --- a/azurelinuxagent/common/osutil/systemd.py +++ b/azurelinuxagent/common/osutil/systemd.py @@ -20,6 +20,8 @@ from azurelinuxagent.common.osutil import get_osutil from azurelinuxagent.common.utils import shellutil +from azurelinuxagent.ga.extensionprocessutil import TELEMETRY_MESSAGE_MAX_LEN +from azurelinuxagent.common.future import ustr def _get_os_util(): @@ -130,4 +132,43 @@ def is_unit_loaded(unit_name): value = get_unit_property(unit_name, "LoadState") return value.lower() == "loaded" except shellutil.CommandError: - return False \ No newline at end of file + return False + +def is_systemd_run_failure(unit_name, stderr): + """ + Determines if stderr from a systemd-run command indicates a systemd-run infrastructure failure + (as opposed to a failure of the command being executed). + + This method distinguishes between two types of failures: + 1. systemd-run infrastructure failures: systemd-run itself failed to execute the command + (e.g., D-Bus errors, systemd not available, unit configuration issues) + 2. Command execution failures: systemd-run successfully executed the command, but the command + itself failed or produced errors + + The determination is made by examining the stderr output: + - If stderr contains "Unit {unit_name} not found", it indicates systemd-run couldn't find/create + the unit, which is a systemd-run failure + - If stderr does NOT contain the unit_name at all, it suggests systemd-run failed before even + attempting to run the command (e.g., D-Bus connection failures, systemd not running), which + is a systemd-run failure + - If stderr contains the unit_name (but not the "not found" message), it means systemd-run + successfully started the command in the unit, so any errors are from the command itself, + not from systemd-run + + This distinction is important because: + - systemd-run failures should trigger fallback mechanisms (e.g., disable cgroups, run command directly) + - Command failures should be propagated to the caller for proper error handling + + :param unit_name: The name of the systemd unit/scope that was used with systemd-run + :param stderr: Error output from the systemd-run command (str, bytes, or file-like object) + :return: True if this is a systemd-run failure, False if it's a command execution failure + """ + # Handle different types of stderr input + if hasattr(stderr, 'seek') and hasattr(stderr, 'read'): + stderr.seek(0) + stderr_str = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace') + else: + stderr_str = str(stderr) + + unit_not_found = "Unit {0} not found.".format(unit_name) + return unit_not_found in stderr_str or unit_name not in stderr_str diff --git a/azurelinuxagent/ga/cgroupapi.py b/azurelinuxagent/ga/cgroupapi.py index cc1266a42..788e85e6d 100644 --- a/azurelinuxagent/ga/cgroupapi.py +++ b/azurelinuxagent/ga/cgroupapi.py @@ -32,9 +32,9 @@ ExtensionOperationError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.osutil import systemd +from azurelinuxagent.common.osutil.systemd import is_systemd_run_failure from azurelinuxagent.common.utils import fileutil, shellutil -from azurelinuxagent.ga.extensionprocessutil import handle_process_completion, read_output, \ - TELEMETRY_MESSAGE_MAX_LEN +from azurelinuxagent.ga.extensionprocessutil import handle_process_completion, read_output from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.version import get_distro @@ -353,7 +353,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh except ExtensionError as e: # The extension didn't terminate successfully. Determine whether it was due to systemd errors or # extension errors. - if not self._is_systemd_failure(scope, stderr): + if not is_systemd_run_failure(scope, stderr): # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error raise @@ -374,13 +374,6 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh with self._systemd_run_commands_lock: self._systemd_run_commands.remove(process.pid) - @staticmethod - def _is_systemd_failure(scope_name, stderr): - stderr.seek(0) - stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace') - unit_not_found = "Unit {0} not found.".format(scope_name) - return unit_not_found in stderr or scope_name not in stderr - class SystemdCgroupApiv1(_SystemdCgroupApi): """ diff --git a/azurelinuxagent/ga/cgroupconfigurator.py b/azurelinuxagent/ga/cgroupconfigurator.py index 004115767..552a1c763 100644 --- a/azurelinuxagent/ga/cgroupconfigurator.py +++ b/azurelinuxagent/ga/cgroupconfigurator.py @@ -74,6 +74,10 @@ LOGCOLLECTOR_ANON_MEMORY_LIMIT_FOR_V1_AND_V2 = 25 * 1024 ** 2 # 25Mb LOGCOLLECTOR_CACHE_MEMORY_LIMIT_FOR_V1_AND_V2 = 155 * 1024 ** 2 # 155Mb +EXT_SIGNATURE_VALIDATION_SLICE = "azure-walinuxagent-extsignaturevalidation.slice" +EXT_SIGNATURE_VALIDATION_CGROUPS_UNIT = "extsignaturevalidation.scope" +EXT_SIGNATURE_VALIDATION_CPU_QUOTA = "50%" + _AGENT_DROP_IN_FILE_SLICE = "10-Slice.conf" _AGENT_DROP_IN_FILE_SLICE_CONTENTS = """ # This drop-in unit file was created by the Azure VM Agent. diff --git a/azurelinuxagent/ga/signature_validation_util.py b/azurelinuxagent/ga/signature_validation_util.py index 9e16f6c01..c9f1338c7 100644 --- a/azurelinuxagent/ga/signature_validation_util.py +++ b/azurelinuxagent/ga/signature_validation_util.py @@ -30,6 +30,8 @@ from azurelinuxagent.common.future import ustr, UTC, datetime_min_utc from azurelinuxagent.common.event import add_event, WALAEventOperation, elapsed_milliseconds from azurelinuxagent.common.version import AGENT_VERSION, AGENT_NAME +from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator, EXT_SIGNATURE_VALIDATION_CPU_QUOTA, EXT_SIGNATURE_VALIDATION_SLICE, EXT_SIGNATURE_VALIDATION_CGROUPS_UNIT, DisableCgroups +from azurelinuxagent.common.osutil.systemd import is_systemd_run_failure from azurelinuxagent.ga.confidential_vm_info import ConfidentialVMInfo @@ -168,9 +170,9 @@ def validate_signature(package_path, signature, package_full_name): report_validation_event(op=WALAEventOperation.SignatureValidation, level=logger.LogLevel.INFO, message="Validating signature for package '{0}'".format(package_full_name), name=name, version=version, duration=0) + # Write signature to file and get signing certificate path _write_signature_to_file(signature, signature_path) microsoft_root_cert_file = get_microsoft_signing_certificate_path() - if not os.path.isfile(microsoft_root_cert_file): msg = ("signing certificate was not found at expected location ('{0}'). Try restarting the agent, " "or see log ('{1}') for additional details.").format(microsoft_root_cert_file, conf.get_agent_log_file()) @@ -184,7 +186,7 @@ def validate_signature(package_path, signature, package_full_name): # as a temporary measure until a robust solution for handling expired/revoked certificates is implemented. # # TODO: implement timestamp token parsing and validate that certificate was valid at time of signing - command = [ + base_command = [ conf.get_openssl_cmd(), 'cms', '-verify', '-binary', '-inform', 'der', # Signature input format must be DER (binary encoding) '-in', signature_path, # Path to the CMS signature file to be verified @@ -193,7 +195,32 @@ def validate_signature(package_path, signature, package_full_name): '-CAfile', microsoft_root_cert_file, # Path to the trusted root certificate file used for verification '-no_check_time' # Skips checking whether the certificate is expired ] - run_command(command, encode_output=False) + + # If cgroups are enabled, attempt to run the command in a dedicated systemd-run scope with a dedicated CPU quota. + # This is because signature validation is CPU-intensive and may take excessive time if the agent's CPU quota is low. + # If the systemd-run invocation fails, disable cgroups entirely and fall back to running the OpenSSL command directly. + use_cgroups = CGroupConfigurator.get_instance().enabled() + if use_cgroups: + systemd_cmd = ['systemd-run', '--unit={0}'.format(EXT_SIGNATURE_VALIDATION_CGROUPS_UNIT), + '--slice={0}'.format(EXT_SIGNATURE_VALIDATION_SLICE), '--scope', '--property=CPUAccounting=yes', + '--property=CPUQuota={0}'.format(EXT_SIGNATURE_VALIDATION_CPU_QUOTA)] + base_command + try: + run_command(systemd_cmd) + except CommandError as ex: + # If the systemd-run invocation itself failed, disable cgroups entirely and fall back to running openssl command directly. + # If the openssl command failed, re-raise and do not retry. + if is_systemd_run_failure(EXT_SIGNATURE_VALIDATION_CGROUPS_UNIT, ex.stderr): + error_msg = "'systemd-run' invocation failed for signature validation, disabling cgroups and falling back to direct execution. Error: '{0}'".format(ex.stderr) + report_validation_event(op=WALAEventOperation.SignatureValidation, level=logger.LogLevel.WARNING, + message=error_msg, + name=name, version=version, duration=0) + CGroupConfigurator.get_instance().disable(reason=error_msg, disable_cgroups=DisableCgroups.ALL) + run_command(base_command) + else: + raise + else: + # Run without systemd if cgroups disabled + run_command(base_command) report_validation_event(op=WALAEventOperation.PackageSignatureResult, level=logger.LogLevel.INFO, message="Successfully validated signature for package '{0}'".format(package_full_name), diff --git a/tests/ga/test_cgroupconfigurator_sudo.py b/tests/ga/test_cgroupconfigurator_sudo.py index 4fa8c45ef..c94a99efc 100644 --- a/tests/ga/test_cgroupconfigurator_sudo.py +++ b/tests/ga/test_cgroupconfigurator_sudo.py @@ -137,7 +137,7 @@ def test_start_extension_command_should_not_use_fallback_option_if_extension_tim with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr: with patch("azurelinuxagent.ga.extensionprocessutil.wait_for_process_completion_or_timeout", return_value=[True, None, 0]): - with patch("azurelinuxagent.ga.cgroupapi._SystemdCgroupApi._is_systemd_failure", + with patch("azurelinuxagent.common.osutil.systemd.is_systemd_run_failure", return_value=False): with self.assertRaises(ExtensionError) as context_manager: configurator.start_extension_command( diff --git a/tests/ga/test_signature_validation_sudo.py b/tests/ga/test_signature_validation_sudo.py index 4e5ef674f..c4648f7e0 100644 --- a/tests/ga/test_signature_validation_sudo.py +++ b/tests/ga/test_signature_validation_sudo.py @@ -17,11 +17,14 @@ # Requires Python 2.6+ and Openssl 1.0+ # import os +import subprocess +import re -from tests.lib.tools import AgentTestCase, data_dir, patch, i_am_root +from tests.lib.tools import AgentTestCase, data_dir, patch, i_am_root, MagicMock from azurelinuxagent.ga.signing_certificate_util import write_signing_certificates -from azurelinuxagent.ga.signature_validation_util import validate_signature +from azurelinuxagent.ga.signature_validation_util import validate_signature, SignatureValidationError from azurelinuxagent.common.utils import shellutil +from azurelinuxagent.ga.cgroupconfigurator import EXT_SIGNATURE_VALIDATION_CGROUPS_UNIT class TestSignatureValidationSudo(AgentTestCase): @@ -38,6 +41,9 @@ def setUp(self): self.vm_access_signature = f.read() self.package_name_and_version = "Microsoft.OSTCExtensions.Edp.VMAccessForLinux-1.5.0" + # Regex for 'openssl cms -verify' for the test zip package + self.openssl_cmd_pattern = re.compile(r".*openssl\s+cms\s+-verify.*-content\s+{0}\b".format(re.escape(self.vm_access_zip_path))) + def tearDown(self): patch.stopall() AgentTestCase.tearDown(self) @@ -77,3 +83,105 @@ def test_should_validate_signature_for_package_signed_with_leaf_root_cert(self): # Signature validation should still pass, because the signature was generated when the root certificate was unexpired. self.assertTrue(i_am_root(), "Test does not run when non-root") TestSignatureValidationSudo._validate_signature_in_another_year(2026, self.vm_access_zip_path, self.vm_access_signature, self.package_name_and_version) + + def test_validate_signature_should_use_systemd_run(self): + self.assertTrue(i_am_root(), "Test does not run when non-root") + with patch("azurelinuxagent.ga.signature_validation_util.CGroupConfigurator.get_instance") as mock_get_instance: + mock_instance = mock_get_instance.return_value + mock_instance.enabled.return_value = True + with patch("azurelinuxagent.common.utils.shellutil.subprocess.Popen", wraps=subprocess.Popen) as popen_patch: + validate_signature(self.vm_access_zip_path, self.vm_access_signature, self.package_name_and_version) + + # Check if 'openssl cms -verify' was called with systemd-run for the specified extension + systemd_run_called = any( + cmd.startswith('systemd-run') and self.openssl_cmd_pattern.search(cmd) + for cmd in (" ".join(args[0]) for (args, _) in popen_patch.call_args_list) + ) + self.assertTrue( + systemd_run_called, + "Expected 'validate_signature' to run using 'systemd-run'. " + "Commands called:\n{0}".format("\n".join(str(args[0]) for (args, _) in popen_patch.call_args_list)) + ) + + def test_validate_signature_should_not_use_systemd_run_when_cgroups_disabled(self): + with patch("azurelinuxagent.ga.signature_validation_util.CGroupConfigurator.get_instance") as mock_get_instance: + mock_instance = mock_get_instance.return_value + mock_instance.enabled.return_value = False + + with patch("azurelinuxagent.common.utils.shellutil.subprocess.Popen", wraps=subprocess.Popen) as popen_patch: + validate_signature(self.vm_access_zip_path, self.vm_access_signature, self.package_name_and_version) + + # Verify openssl was called directly (not through systemd-run) for the specified extension + # Find all openssl calls that match the pattern + openssl_calls = [' '.join(args[0]) for (args, _) in popen_patch.call_args_list + if self.openssl_cmd_pattern.search(" ".join(args[0]))] + + self.assertEqual(1, len(openssl_calls), msg="Openssl cms -verify command should have been called exactly once for the extension") + self.assertFalse(openssl_calls[0].startswith('systemd-run'), + msg="Openssl cms -verify command should not have been called with systemd-run when cgroups disabled") + + def test_validate_signature_should_raise_error_on_openssl_failure(self): + with patch("azurelinuxagent.ga.signature_validation_util.CGroupConfigurator.get_instance") as mock_get_instance: + mock_instance = mock_get_instance.return_value + mock_instance.enabled.return_value = True + original_popen = subprocess.Popen + + def mock_openssl_failure(*args, **kwargs): + # Match: openssl cms -verify + cmd = ' '.join(args[0]) + if self.openssl_cmd_pattern.search(cmd) is not None: + # Simulate OpenSSL failure (unit name in stderr means it's NOT a systemd failure) + error_msg = 'Running as unit: {0}\nVerification failure'.format(EXT_SIGNATURE_VALIDATION_CGROUPS_UNIT) + proc = MagicMock() + proc.communicate.return_value = (b"", error_msg.encode()) + proc.returncode = 1 + return proc + return original_popen(*args, **kwargs) + + with patch("azurelinuxagent.common.utils.shellutil.subprocess.Popen", side_effect=mock_openssl_failure): + with self.assertRaises(SignatureValidationError, msg="Expected signature validation to raise due to OpenSSL error"): + validate_signature(self.vm_access_zip_path, self.vm_access_signature, self.package_name_and_version) + + def test_validate_signature_should_retry_on_systemd_error(self): + with patch("azurelinuxagent.ga.signature_validation_util.CGroupConfigurator.get_instance") as mock_get_instance: + mock_instance = mock_get_instance.return_value + mock_instance.enabled.return_value = True + + original_popen = subprocess.Popen + + def popen_side_effect(*args, **kwargs): + # Simulate systemd-run failure + cmd = ' '.join(args[0]) + if cmd.startswith('systemd-run'): + error_msg = 'Unit {0} not found.'.format(EXT_SIGNATURE_VALIDATION_CGROUPS_UNIT) + proc = MagicMock() + proc.communicate.return_value = (b"", error_msg.encode()) + proc.returncode = 1 + return proc + return original_popen(*args, **kwargs) + + with patch("azurelinuxagent.common.utils.shellutil.subprocess.Popen", side_effect=popen_side_effect) as popen_patch: + validate_signature(self.vm_access_zip_path, self.vm_access_signature, self.package_name_and_version) + + + # Check that first openssl call used systemd-run, and second called openssl directly + openssl_calls = [ + ' '.join(args[0]) + for (args, _) in popen_patch.call_args_list + if self.openssl_cmd_pattern.search(' '.join(args[0])) + ] + + self.assertEqual(2, len(openssl_calls), msg="Expected exactly 2 openssl calls (first with systemd-run, second direct)") + + # First openssl cms verify call should use systemd-run + self.assertTrue(openssl_calls[0].startswith('systemd-run'), msg="First openssl call should have used systemd-run, got: {0}".format(openssl_calls[0])) + + # Second openssl cms verify call should be direct (not using systemd-run) + self.assertFalse(openssl_calls[1].startswith('systemd-run'), + msg="Second openssl call should be direct (without systemd-run), got: {0}".format(openssl_calls[1])) + + # Verify that cgroups were disabled + self.assertEqual(1, mock_instance.disable.call_count, "disable() should have been called exactly once") + reason = mock_instance.disable.call_args[1]['reason'] + self.assertTrue(reason.startswith("'systemd-run' invocation failed for signature validation"), + msg="Expected cgroup disable reason to indicate systemd-run error during signature validation")