diff --git a/batch_deobfuscator/batch_interpreter.py b/batch_deobfuscator/batch_interpreter.py index 89a4df8..323f192 100644 --- a/batch_deobfuscator/batch_interpreter.py +++ b/batch_deobfuscator/batch_interpreter.py @@ -336,30 +336,56 @@ def get_commands(self, logical_line): if line_is_comment(logical_line): yield logical_line.strip() return - state = "init" + + stack = ["init"] counter = 0 start_command = 0 + var_start = None + for char in logical_line: - # print(f"C:{char}, S:{state}") + state = stack[-1] + if state == "init": - if char == '"': # quote is on - state = "str_s" + if char == '"': + stack.append("str_s") + elif char == "%": + stack.append("var_s") + var_start = counter elif char == "^": - state = "escape" + stack.append("escape") elif char == "&" and logical_line[counter - 1] == ">": # Usually an output redirection, we want to keep it on the same line pass - elif char == "&" or char == "|": + elif char in ("&", "|"): cmd = logical_line[start_command:counter].strip() if cmd != "": for part in self.get_commands_special_statement(cmd): yield part start_command = counter + 1 + elif state == "str_s": if char == '"': - state = "init" + stack.pop() + elif char == "%": + stack.append("var_s") + var_start = counter + + elif state == "var_s": + if char == "%": + # Inspect variable contents for something that affects parsing + stack.pop() # get rid of var_s + if var_start < counter: # at least one character between percents? + value = self.get_value(logical_line[var_start:counter + 1]) + if value.count('"') == 1: + if stack[-1] != "str_s": + stack.append("str_s") + else: # end of quoted data reached + stack.pop() + elif value == "^": + stack.append("escape") + elif state == "escape": - state = "init" + stack.pop() counter += 1 @@ -951,7 +977,7 @@ def percent_tilde(self, argument): return value if value else "script.bat" # pushdown automata - def normalize_command(self, command): + def normalize_command(self, command, replace_by_space=True): if line_is_comment(command): return command @@ -965,7 +991,8 @@ def normalize_command(self, command): if char == '"': # quote is on state = "str_s" normalized_com += char - elif char == "," or char == ";": # or char == "\t": EDIT: How about we keep those tabs? + # or char == "\t": EDIT: How about we keep those tabs? + elif (char == "," or char == ";") and replace_by_space: # commas (",") are replaced by spaces, unless they are part of a string in doublequotes # semicolons (";") are replaced by spaces, unless they are part of a string in doublequotes # tabs are replaced by a single space @@ -1012,9 +1039,16 @@ def normalize_command(self, command): normalized_com = normalized_com[:variable_start] if len(normalized_com) == 0: traits["start_with_var"] = True - normalized_com += self.normalize_command(value) + normalized_com += self.normalize_command(value, replace_by_space=False) traits["var_used"] += 1 - state = stack.pop() + prev = stack.pop() + if value.count('"') == 1: + if prev != "str_s": + state = "str_s" + else: # end of quoted data reached + state = "init" + else: + state = prev elif char == "%": # Two % in a row normalized_com += char state = stack.pop() diff --git a/tests/test_full_script.py b/tests/test_full_script.py index d3a935a..16fe46b 100644 --- a/tests/test_full_script.py +++ b/tests/test_full_script.py @@ -44,3 +44,24 @@ def test_concat_logical_lines(): rb'curl -X GET --fail -H "Accept: application/octet-stream" ' rb"http://server.org/data?accept=data >>met\resultat\output.log" ) + + +def test_no_substituted_quote_command_splitting(): + deobfuscator = BatchDeobfuscator() + script = rb"""set QUO=" +set %QUO%DATA=bla | foo;bar%QUO%""" + with tempfile.TemporaryDirectory() as temp_dir: + with tempfile.NamedTemporaryFile(dir=temp_dir) as tf: + tf.write(script) + tf.flush() + bat_filename, _ = deobfuscator.analyze(tf.name, temp_dir) + + with open(os.path.join(temp_dir, bat_filename), "rb") as f: + result = f.read() + lines = result.split(b"\r\n") + + assert len(lines) >= 2 + assert lines[0] == b'set QUO="' + # 1. Must not split at | + # 2. Must not replace ; by space + assert lines[1] == b'set "DATA=bla | foo;bar"' diff --git a/tests/test_unittests.py b/tests/test_unittests.py index 07cfdfc..58a8c99 100644 --- a/tests/test_unittests.py +++ b/tests/test_unittests.py @@ -131,6 +131,9 @@ def test_simple_set_a(): # ('set EXP=43^"|', "echo *%EXP%*", []), # ('set EXP=43"^|', "echo *%EXP%*", 'echo *43"^|*'), # ('set EXP=43"^^|', "echo *%EXP%*", 'echo *43"^^|*'), + # Comma in value + ('set EXP=4,3', "echo *%EXP%*", "echo *4,3*"), + ('set "EXP=4,3"', "echo *%EXP%*", "echo *4,3*"), # Getting into really weird stuff ("set EXP=4=3", "echo *%EXP%*", "echo *4=3*"), ('set ""EXP=43"', 'echo *%"EXP%*', "echo *43*"), @@ -755,3 +758,35 @@ def test_keep_quotes_on_set(): cmd = 'set "ab= ""' res = deobfuscator.normalize_command(cmd) assert res == cmd + + @staticmethod + @pytest.mark.parametrize( + "cmd, command_list", + [ + ( + 'set %QUO%DATA=bla | foo%QUO% & bar', + ['set %QUO%DATA=bla | foo%QUO%', 'bar'], + ), + ( + 'set "DATA=bla | foo%QUO% & bar', + ['set "DATA=bla | foo%QUO%', 'bar'], + ), + ( + 'set %QUO%DATA=bla | foo" & bar', + ['set %QUO%DATA=bla | foo"', 'bar'], + ), + ], + ) + def test_substituted_quotes_command_splitting(cmd, command_list): + deobfuscator = BatchDeobfuscator() + deobfuscator.interpret_command('set QUO="') + res = list(deobfuscator.get_commands(cmd)) + assert res == command_list + + @staticmethod + def test_substituted_escape_command_splitting(): + deobfuscator = BatchDeobfuscator() + deobfuscator.interpret_command('set ESCP=^^') + cmd = 'echo a %ESCP%| b' + res = list(deobfuscator.get_commands(cmd)) + assert res == [cmd]