diff --git a/.gitignore b/.gitignore index e469d1c..2ee9845 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ coverage.xml # Test files a.out main.pdf + +# Log files +*.log diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..fd07386 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,42 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-added-large-files + - id: check-ast + - id: check-builtin-literals + - id: check-case-conflict + - id: check-docstring-first + - id: check-executables-have-shebangs + - id: check-json + - id: check-merge-conflict + - id: check-shebang-scripts-are-executable + - id: check-symlinks + - id: check-toml + - id: check-vcs-permalinks + - id: check-xml + - id: check-yaml + args: [--allow-multiple-documents] + - id: debug-statements + - id: destroyed-symlinks + - id: detect-private-key + - id: end-of-file-fixer + - id: fix-byte-order-marker + - id: fix-encoding-pragma + args: [--remove] + - id: mixed-line-ending + - id: pretty-format-json + - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] + - repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + args: [-l 99] + - repo: https://github.com/doublify/pre-commit-clang-format + rev: 62302476d0da01515660132d76902359bed0f782 + hooks: + - id: clang-format + types: [file] + files: \.(cpp|cc|cxx|c|h|hxx)$ + args: [--style=file] diff --git a/LICENSES/GTDGmbH.md b/LICENSES/GTDGmbH.md new file mode 100644 index 0000000..120e287 --- /dev/null +++ b/LICENSES/GTDGmbH.md @@ -0,0 +1,24 @@ +Valid-License-Identifier: GTDGmbH +License-Text: + +Copyright (c) 2023 GTD GmbH. All rights reserved. 
+ +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the “Software”), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/LICENSE b/LICENSES/MauriMustonen.md similarity index 95% rename from LICENSE rename to LICENSES/MauriMustonen.md index 4895137..e8969fd 100644 --- a/LICENSE +++ b/LICENSES/MauriMustonen.md @@ -1,3 +1,6 @@ +Valid-License-Identifier: MauriMustonen +License-Text: + MIT License Copyright (c) 2021 Mauri Mustonen diff --git a/Pipfile b/Pipfile index 3918b1b..a4ded50 100644 --- a/Pipfile +++ b/Pipfile @@ -12,3 +12,4 @@ flake8 = "*" pytest = "*" pytest-cov = "*" pylint = "*" +pre-commit = "*" diff --git a/README.md b/README.md index 360b555..2602e52 100644 --- a/README.md +++ b/README.md @@ -1,353 +1,28 @@ +# Object Code Graph (ocgraph) -# asm2cfg -![CI status](https://github.com/Kazhuu/asm2cfg/actions/workflows/ci.yml/badge.svg) -[![codecov](https://codecov.io/gh/Kazhuu/asm2cfg/branch/main/graph/badge.svg?token=ZHLOJO8Q3V)](https://codecov.io/gh/Kazhuu/asm2cfg) +This repo contains *asm2cfg* (**Assembler to control-flow-graph**), a tool to +read disassembler output and optional corresponding coverage data to produce +control flow graphs including coverage analysis at assembler level. -Python command-line tool and GDB extension to view and save x86, ARM and objdump -assembly files as control-flow graph (CFG) pdf files. From GDB debugging session -use `viewcfg` command to view CFG and use `savecfg` command to save it to the -pdf file. +The *asm2cfg* source code implemented by GTD GmbH is forked from the asm2cfg +repository on [Github](https://github.com/Kazhuu/asm2cfg) and reworked to +function with several architectures and disassemblers. -

- -

+The tool source code is subject to the MIT License as indicated by the headers +of the corresponding source code files. -Program has been developed to support X86, ARM and objdump assembly outputs. -Program is mostly tested with x86 assembly. ARM and objdump formats might not be -fully supported. If you have any suggestions or find bugs, please open an issue -or create a pull request. If you want to contribute, check -[Development](#development) how to get started. +> [!WARNING] +> **WARNING**: +> +> **This tool is currently under development and in beta state. It is not intended +> to be complete, and use is at your own risk.** -## Table of Content +## Documentation - +Due to the lack of full documentation, all previous information can be found +in separate *Markdown* files in the *doc* folder: -* [Install](#install) -* [Usage From GDB](#usage-from-gdb) -* [Usage as Standalone](#usage-as-standalone) - * [Knowing Function Name](#knowing-function-name) - * [Disassemble Function](#disassemble-function) - * [Draw CFG](#draw-cfg) - * [Examples](#examples) -* [Development](#development) - * [Python Environment](#python-environment) - * [Testing](#testing) - * [Code Linting](#code-linting) - * [Command-Line Interface](#command-line-interface) - * [GDB Integration](#gdb-integration) - * [Current Development Goals](#current-development-goals) - - - -## Install - -Project can be installed with pip - -``` -pip install asm2cfg -``` - -To be able to view the dot files from GDB. External dot viewer is required. For -this purpose [xdot](https://pypi.org/project/xdot/) can be used for example. Any -other dot viewer will also do. To install this on Debian based distro run - -``` -sudo apt install xdot -``` - -Or Arch based - -``` -sudo pacman -S xdot -``` - -To add extension to GDB you need to source the pip installed plugin to it. To -find where pip placed GDB extension run `which gdb_asm2cfg` or in case if you -use pyenv use `pyenv which gdb_asm2cfg`. 
Copy the path to the clipboard. - -Then in you home directory if not already add `.gdbinit` file -and place following line in it and replace path from the earlier step. - -``` -source -``` - -For example in my Linux machine line end up to be - -``` -source ~/.local/bin/gdb_asm2cfg.py -``` - -Now when you start GDB no errors should be displayed and you are ready to go. - -## Usage From GDB - -In GDB session this extension provides command `viewcfg` to view CFG with -external dot viewer. Command `savecfg` saves the CFG to pdf file to current -working directory with same name as the function being dumped. Both commands -disassemble the current execution frame/function when the command is issued. To -see help for these commands use `help` command like `help viewcfg`. - -For example let's view main function from you favorite non-stripped executable. -First run GDB until main function - -``` -gdb -ex 'b main' -ex 'run' -``` - -Now run `viewcfg` to view CFG as a dot graph with external editor. Or run `savecfg` -to save CFG to pdf file named `main.pdf` to current working directory. If -function is stripped then memory address of the function will used as a name -instead. For example `0x555555555faf-0x555555557008.pdf`. - -If assembly function is very large with a lot of jumps and calls to other -functions. Then rendering the CFG can take a long time. So be patient or cancel -rendering with Ctrl-C. To make the rendering faster you can skip function calls -instructions from splitting the code to more blocks. To set this run `set -skipcalls on` and then run earlier command again. Note that if function is long -and has a lot of jumps inside itself, then rendering is still gonna take a long -time. To have normal behavior again run `set skipcalls off`. - -## Usage as Standalone - -This method can be used with assembly files saved from ouput of objdump and GDB -disassembly. Pip installation will come with `asm2cfg` command-line tool for -this purpose. 
- -To use as standalone script you first need to dump assembly from GDB or objdump -to the file which is explained below. - -### Knowing Function Name - -If you don't know the name of function you're looking for then you can also list -all function names using GDB: - -``` -gdb -batch -ex 'b main' -ex r -ex 'info functions' ./test_executable -``` - -This will set breakpoint at function `main`, then -run the program and print symbols from all loaded libraries. - -For functions which come from main executable you can avoid running the program -and simply do - -``` -gdb -batch -ex 'info functions' ./test_executable -``` - -If you want to narrow the search down you can also use regexp - -``` -gdb ... -ex 'info functions ' ... -``` - -### Disassemble Function - -Once you have the function name, you can produce its disassembly via - -``` -gdb -batch -ex 'b main' -ex r -ex 'pipe disassemble test_function | tee test_function.asm' ./test_executable -``` - -or - -``` -gdb -batch -ex 'set breakpoints pending on' -ex 'b test_function' -ex r -ex 'pipe disassemble | tee test_function.asm' ./test_executable -``` - -(the `set breakpoint pending on` command enables pending breakpoints and -could be added to your `.gdbinit` instead) - -For functions from main executable it's enough to do - -``` -gdb -batch -ex 'pipe disassemble test_function | tee test_function.asm' ./test_executable -``` - -You can also extract function's disassembly from `objdump` output: - -``` -objdump -d ./test_executable | sed -ne '/ test_executable.asm -``` - -(this may be useful for specific non-native targets which lack GDB support). - -### Draw CFG - -Now you have the assembly file. Time to turn that to CFG pdf file. Do that by giving it -to `asm2cfg` command-line tool like so - -``` -asm2cfg test_function.asm -``` - -Asm2cfg by default expects x86 assembly files. If you want to use ARM assembly files, -then provide `--target arm` command-line flag. 
- -Above command should output `test_function.pdf` file in the same directory where -the executable was ran. If the assembly file is stripped then the function -memory range is used as a name instead. For example -`0x555555555faf-0x555555557008.pdf`. - -To view CFG instead of saving provide `-v` flag. And to skip function calls from -splitting the code to further blocks provide `-c` flag. To show the help use -`-h`. - -### Examples - -Repository includes examples which can be used to test the standalone -functionality for x86, ARM and objdump. - -File `test_function.asm` is non-stripped assembly file and its -corresponding output `test_function.pdf`. - -File `stripped_function.asm` contains -stripped function and its corresponding output -`stripped_function.pdf`. - -File `att_syntax.asm` is an example of non-stripped AT&T assembly. - -File `huge.asm` is a large stripped -assembly function and its corresponding output `huge.pdf`. This can be used to -test processing time of big functions. - -Files `objdump.asm` and `stripped_objdump.asm` are the regular and stripped -objdump-based disassemblies of short functions. - -File `arm.asm` is ARM based assembly file and its corresponding pdf file is -`arm.pdf`. - -## Development - -You want to contribute? You're very welcome to do so! This section will give you -guidance how to setup development environment and test things locally. - -### Python Environment - -For development this project manages packages with pipenv. Pipenv is a tool to -manage Python virtual environments and packages with much less pain compared to -normal pip and virtualenv usage. - -Install pipenv for your system following the guide -[here](https://pipenv.pypa.io/en/latest/). - -After installing pipenv. Create virtual environment and install all required -packages to it. 
Run following at project root - -``` -pipenv install -d -``` - -Now you can activate the virtual environment with - -``` -pipenv shell -``` - -Now your `python` and `pip` commands will correspond to created virtual environment -instead of your system's Python installation. - -To deactivate the environment, use - -``` -exit -``` - -### Testing - -This project uses [pytest](https://pypi.org/project/pytest/) for testing. Some -test are written using Python's own unittest testing framework, but they work -with pytest out of the box. Pytest style is preferred way to write tests. - -To run tests from project root, use `pytest` or - -``` -pipenv run pytest -``` - -During testing dot viewer might be opened if you have it installed. This is -because GDB integration command `viewcfg` is tested, which will open external -dot viewer. Just close it after it's opened. It should not affect the test run -itself. - -### Code Linting - -Project uses [flake8](https://flake8.pycqa.org/en/latest/) and -[pylint](https://pylint.org/) for code linting. - -To run flake8, use - -``` -flake8 -``` - -And to run pylint use - -``` -pylint src test -``` - -Both commands should not print any errors. - -### Command-Line Interface - -To test command-line interface of asm2cfg wihtout installing the package. You -can execute module directly. For example to print help - -``` -python -m src.asm2cfg -h -``` - -Standalone method can be used to try out the examples under `examples` folder as -well. For example following command should generate `main.pdf` file to current -working directory. - -``` -python -m src.asm2cfg -c examples/huge.asm -``` - -### GDB Integration - -Before testing GDB functionality, make sure asm2cfg is not installed with pip! -This can lead to GDB using code from pip installed asm2cfg package instead of -code from this repository! - -Also pipenv cannot be used with GDB. You need to install required packages to -your system's Python pip. 
This is because your installed GDB is linked against -system's Python interpreter and will use it, instead of active virtual -environment. If packages are not installed to your system's pip. You are likely -to receive following error messages when trying to use asm2cfg with GDB - -``` -ModuleNotFoundError: No module named 'graphviz' -``` - -To fix this, install required packages to your system's pip without active -virtual environment. Currently GDB integration only requires graphviz. - -``` -pip install graphviz -``` - -To use asm2cfg GDB related functionality. Use following line from -project root. - -``` -PYTHONPATH=${PWD}/src gdb -ex 'source src/gdb_asm2cfg.py' -``` - -This will set Python import path so that GDB can import code from this -repository without installing the package. After this you should be able to use -commands `viewcfg` and `savecfg`. - -### Current Development Goals - -There are might be cases asm2cfg will not fully support all x86 or ARM assembly -lines. If you encounter such problems please open an issue. - -Current developed goals are best described in issues section. Please open a new -one if existing one does not exist. - -If you want to talk to me, you can contact me at Discord with name -`Kazhuu#3121`. +1. [Installation](doc/1_Installation.md): Instructions to install the tool +2. [How to Run](doc/2_HowToRun.md): Guidelines for running the tool +3. [Development](doc/3_Development.md): Information for developers +4. [Github asm2cfg](doc/4_Github.md): Original Github documentation diff --git a/asm2cfg b/asm2cfg new file mode 100755 index 0000000..4f1b04a --- /dev/null +++ b/asm2cfg @@ -0,0 +1,12 @@ +#!/bin/sh + +# SPDX-License-Identifier: GTDGmbH +# Copyright 2023 by GTD GmbH. + +dir="$(dirname "$(readlink -f "$0")")" +cd $dir +export PATH=$dir/bin:$PATH +if [ -d "venv" ]; then + . 
venv/bin/activate +fi +python3 -m ocgraph.__main__ "$@" diff --git a/doc/1_Installation.md b/doc/1_Installation.md new file mode 100644 index 0000000..1e90e8b --- /dev/null +++ b/doc/1_Installation.md @@ -0,0 +1,7 @@ +# Installation + +## Dependencies for *asm2cfg* + +- Python >=3.10 +- The Python `graphviz` package for printing the graph; please install it, e.g. with + `pip`. Preferably use a `virtualenv`. diff --git a/doc/2_HowToRun.md b/doc/2_HowToRun.md new file mode 100644 index 0000000..192c301 --- /dev/null +++ b/doc/2_HowToRun.md @@ -0,0 +1,41 @@ +# How to Run + +## As python module + +```cmd +python3 -m ocgraph -f a.out -d objdump -a sparc -c cov.csv -o a.pdf +``` + +## As command line script + +```cmd +./asm2cfg -f a.out -d objdump -a sparc -c cov.csv -o a.pdf +``` + +## Custom python script + +```python +from ocgraph.interface.analyzer import Analyzer +from ocgraph.interface.drawer import Drawer +from ocgraph.interface.coverage_reader import CoverageReader + +from ocgraph.coverage_tracer import CoverageTracer +from ocgraph.configuration.configuration import OcGraphConfiguration + +# Create configuration +config = OcGraphConfiguration(disassembler="objdump", arch="sparc") + +# Read input text +lines = read_lines("a.out") + +# Analyze input text +analyser = Analyzer(config=config) +analyser.parse_lines(lines=lines) + +# Update analyzed input with coverage data +cov_reader = CoverageReader(instructions=analyser.instructions, config=config) +cov_reader.update_by_csv("cov.csv") + +drawer = Drawer(analyser.configuration) +drawer.draw_cfg(name=analyser.function_name, basic_blocks=analyser.basic_blocks, output="a.pdf") +``` diff --git a/doc/3_Development.md b/doc/3_Development.md new file mode 100644 index 0000000..e1cf37a --- /dev/null +++ b/doc/3_Development.md @@ -0,0 +1,53 @@ +# Development + +## Design + +```mermaid +--- +title: OcGraph design +--- +classDiagram + + class Configuration { + __init__(arch, disassembler, logging): + +dict disassembler_option + 
+dict architecture_option + +dict preset_logging + } + class Disassembler { + Name + parse_line() + ...() + } + class Architecture { + is_branch() + ...() + } + class Logger { Name } + + Configuration --* Disassembler + Configuration --* Architecture + Configuration --* Logger + + class Analyzer { + __init__(config) + parse_file(file_path): basic_blocks + } + class CoverageReader { + __init__(basic_blocks, config) + update_by_csv(file_path) + } + class Drawer { + __init__(config) + draw_cfg(basic_blocks, output) + } + class __main__ { + main() + } + + __main__ --> Configuration + __main__ --> Analyzer + __main__ --> CoverageReader + __main__ --> Drawer + +``` diff --git a/doc/4_Github.md b/doc/4_Github.md new file mode 100644 index 0000000..b3a81ff --- /dev/null +++ b/doc/4_Github.md @@ -0,0 +1,354 @@ + +# asm2cfg + +![CI status](https://github.com/Kazhuu/asm2cfg/actions/workflows/ci.yml/badge.svg) +[![codecov](https://codecov.io/gh/Kazhuu/asm2cfg/branch/main/graph/badge.svg?token=ZHLOJO8Q3V)](https://codecov.io/gh/Kazhuu/asm2cfg) + +Python command-line tool and GDB extension to view and save x86, ARM and objdump +assembly files as control-flow graph (CFG) pdf files. From GDB debugging session +use `viewcfg` command to view CFG and use `savecfg` command to save it to the +pdf file. + +

+ +

+ +Program has been developed to support X86, ARM and objdump assembly outputs. +Program is mostly tested with x86 assembly. ARM and objdump formats might not be +fully supported. If you have any suggestions or find bugs, please open an issue +or create a pull request. If you want to contribute, check +[Development](#development) how to get started. + +## Table of Content + + + +* [Install](#install) +* [Usage From GDB](#usage-from-gdb) +* [Usage as Standalone](#usage-as-standalone) + * [Knowing Function Name](#knowing-function-name) + * [Disassemble Function](#disassemble-function) + * [Draw CFG](#draw-cfg) + * [Examples](#examples) +* [Development](#development) + * [Python Environment](#python-environment) + * [Testing](#testing) + * [Code Linting](#code-linting) + * [Command-Line Interface](#command-line-interface) + * [GDB Integration](#gdb-integration) + * [Current Development Goals](#current-development-goals) + + + +## Install + +Project can be installed with pip + +``` +pip install asm2cfg +``` + +To be able to view the dot files from GDB. External dot viewer is required. For +this purpose [xdot](https://pypi.org/project/xdot/) can be used for example. Any +other dot viewer will also do. To install this on Debian based distro run + +``` +sudo apt install xdot +``` + +Or Arch based + +``` +sudo pacman -S xdot +``` + +To add extension to GDB you need to source the pip installed plugin to it. To +find where pip placed GDB extension run `which gdb_asm2cfg` or in case if you +use pyenv use `pyenv which gdb_asm2cfg`. Copy the path to the clipboard. + +Then in you home directory if not already add `.gdbinit` file +and place following line in it and replace path from the earlier step. + +``` +source +``` + +For example in my Linux machine line end up to be + +``` +source ~/.local/bin/gdb_asm2cfg.py +``` + +Now when you start GDB no errors should be displayed and you are ready to go. 
+ +## Usage From GDB + +In GDB session this extension provides command `viewcfg` to view CFG with +external dot viewer. Command `savecfg` saves the CFG to pdf file to current +working directory with same name as the function being dumped. Both commands +disassemble the current execution frame/function when the command is issued. To +see help for these commands use `help` command like `help viewcfg`. + +For example let's view main function from you favorite non-stripped executable. +First run GDB until main function + +``` +gdb -ex 'b main' -ex 'run' +``` + +Now run `viewcfg` to view CFG as a dot graph with external editor. Or run `savecfg` +to save CFG to pdf file named `main.pdf` to current working directory. If +function is stripped then memory address of the function will used as a name +instead. For example `0x555555555faf-0x555555557008.pdf`. + +If assembly function is very large with a lot of jumps and calls to other +functions. Then rendering the CFG can take a long time. So be patient or cancel +rendering with Ctrl-C. To make the rendering faster you can skip function calls +instructions from splitting the code to more blocks. To set this run `set +skipcalls on` and then run earlier command again. Note that if function is long +and has a lot of jumps inside itself, then rendering is still gonna take a long +time. To have normal behavior again run `set skipcalls off`. + +## Usage as Standalone + +This method can be used with assembly files saved from ouput of objdump and GDB +disassembly. Pip installation will come with `asm2cfg` command-line tool for +this purpose. + +To use as standalone script you first need to dump assembly from GDB or objdump +to the file which is explained below. 
+ +### Knowing Function Name + +If you don't know the name of function you're looking for then you can also list +all function names using GDB: + +``` +gdb -batch -ex 'b main' -ex r -ex 'info functions' ./test_executable +``` + +This will set breakpoint at function `main`, then +run the program and print symbols from all loaded libraries. + +For functions which come from main executable you can avoid running the program +and simply do + +``` +gdb -batch -ex 'info functions' ./test_executable +``` + +If you want to narrow the search down you can also use regexp + +``` +gdb ... -ex 'info functions ' ... +``` + +### Disassemble Function + +Once you have the function name, you can produce its disassembly via + +``` +gdb -batch -ex 'b main' -ex r -ex 'pipe disassemble test_function | tee test_function.asm' ./test_executable +``` + +or + +``` +gdb -batch -ex 'set breakpoints pending on' -ex 'b test_function' -ex r -ex 'pipe disassemble | tee test_function.asm' ./test_executable +``` + +(the `set breakpoint pending on` command enables pending breakpoints and +could be added to your `.gdbinit` instead) + +For functions from main executable it's enough to do + +``` +gdb -batch -ex 'pipe disassemble test_function | tee test_function.asm' ./test_executable +``` + +You can also extract function's disassembly from `objdump` output: + +``` +objdump -d ./test_executable | sed -ne '/ test_executable.asm +``` + +(this may be useful for specific non-native targets which lack GDB support). + +### Draw CFG + +Now you have the assembly file. Time to turn that to CFG pdf file. Do that by giving it +to `asm2cfg` command-line tool like so + +``` +asm2cfg test_function.asm +``` + +Asm2cfg by default expects x86 assembly files. If you want to use ARM assembly files, +then provide `--target arm` command-line flag. + +Above command should output `test_function.pdf` file in the same directory where +the executable was ran. 
If the assembly file is stripped then the function +memory range is used as a name instead. For example +`0x555555555faf-0x555555557008.pdf`. + +To view CFG instead of saving provide `-v` flag. And to skip function calls from +splitting the code to further blocks provide `-c` flag. To show the help use +`-h`. + +### Examples + +Repository includes examples which can be used to test the standalone +functionality for x86, ARM and objdump. + +File `test_function.asm` is non-stripped assembly file and its +corresponding output `test_function.pdf`. + +File `stripped_function.asm` contains +stripped function and its corresponding output +`stripped_function.pdf`. + +File `att_syntax.asm` is an example of non-stripped AT&T assembly. + +File `huge.asm` is a large stripped +assembly function and its corresponding output `huge.pdf`. This can be used to +test processing time of big functions. + +Files `objdump.asm` and `stripped_objdump.asm` are the regular and stripped +objdump-based disassemblies of short functions. + +File `arm.asm` is ARM based assembly file and its corresponding pdf file is +`arm.pdf`. + +## Development + +You want to contribute? You're very welcome to do so! This section will give you +guidance how to setup development environment and test things locally. + +### Python Environment + +For development this project manages packages with pipenv. Pipenv is a tool to +manage Python virtual environments and packages with much less pain compared to +normal pip and virtualenv usage. + +Install pipenv for your system following the guide +[here](https://pipenv.pypa.io/en/latest/). + +After installing pipenv. Create virtual environment and install all required +packages to it. Run following at project root + +``` +pipenv install -d +``` + +Now you can activate the virtual environment with + +``` +pipenv shell +``` + +Now your `python` and `pip` commands will correspond to created virtual environment +instead of your system's Python installation. 
+ +To deactivate the environment, use + +``` +exit +``` + +### Testing + +This project uses [pytest](https://pypi.org/project/pytest/) for testing. Some +test are written using Python's own unittest testing framework, but they work +with pytest out of the box. Pytest style is preferred way to write tests. + +To run tests from project root, use `pytest` or + +``` +pipenv run pytest +``` + +During testing dot viewer might be opened if you have it installed. This is +because GDB integration command `viewcfg` is tested, which will open external +dot viewer. Just close it after it's opened. It should not affect the test run +itself. + +### Code Linting + +Project uses [flake8](https://flake8.pycqa.org/en/latest/) and +[pylint](https://pylint.org/) for code linting. + +To run flake8, use + +``` +flake8 +``` + +And to run pylint use + +``` +pylint src test +``` + +Both commands should not print any errors. + +### Command-Line Interface + +To test command-line interface of asm2cfg wihtout installing the package. You +can execute module directly. For example to print help + +``` +python -m src.asm2cfg -h +``` + +Standalone method can be used to try out the examples under `examples` folder as +well. For example following command should generate `main.pdf` file to current +working directory. + +``` +python -m src.asm2cfg -c examples/huge.asm +``` + +### GDB Integration + +Before testing GDB functionality, make sure asm2cfg is not installed with pip! +This can lead to GDB using code from pip installed asm2cfg package instead of +code from this repository! + +Also pipenv cannot be used with GDB. You need to install required packages to +your system's Python pip. This is because your installed GDB is linked against +system's Python interpreter and will use it, instead of active virtual +environment. If packages are not installed to your system's pip. 
You are likely +to receive following error messages when trying to use asm2cfg with GDB + +``` +ModuleNotFoundError: No module named 'graphviz' +``` + +To fix this, install required packages to your system's pip without active +virtual environment. Currently GDB integration only requires graphviz. + +``` +pip install graphviz +``` + +To use asm2cfg GDB related functionality. Use following line from +project root. + +``` +PYTHONPATH=${PWD}/src gdb -ex 'source src/gdb_asm2cfg.py' +``` + +This will set Python import path so that GDB can import code from this +repository without installing the package. After this you should be able to use +commands `viewcfg` and `savecfg`. + +### Current Development Goals + +There are might be cases asm2cfg will not fully support all x86 or ARM assembly +lines. If you encounter such problems please open an issue. + +Current developed goals are best described in issues section. Please open a new +one if existing one does not exist. + +If you want to talk to me, you can contact me at Discord with name +`Kazhuu#3121`. 
diff --git a/images/example.png b/doc/images/example.png similarity index 100% rename from images/example.png rename to doc/images/example.png diff --git a/images/example.svg b/doc/images/example.svg similarity index 100% rename from images/example.svg rename to doc/images/example.svg diff --git a/src/asm2cfg/__init__.py b/ocgraph/__init__.py similarity index 100% rename from src/asm2cfg/__init__.py rename to ocgraph/__init__.py diff --git a/ocgraph/__main__.py b/ocgraph/__main__.py new file mode 100755 index 0000000..fee92c2 --- /dev/null +++ b/ocgraph/__main__.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH +""" +Let this module be executed from the command line with python -m ocgraph +from root of the project +""" +import argparse + +from .interface.analyzer import Analyzer +from .interface.drawer import Drawer +from .interface.coverage_reader import CoverageReader + +from .configuration.configuration import OcGraphConfiguration + + +def print_assembly(basic_blocks): + """Debug function to print the assembly.""" + for basic_block in basic_blocks.values(): + print(basic_block) + + +def read_lines(file_path) -> list[str]: + """Read lines from the file and return then as a list.""" + with open(file_path, "r", encoding="utf8") as asm_file: + lines = asm_file.readlines() + return lines + + +def main(): + """Command-line entry point to the program.""" + parser = argparse.ArgumentParser(description="Assembly to Control-Flow-Graph rendering.") + + parser.add_argument( + "-f", + "--file", + help="Disassembled object file", + required=True, + ) + parser.add_argument( + "-d", + "--diss", + help="Disassembler option", + required=True, + choices=OcGraphConfiguration.disassemblers(), + ) + parser.add_argument( + "-a", + "--arch", + help="Architecture option", + required=True, + choices=OcGraphConfiguration.architectures(), + ) + + parser.add_argument("-c", "--coverage", help="Coverage file for printing coverage") + parser.add_argument("-v", 
"--view", action="store_true", help="View as a dot graph") + parser.add_argument("-o", "--output", help="Target output filename") + parser.add_argument( + "-l", + "--logger", + choices=OcGraphConfiguration.loggers(), + default="default", + help="Logging mechanism preset", + ) + args = parser.parse_args() + + # Create configuration + config = OcGraphConfiguration(disassembler=args.diss, arch=args.arch, preset=args.logger) + + lines = read_lines(args.file) + + analyser = Analyzer(config=config) + analyser.parse_lines(lines=lines) + + if args.coverage: + cov_reader = CoverageReader(instructions=analyser.instructions, config=config) + cov_reader.update_by_csv(args.coverage) + + drawer = Drawer(analyser.configuration) + drawer.draw_cfg( + name=analyser.function_name, basic_blocks=analyser.basic_blocks, output=args.output + ) + + +if __name__ == "__main__": + main() diff --git a/ocgraph/configuration/__init__.py b/ocgraph/configuration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ocgraph/configuration/architecture/__init__.py b/ocgraph/configuration/architecture/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ocgraph/configuration/architecture/architecture.py b/ocgraph/configuration/architecture/architecture.py new file mode 100755 index 0000000..511c5a1 --- /dev/null +++ b/ocgraph/configuration/architecture/architecture.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH +"""Contains all necessary functions for a TargetInfo class.""" + +from abc import ABC, abstractmethod + +from ...data.instruction import Instruction + + +class Architecture(ABC): + """TargetInfo Class defining the target specific instruction set characteristics""" + + def __init__(self): + pass + + @abstractmethod + def comment(self) -> str: + """Return how comments starts in the disassembly""" + raise NotImplementedError() + + @abstractmethod + def is_call(self, instruction: Instruction) -> bool: + """Return if disassembled 
class ArmArchitecture(Architecture):
    """Instruction classification for ARM / Thumb disassembly.

    All predicates look only at the textual opcode, operands and body of the
    given instruction and return plain booleans.
    """

    # Conditional branches (b + condition code) whose mnemonic starts with "bl"
    # and must therefore not be mistaken for a branch-with-link call.
    _BL_PREFIXED_BRANCHES = ("blt", "ble", "bls", "blo")

    @staticmethod
    def _mnemonic(instruction: Instruction) -> str:
        """Return the opcode without a width qualifier such as '.n' or '.w'."""
        return instruction.opcode.partition(".")[0]

    def comment(self):
        """Return the character that starts a comment in ARM disassembly."""
        return ";"

    def is_call(self, instruction: Instruction):
        """Return True for branch-with-link instructions.

        Various flavors of call:
          bl 0x19d90 <_IO_vtable_check>
        Conditional branches such as 'ble' (or the width-qualified 'ble.n')
        start with the same letters and are excluded explicitly.
        """
        mnemonic = self._mnemonic(instruction)
        return mnemonic.startswith("bl") and mnemonic not in self._BL_PREFIXED_BRANCHES

    def is_branch(self, instruction: Instruction):
        """Return True for any opcode starting with 'b' that is not a call."""
        # startswith() instead of opcode[0]: an empty opcode must not raise.
        return instruction.opcode.startswith("b") and not self.is_call(instruction)

    def is_direct_branch(self, instruction: Instruction):
        """Return True for a branch whose first operand starts with a hex address."""
        return (
            self.is_branch(instruction)
            and bool(instruction.ops)
            and re.match(HEX_LONG_PATTERN, instruction.ops[0]) is not None
        )

    def is_unconditional_branch(self, instruction: Instruction):
        """Return True for a plain 'b' (including 'b.n' / 'b.w')."""
        return self._mnemonic(instruction) == "b"

    def is_sink(self, instruction: Instruction):
        """
        Is this an instruction which terminates function execution e.g. return?
        Detect various flavors of return like
          bx lr
          pop {r2-r6,pc}
        Note that we do not consider conditional branches (e.g. 'bxle') to sink.
        """
        if re.search(r"\bpop\b.*\bpc\b", instruction.body):
            return True
        if instruction.opcode == "bx":
            # Guard the operand access: a malformed 'bx' without operands is no sink.
            return bool(instruction.ops) and instruction.ops[0] == "lr"
        return instruction.opcode == "udf"
+ HEX_PATTERN + +# fmt: off +ppc_call_opcodes = [ + "bl", +] + +ppc_sink_opcodes = [ + "blr", +] + +ppc_unconditional_branch_opcodes = [ + "b", "ba", "bla", + "bctr", "bctrl", "blrl", +] + +ppc_conditional_branch_opcodes = [ + "bdnz-", + "bdnz+", + "bdnz", + "bdn", + "bdnzl-", + "bdnzl+", + "bdnzl", + "bdnl", + "bdnza-", + "bdnza+", + "bdnza", + "bdna", + "bdnzla-", + "bdnzla+", + "bdnzla", + "bdnla", + "bdz-", + "bdz+", + "bdz", + "bdzl-", + "bdzl+", + "bdzl", + "bdza-", + "bdza+", + "bdza", + "bdzla-", + "bdzla+", + "bdzla", + + "bge-", + "bge+", + "bge", + "bnl-", + "bnl+", + "bnl", + "bgel-", + "bgel+", + "bgel", + "bnll-", + "bnll+", + "bnll", + "bgea-", + "bgea+", + "bgea", + "bnla-", + "bnla+", + "bnla", + "bgela-", + "bgela+", + "bgela", + "bnlla-", + "bnlla+", + "bnlla", + "ble-", + "ble+", + "ble", + "bng-", + "bng+", + "bng", + "blel-", + "blel+", + "blel", + "bngl-", + "bngl+", + "bngl", + "blea-", + "blea+", + "blea", + "bnga-", + "bnga+", + "bnga", + "blela-", + "blela+", + "blela", + "bngla-", + "bngla+", + "bngla", + "bne-", + "bne+", + "bne", + "bnel-", + "bnel+", + "bnel", + "bnea-", + "bnea+", + "bnea", + "bnela-", + "bnela+", + "bnela", + "bns-", + "bns+", + "bns", + "bnu-", + "bnu+", + "bnu", + "bnsl-", + "bnsl+", + "bnsl", + "bnul-", + "bnul+", + "bnul", + "bnsa-", + "bnsa+", + "bnsa", + "bnua-", + "bnua+", + "bnua", + "bnsla-", + "bnsla+", + "bnsla", + "bnula-", + "bnula+", + "bnula", + + "blt-", + "blt+", + "blt", + "bltl-", + "bltl+", + "bltl", + "blta-", + "blta+", + "blta", + "bltla-", + "bltla+", + "bltla", + "bgt-", + "bgt+", + "bgt", + "bgtl-", + "bgtl+", + "bgtl", + "bgta-", + "bgta+", + "bgta", + "bgtla-", + "bgtla+", + "bgtla", + "beq-", + "beq+", + "beq", + "beql-", + "beql+", + "beql", + "beqa-", + "beqa+", + "beqa", + "beqla-", + "beqla+", + "beqla", + "bso-", + "bso+", + "bso", + "bun-", + "bun+", + "bun", + "bsol-", + "bsol+", + "bsol", + "bunl-", + "bunl+", + "bunl", + "bsoa-", + "bsoa+", + "bsoa", + "buna-", + "buna+", + 
"buna", + "bsola-", + "bsola+", + "bsola", + "bunla-", + "bunla+", + "bunla", + + "bdnzf-", + "bdnzf+", + "bdnzf", + "bdnzfl-", + "bdnzfl+", + "bdnzfl", + "bdnzfa-", + "bdnzfa+", + "bdnzfa", + "bdnzfla-", + "bdnzfla+", + "bdnzfla", + "bdzf-", + "bdzf+", + "bdzf", + "bdzfl-", + "bdzfl+", + "bdzfl", + "bdzfa-", + "bdzfa+", + "bdzfa", + "bdzfla-", + "bdzfla+", + "bdzfla", + + "bf-", + "bf+", + "bf", + "bbf", + "bfl-", + "bfl+", + "bfl", + "bbfl", + "bfa-", + "bfa+", + "bfa", + "bbfa", + "bfla-", + "bfla+", + "bfla", + "bbfla", + + "bdnzt-", + "bdnzt+", + "bdnzt", + "bdnztl-", + "bdnztl+", + "bdnztl", + "bdnzta-", + "bdnzta+", + "bdnzta", + "bdnztla-", + "bdnztla+", + "bdnztla", + "bdzt-", + "bdzt+", + "bdzt", + "bdztl-", + "bdztl+", + "bdztl", + "bdzta-", + "bdzta+", + "bdzta", + "bdztla-", + "bdztla+", + "bdztla", + + "bt-", + "bt+", + "bt", + "bbt", + "btl-", + "btl+", + "btl", + "bbtl", + "bta-", + "bta+", + "bta", + "bbta", + "btla-", + "btla+", + "btla", + "bbtla", + + "bc-", + "bc+", + "bc", + "bcl-", + "bcl+", + "bcl", + "bca-", + "bca+", + "bca", + "bcla-", + "bcla+", + "bcla", + + "bdnzlr", + "bdnzlr-", + "bdnzlrl", + "bdnzlrl-", + "bdnzlr+", + "bdnzlrl+", + "bdzlr", + "bdzlr-", + "bdzlrl", + "bdzlrl-", + "bdzlr+", + "bdzlrl+", + #"blr", + "br", + #"blrl", + "brl", + "bdnzlr-", + "bdnzlrl-", + "bdnzlr+", + "bdnzlrl+", + "bdzlr-", + "bdzlrl-", + "bdzlr+", + "bdzlrl+", + + "bgelr", + "bgelr-", + "bger", + "bnllr", + "bnllr-", + "bnlr", + "bgelrl", + "bgelrl-", + "bgerl", + "bnllrl", + "bnllrl-", + "bnlrl", + "blelr", + "blelr-", + "bler", + "bnglr", + "bnglr-", + "bngr", + "blelrl", + "blelrl-", + "blerl", + "bnglrl", + "bnglrl-", + "bngrl", + "bnelr", + "bnelr-", + "bner", + "bnelrl", + "bnelrl-", + "bnerl", + "bnslr", + "bnslr-", + "bnsr", + "bnulr", + "bnulr-", + "bnslrl", + "bnslrl-", + "bnsrl", + "bnulrl", + "bnulrl-", + "bgelr+", + "bnllr+", + "bgelrl+", + "bnllrl+", + "blelr+", + "bnglr+", + "blelrl+", + "bnglrl+", + "bnelr+", + "bnelrl+", + "bnslr+", + 
"bnulr+", + "bnslrl+", + "bnulrl+", + "bgelr-", + "bnllr-", + "bgelrl-", + "bnllrl-", + "blelr-", + "bnglr-", + "blelrl-", + "bnglrl-", + "bnelr-", + "bnelrl-", + "bnslr-", + "bnulr-", + "bnslrl-", + "bnulrl-", + "bgelr+", + "bnllr+", + "bgelrl+", + "bnllrl+", + "blelr+", + "bnglr+", + "blelrl+", + "bnglrl+", + "bnelr+", + "bnelrl+", + "bnslr+", + "bnulr+", + "bnslrl+", + "bnulrl+", + "bltlr", + "bltlr-", + "bltr", + "bltlrl", + "bltlrl-", + "bltrl", + "bgtlr", + "bgtlr-", + "bgtr", + "bgtlrl", + "bgtlrl-", + "bgtrl", + "beqlr", + "beqlr-", + "beqr", + "beqlrl", + "beqlrl-", + "beqrl", + "bsolr", + "bsolr-", + "bsor", + "bunlr", + "bunlr-", + "bsolrl", + "bsolrl-", + "bsorl", + "bunlrl", + "bunlrl-", + "bltlr+", + "bltlrl+", + "bgtlr+", + "bgtlrl+", + "beqlr+", + "beqlrl+", + "bsolr+", + "bunlr+", + "bsolrl+", + "bunlrl+", + "bltlr-", + "bltlrl-", + "bgtlr-", + "bgtlrl-", + "beqlr-", + "beqlrl-", + "bsolr-", + "bunlr-", + "bsolrl-", + "bunlrl-", + "bltlr+", + "bltlrl+", + "bgtlr+", + "bgtlrl+", + "beqlr+", + "beqlrl+", + "bsolr+", + "bunlr+", + "bsolrl+", + "bunlrl+", + + "bdnzflr", + "bdnzflr-", + "bdnzflrl", + "bdnzflrl-", + "bdnzflr+", + "bdnzflrl+", + "bdzflr", + "bdzflr-", + "bdzflrl", + "bdzflrl-", + "bdzflr+", + "bdzflrl+", + "bflr", + "bflr-", + "bbfr", + "bflrl", + "bflrl-", + "bbfrl", + "bflr+", + "bflrl+", + "bflr-", + "bflrl-", + "bflr+", + "bflrl+", + "bdnztlr", + "bdnztlr-", + "bdnztlrl", + "bdnztlrl-", + "bdnztlr+", + "bdnztlrl+", + "bdztlr", + "bdztlr-", + "bdztlrl", + "bdztlrl-", + "bdztlr+", + "bdztlrl+", + "btlr", + "btlr-", + "bbtr", + "btlrl", + "btlrl-", + "bbtrl", + "btlr+", + "btlrl+", + "btlr-", + "btlrl-", + "btlr+", + "btlrl+", + + "bclr-", + "bclrl-", + "bclr+", + "bclrl+", + "bclr", + "bcr", + "bclrl", + "bcrl", + + #"bctr", + #"bctrl", + + "bgectr", + "bgectr-", + "bnlctr", + "bnlctr-", + "bgectrl", + "bgectrl-", + "bnlctrl", + "bnlctrl-", + "blectr", + "blectr-", + "bngctr", + "bngctr-", + "blectrl", + "blectrl-", + "bngctrl", + 
class PpcArchitecture(Architecture):
    """Instruction classification for PowerPC disassembly.

    Classification is a pure opcode lookup in the ppc_* opcode tables defined
    in this module.
    """

    # Union of all branch opcodes, built once at class creation so that
    # is_branch() is a constant-time lookup instead of concatenating and
    # scanning two lists on every call.
    _BRANCH_OPCODES = frozenset(
        ppc_conditional_branch_opcodes + ppc_unconditional_branch_opcodes
    )

    def comment(self):
        """Return the character that starts a comment in PPC disassembly."""
        return "#"

    def is_call(self, instruction: Instruction):
        """Return True if the opcode is listed in ppc_call_opcodes."""
        return instruction.opcode in ppc_call_opcodes

    def is_branch(self, instruction: Instruction):
        """Return True for conditional or unconditional branches that are not calls."""
        return instruction.opcode in self._BRANCH_OPCODES and not self.is_call(instruction)

    def is_unconditional_branch(self, instruction: Instruction):
        """Return True if the opcode is listed in ppc_unconditional_branch_opcodes."""
        return instruction.opcode in ppc_unconditional_branch_opcodes

    def is_sink(self, instruction: Instruction):
        """Return True if the opcode is listed in ppc_sink_opcodes."""
        return instruction.opcode in ppc_sink_opcodes

    def is_direct_branch(self, instruction: Instruction):
        """Return True for a branch with a hex-looking token among its operands."""
        # The operands are joined so the target may sit in any operand position.
        return (
            self.is_branch(instruction)
            and re.search(HEX_LONG_PATTERN, "|".join(instruction.ops)) is not None
        )
class SparcArchitecture(Architecture):
    """Instruction classification for SPARC V8 disassembly.

    Every predicate is a membership test against the sparc_v8_* opcode tables
    defined in this module.
    """

    def comment(self):
        """Return the character that starts a comment in SPARC disassembly."""
        return "!"

    def is_sink(self, instruction: Instruction):
        """Return True if the opcode is listed in sparc_v8_sink_opcodes."""
        return instruction.opcode in sparc_v8_sink_opcodes

    def is_call(self, instruction: Instruction):
        """Return True if the opcode is listed in sparc_v8_call_opcodes."""
        return instruction.opcode in sparc_v8_call_opcodes

    def is_unconditional_branch(self, instruction: Instruction):
        """Return True if the opcode is listed in sparc_v8_unconditional_branch_opcodes."""
        return instruction.opcode in sparc_v8_unconditional_branch_opcodes

    def is_branch(self, instruction: Instruction):
        """Return True for conditional (including trap) or unconditional branch opcodes."""
        opcode = instruction.opcode
        if opcode in sparc_v8_conditional_branch_opcodes:
            return True
        return opcode in sparc_v8_unconditional_branch_opcodes

    def is_direct_branch(self, instruction: Instruction):
        """Return True for any branch: each one is disassembled with the complete offset."""
        return self.is_branch(instruction)

    def get_branch_delay(self, instruction: Instruction) -> int | None:
        """Return 2 for delayed opcodes and sinks, 1 for everything else.

        NOTE(review): unlike the base class contract ("None if not a branch")
        this never returns None - confirm that callers rely on the value 1 for
        non-branch instructions before changing it.
        """
        delayed = instruction.opcode in sparc_v8_delayed_opcodes
        return 2 if delayed or self.is_sink(instruction) else 1
class X86Architecture(Architecture):
    """Instruction classification for x86 disassembly.

    All predicates look only at the textual opcode and operands of the given
    instruction and return plain booleans.
    """

    def comment(self):
        """Return the character that starts a comment in x86 disassembly."""
        return "#"

    def is_call(self, instruction: Instruction):
        """Return True if the opcode contains 'call'.

        Various flavors of call:
          call   *0x26a16(%rip)
          call   0x555555555542
          addr32 call 0x55555558add0
        """
        return "call" in instruction.opcode

    def is_branch(self, instruction: Instruction):
        """Return True for every opcode starting with 'j' (jmp, jne, ...)."""
        # startswith() instead of opcode[0]: an empty opcode must not raise.
        return instruction.opcode.startswith("j")

    def is_direct_branch(self, instruction: Instruction):
        """Return True for a branch whose first operand starts with a hex address."""
        return (
            self.is_branch(instruction)
            and bool(instruction.ops)
            and re.match(HEX_LONG_PATTERN, instruction.ops[0]) is not None
        )

    def is_unconditional_branch(self, instruction: Instruction):
        """Return True for opcodes starting with 'jmp'."""
        return instruction.opcode.startswith("jmp")

    def is_sink(self, instruction: Instruction):
        """Return True for opcodes starting with 'ret'."""
        return instruction.opcode.startswith("ret")
class OcGraphConfiguration:
    """Implement configuration presets for the ASM2CFG tool.

    An instance exposes ``platform`` and ``architecture`` (copied from the
    selected ``architecture_option`` entry), ``disassembler`` (looked up in
    ``disassembler_option``) and ``logger``.
    """

    # Logging mechanism for module
    logger: OCGraphLogger
    # Target architecture instance
    architecture: Architecture
    # Target disassembler tool like OBJDump, GDB, ...
    disassembler: Disassembler

    def __init__(self, arch: str = "sparc", disassembler: str = "OBJDUMP", preset="default"):
        """Build the configuration for one architecture / disassembler pair.

        Raises:
            NotImplementedError: if the architecture, the disassembler or the
                logging preset is unknown, or if the disassembler has no entry
                for the requested architecture.
        """
        if architecture_option.get(arch) is None:
            raise NotImplementedError("Architecture option not supported!")
        if disassembler_option.get(disassembler) is None:
            raise NotImplementedError("Disassembler option not supported!")
        if logging_preset.get(preset) is None:
            raise NotImplementedError("Logging preset not supported!")
        if arch not in disassembler_option[disassembler]:
            raise NotImplementedError("Disassembler option not supported!")

        # load module preset
        # Copy the preset entries into this instance instead of adopting the
        # module-level dict as __dict__: aliasing it made every configuration
        # of the same architecture share (and overwrite) one attribute dict,
        # including the disassembler and the logger.
        self.__dict__.update(architecture_option[arch])
        self.disassembler = disassembler_option[disassembler][arch]

        # configure logging
        self.logger = OCGraphLogger("OcGraph", preset, "asm2cfg.log")

    @staticmethod
    def architectures():
        """Return all available architectures options"""
        return architecture_option.keys()

    @staticmethod
    def disassemblers():
        """Return all available disassemblers options"""
        return disassembler_option.keys()

    @staticmethod
    def loggers():
        """Return all available logging preset options"""
        return logging_preset.keys()
class Disassembler(ABC):
    """Abstract interface every disassembler-output parser has to implement.

    Concrete subclasses turn the text produced by one disassembler tool into
    function names, Instruction objects and jump targets.
    """

    def __init__(self):
        # Default architecture is SPARC; the instance attribute can be replaced
        # for other targets.
        self.architecture = SparcArchitecture()

    name: str = ""
    """ Disassembler tool identification like SparcV8Objdump, GDB, ..."""

    @abstractmethod
    def extract_information(self, str_input: str) -> dict[str, str]:
        """Extract the fields of one disassembled instruction from str_input.

        The returned dictionary is required to contain these keys:
        * address: instruction location in the binary
        * location: instruction address location
        * instr_d: instruction in disassembled format
        * instr_h: instruction in hex-notation
        * opcode: instruction opcode
        * printable: a printable line of the collected information
        """
        raise NotImplementedError()

    @abstractmethod
    def parse_function_header(self, line: str) -> str | None:
        """Return the function name or memory range found in line, or None if there is none."""
        raise NotImplementedError()

    @abstractmethod
    def parse_line(self, line: str, lineno, function_name: str) -> Instruction | None:
        """Parse a single line of assembly into an Instruction instance.

        lineno is the line number to record on the instruction and
        function_name the function the line belongs to; None is returned for
        lines that do not describe an instruction.
        """

    @abstractmethod
    def parse_jump_target(self, str_input: str) -> int | None:
        """Parse the operands of a branch (e.g. comma separated operands) and
        return the jump target value.
        """
class GdbDisassembler(Disassembler):
    """Parser for GDB disassembly listings (objdump-style headers are accepted too)."""

    name: str = "Default GDB Disassembler (x86 Binutils)"

    # Expected format: "<address>[ <location>]: <hex encoding> <instruction>"
    regex: str = r"(\S+)( <(\S+)>|):\s+([\S ]+)\s([\S ]+)"
    # Capture-group index of each field in `regex`.
    regex_information: dict[str, int] = {
        "address": 1,
        "location": 3,
        "instruction_hex": 4,
        "instruction_str": 5,
    }

    def extract_information(self, str_input: str) -> dict[str, str]:
        """Extract the fields of the instruction marked with '=> ' in str_input.

        Returns a dict with the keys address, location, instr_h, instr_d,
        opcode and printable (all strings).

        Raises:
            DisassemblerError: if there is no '=> ' marker or the marked line
                does not match the expected format.
        """
        if "=> " not in str_input:
            raise DisassemblerError("Line not processable: \n" + str(str_input))

        # Only the first line after the marker is of interest.
        extracted_line = str_input.split("=> ", 1)[1].split("\n", 1)[0]

        information = re.search(self.regex, extracted_line)
        if not information:
            raise DisassemblerError("Line not processable: \n" + str(extracted_line))

        groups = self.regex_information
        instr_d = str(information.group(groups["instruction_str"]))
        # The opcode is the first word of the instruction text; it is a plain
        # string (not a 1-tuple) as promised by the return annotation.
        words = instr_d.split()

        return {
            "address": str(information.group(groups["address"])),
            "location": str(information.group(groups["location"])),
            "instr_h": str(information.group(groups["instruction_hex"])),
            "instr_d": instr_d,
            "opcode": words[0] if words else "",
            "printable": extracted_line,
        }

    def parse_function_header(self, line: str) -> str | None:
        """
        Return function name of memory range from the given string line.

        Match lines for non-stripped binaries:
        'Dump of assembler code for function test_function:'
        lines for stripped binaries:
        'Dump of assembler code from 0x555555555faf to 0x555555557008:'
        and lines for objdump disassembly:
        '0000000000016bb0 <_obstack_allocated_p@@Base>:'
        """
        objdump_name = re.search(rf"{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:", line)
        if objdump_name is not None:
            return objdump_name[1]

        function_name = re.search(r"function (\w+):$", line)
        if function_name is not None:
            return function_name[1]

        memory_range = re.search(
            rf"(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$", line
        )
        if memory_range is not None:
            return f"{memory_range[1]}-{memory_range[2]}"

        return None

    def parse_address(self, line: str) -> tuple[Address | None, str]:
        """
        Parses leading address of instruction.

        Returns the Address and the text after the ':' separator, or
        (None, line) when the line does not start with an address.
        """
        address_match = re.match(rf"^\s*(?:0x)?({HEX_PATTERN})\s*(?:<([+-][0-9]+)>)?:(.*)", line)
        if address_match is None:
            return None, line
        address = Address(
            int(address_match[1], 16), None, int(address_match[2]) if address_match[2] else None
        )
        return address, address_match[3]

    def split_nth(self, string: str, count: int) -> List[str]:
        """
        Splits string to equally-sized chunks
        """
        return [string[i : i + count] for i in range(0, len(string), count)]

    def parse_encoding(self, line):
        """
        Parses byte encoding of instruction for objdump disassemblies
        e.g. the '31 c0' in
          '16bd3: 31 c0 xor %eax,%eax'
        In addition to X86 supports ARM encoding styles:
          '4: e1a01000 mov r1, r0'
          '50: f7ff fffe bl 0 <__aeabi_dadd>'
          '54: 0002 movs r2, r0'

        Returns the Encoding and the remaining text, or (None, line) when no
        encoding is present.
        """
        # Encoding is separated from assembly mnemonic via tab
        # so we allow whitespace separators between bytes
        # to avoid accidentally matching the mnemonic.
        enc_match = re.match(r"^\s*((?:[0-9a-f]{2,8} +)+)(.*)", line)
        if enc_match is None:
            return None, line
        bites = []
        for chunk in enc_match[1].strip().split(" "):
            bites.extend(int(byte, 16) for byte in self.split_nth(chunk, 2))
        return Encoding(bites), enc_match[2]

    def parse_body(self, line: str) -> tuple[str | None, str | None, List[str] | None, str]:
        """Parses instruction body (opcode and operands).

        Returns (body, opcode, operands, remaining text); the first three are
        None when nothing could be parsed.
        """
        comment_symbol = self.architecture.comment()
        # The body ends at the comment symbol or at a '<' target hint.
        body_match = re.match(rf"^\s*([^{comment_symbol}<]+)(.*)", line)
        if body_match is None:
            return None, None, None, line
        body = body_match[1].strip()
        line = body_match[2]
        opcode_match = re.match(r"^(\S*)\s*(.*)", body)
        if opcode_match is None:
            return None, None, None, line
        opcode = opcode_match[1]
        ops = opcode_match[2].split(",") if opcode_match[2] else []
        return body, opcode, ops, line

    def parse_target(self, line: str) -> tuple[Address | None, str]:
        """
        Parses optional instruction branch target hint ('<symbol+offset>').

        Returns the symbolic Address and the remaining text, or (None, line)
        when the line does not start with such a hint.
        """
        target_match = re.match(r"\s*<([a-zA-Z_@0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)", line)
        if target_match is None:
            return None, line
        offset = target_match[2] or "+0"
        address = Address(None, target_match[1], int(offset, 0))
        return address, target_match[3]

    def parse_comment(self, line: str) -> tuple[Address | None, str]:
        """
        Parses optional instruction comment.

        Returns the address mentioned in the comment (if any) and the text
        left over after it.
        """
        comment_symbol = self.architecture.comment()
        comment_match = re.match(rf"^\s*{comment_symbol}\s*(.*)", line)
        if comment_match is None:
            return None, line
        comment = comment_match[1]
        imm_match = re.match(rf"^(?:0x)?({HEX_PATTERN})\s*(<.*>)?(.*)", comment)
        if imm_match is None:
            # If no imm was found, ignore the comment.
            # In particular this takes care of useless ARM comments like
            #   '82: 46c0 nop ; (mov r8, r8)'
            return None, ""
        abs_addr = int(imm_match[1], 16)
        target = None
        if imm_match[2]:
            target, _ = self.parse_target(imm_match[2])
        if target is None:
            # No hint, or a hint parse_target() does not understand: keep at
            # least the absolute address instead of failing on None.
            target = Address(abs_addr)
        else:
            target.abs = abs_addr
        return target, imm_match[3]

    def parse_line(self, line: str, lineno, function_name: str) -> Instruction | None:
        """Parses a single line of assembly to create Instruction instance.

        Returns None when the line has no leading address, no opcode, or
        cannot be parsed completely.
        """
        # Strip GDB prefix and leading whites
        line = line.removeprefix("=> ")
        line = line.lstrip()

        address, line = self.parse_address(line)
        if address is None:
            return None

        original_line = line
        body, opcode, ops, line = self.parse_body(line)
        if opcode is None:
            return None

        target, line = self.parse_target(line)

        _, line = self.parse_comment(line)
        if line:
            # Expecting complete parse
            return None

        # Set base symbol for relative addresses
        if address.base is None:
            address.base = function_name
        if target is not None and target.base is None:
            target.base = function_name

        return Instruction(
            body,
            original_line.strip(),
            lineno,
            address,
            opcode,
            ops,
            target,
        )

    def parse_jump_target(self, str_input: str | List[str]) -> int | None:
        """Return the first hex value found in the branch operands, or None.

        str_input is either the list of operands (its first element is
        searched) or a single operand string.
        """
        if isinstance(str_input, str):
            operand = str_input
        elif str_input:
            operand = str_input[0]
        else:
            return None
        hex_match = re.search(HEX_LONG_PATTERN, operand)
        if hex_match is None:
            return None
        return int(hex_match[0], 16)
class ObjDumpArmDisassembler(Disassembler):
    """Parser for objdump disassembly of ARM binaries."""

    name: str = "ARM Objdump Disassembler"

    # Expected format: "<address>[ <location>]: <hex encoding> <instruction>"
    regex: str = r"(\S+)( <(\S+)>|):\s+([\S ]+)\s([\S ]+)"
    # Capture-group index of each field in `regex`.
    regex_information: dict[str, int] = {
        "address": 1,
        "location": 3,
        "instruction_hex": 4,
        "instruction_str": 5,
    }

    def __init__(self):
        """Use the ARM architecture so that ';' is treated as the comment symbol."""
        super().__init__()
        # Imported here to leave the module's import block untouched; the base
        # class default is the SPARC architecture, whose comment symbol does
        # not match the ARM comments handled by parse_body()/parse_comment().
        from ..architecture.arm import ArmArchitecture

        self.architecture = ArmArchitecture()

    def extract_information(self, str_input: str) -> dict[str, str]:
        """Extract the fields of the instruction marked with '=> ' in str_input.

        Returns a dict with the keys address, location, instr_h, instr_d,
        opcode and printable (all strings).

        Raises:
            DisassemblerError: if there is no '=> ' marker or the marked line
                does not match the expected format.
        """
        if "=> " not in str_input:
            raise DisassemblerError("Line not processable: \n" + str(str_input))

        # Only the first line after the marker is of interest.
        extracted_line = str_input.split("=> ", 1)[1].split("\n", 1)[0]

        information = re.search(self.regex, extracted_line)
        if not information:
            raise DisassemblerError("Line not processable: \n" + str(extracted_line))

        groups = self.regex_information
        instr_d = str(information.group(groups["instruction_str"]))
        # The opcode is the first word of the instruction text; it is a plain
        # string (not a 1-tuple) as promised by the return annotation.
        words = instr_d.split()

        return {
            "address": str(information.group(groups["address"])),
            "location": str(information.group(groups["location"])),
            "instr_h": str(information.group(groups["instruction_hex"])),
            "instr_d": instr_d,
            "opcode": words[0] if words else "",
            "printable": extracted_line,
        }

    def parse_function_header(self, line: str) -> str | None:
        """
        Return function name of memory range from the given string line.

        Match lines for non-stripped binaries:
        'Dump of assembler code for function test_function:'
        lines for stripped binaries:
        'Dump of assembler code from 0x555555555faf to 0x555555557008:'
        and lines for objdump disassembly:
        '0000000000016bb0 <_obstack_allocated_p@@Base>:'
        """
        objdump_name = re.search(rf"{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:", line)
        if objdump_name is not None:
            return objdump_name[1]

        function_name = re.search(r"function (\w+):$", line)
        if function_name is not None:
            return function_name[1]

        memory_range = re.search(
            rf"(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$", line
        )
        if memory_range is not None:
            return f"{memory_range[1]}-{memory_range[2]}"

        return None

    def parse_address(self, line: str) -> tuple[Address | None, str]:
        """
        Parses leading address of instruction.

        Returns the Address and the text after the ':' separator, or
        (None, line) when the line does not start with an address.
        """
        address_match = re.match(rf"^\s*(?:0x)?({HEX_PATTERN})\s*(?:<([+-][0-9]+)>)?:(.*)", line)
        if address_match is None:
            return None, line
        address = Address(
            int(address_match[1], 16), None, int(address_match[2]) if address_match[2] else None
        )
        return address, address_match[3]

    def split_nth(self, string: str, count: int) -> List[str]:
        """
        Splits string to equally-sized chunks
        """
        return [string[i : i + count] for i in range(0, len(string), count)]

    def parse_encoding(self, line):
        """
        Parses byte encoding of instruction for objdump disassemblies
        e.g. the '31 c0' in
          '16bd3: 31 c0 xor %eax,%eax'
        In addition to X86 supports ARM encoding styles:
          '4: e1a01000 mov r1, r0'
          '50: f7ff fffe bl 0 <__aeabi_dadd>'
          '54: 0002 movs r2, r0'

        Returns the Encoding and the remaining text, or (None, line) when no
        encoding is present.
        """
        # Encoding is separated from assembly mnemonic via tab
        # so we allow whitespace separators between bytes
        # to avoid accidentally matching the mnemonic.
        enc_match = re.match(r"^\s*((?:[0-9a-f]{2,8} +)+)(.*)", line)
        if enc_match is None:
            return None, line
        bites = []
        for chunk in enc_match[1].strip().split(" "):
            bites.extend(int(byte, 16) for byte in self.split_nth(chunk, 2))
        return Encoding(bites), enc_match[2]

    def parse_body(self, line: str) -> tuple[str | None, str | None, List[str] | None, str]:
        """Parses instruction body (opcode and operands).

        Returns (body, opcode, operands, remaining text); the first three are
        None when nothing could be parsed.
        """
        comment_symbol = self.architecture.comment()
        # The body ends at the comment symbol or at a '<' target hint.
        body_match = re.match(rf"^\s*([^{comment_symbol}<]+)(.*)", line)
        if body_match is None:
            return None, None, None, line
        body = body_match[1].strip()
        line = body_match[2]
        opcode_match = re.match(r"^(\S*)\s*(.*)", body)
        if opcode_match is None:
            return None, None, None, line
        opcode = opcode_match[1]
        ops = opcode_match[2].split(",") if opcode_match[2] else []
        return body, opcode, ops, line

    def parse_target(self, line: str) -> tuple[Address | None, str]:
        """
        Parses optional instruction branch target hint ('<symbol+offset>').

        Returns the symbolic Address and the remaining text, or (None, line)
        when the line does not start with such a hint.
        """
        target_match = re.match(r"\s*<([.a-zA-Z_@0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)", line)
        if target_match is None:
            return None, line
        offset = target_match[2] or "+0"
        address = Address(None, target_match[1], int(offset, 0))
        return address, target_match[3]

    def parse_comment(self, line: str) -> tuple[Address | None, str]:
        """
        Parses optional instruction comment.

        Returns the address mentioned in the comment (if any) and the text
        left over after it.
        """
        comment_symbol = self.architecture.comment()
        comment_match = re.match(rf"^\s*{comment_symbol}\s*(.*)", line)
        if comment_match is None:
            return None, line
        comment = comment_match[1]
        imm_match = re.match(rf"^(?:0x)?({HEX_PATTERN})\s*(<.*>)?(.*)", comment)
        if imm_match is None:
            # If no imm was found, ignore the comment.
            # In particular this takes care of useless ARM comments like
            #   '82: 46c0 nop ; (mov r8, r8)'
            return None, ""
        abs_addr = int(imm_match[1], 16)
        target = None
        if imm_match[2]:
            target, _ = self.parse_target(imm_match[2])
        if target is None:
            # No hint, or a hint parse_target() does not understand: keep at
            # least the absolute address instead of failing on None.
            target = Address(abs_addr)
        else:
            target.abs = abs_addr
        return target, imm_match[3]

    def parse_line(self, line: str, lineno, function_name: str) -> Instruction | None:
        """
        Parses a single line of assembly to create Instruction instance.

        Returns None when the line has no leading address, carries only an
        encoding, has no opcode, or cannot be parsed completely.
        """
        # Strip GDB prefix and leading whites
        line = line.removeprefix("=> ")
        line = line.lstrip()

        address, line = self.parse_address(line)
        if address is None:
            return None

        _encoding, line = self.parse_encoding(line)
        if not line:
            # Nothing but an (optional) encoding on this line: there is no
            # instruction to report, so do not hand the Encoding to the caller.
            return None

        original_line = line
        body, opcode, ops, line = self.parse_body(line)
        if opcode is None:
            return None

        target, line = self.parse_target(line)

        _, line = self.parse_comment(line)
        if line:
            # Expecting complete parse
            return None

        # Set base symbol for relative addresses
        if address.base is None:
            address.base = function_name
        if target is not None and target.base is None:
            target.base = function_name

        return Instruction(
            body,
            original_line.strip(),
            lineno,
            address,
            opcode,
            ops,
            target,
        )

    def parse_jump_target(self, ops: List[str]) -> int | None:
        """Return the jump target taken from the branch operands, or None.

        The last operand is assumed to hold the target address in hex
        notation; None is returned when there are no operands or the last one
        is not a hex number.
        """
        if not ops:
            return None
        try:
            return int(ops[-1], 16)
        except ValueError:
            return None
def parse_function_header(self, line: str) -> str | None:
    """
    Return the function name or memory range announced by a header line.

    Three header shapes are tried in this order:
    objdump symbol headers
    '0000000000016bb0 <_obstack_allocated_p@@Base>:'
    GDB headers of non-stripped binaries
    'Dump of assembler code for function test_function:'
    GDB headers of stripped binaries, reported as '<start>-<end>'
    'Dump of assembler code from 0x555555555faf to 0x555555557008:'

    Returns None when the line is not a header.
    """
    if (symbol := re.search(rf"{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:", line)) is not None:
        return symbol[1]

    if (named := re.search(r"function (\w+):$", line)) is not None:
        return named[1]

    bounds = re.search(
        rf"(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$", line
    )
    if bounds is None:
        return None
    return f"{bounds[1]}-{bounds[2]}"
def parse_comment(self, line: str) -> tuple[Address | None, str]:
    """Parse the optional trailing comment of an instruction.

    A comment starts with the architecture's comment symbol. Only comments
    that begin with a hexadecimal immediate are interpreted; the immediate
    becomes the absolute address of the returned target, and a following
    ``<symbol+offset>`` hint supplies base and offset when it can be parsed.

    Returns:
        ``(None, line)`` if *line* does not start with a comment,
        ``(None, "")`` if the comment carries no immediate (it is dropped, as
        for ARM's '82: 46c0 nop ; (mov r8, r8)'), otherwise
        ``(target, rest)`` with the text following the immediate and hint.
    """
    comment_symbol = self.architecture.comment()
    comment_match = re.match(rf"^\s*{comment_symbol}\s*(.*)", line)
    if comment_match is None:
        return None, line

    imm_match = re.match(rf"^(?:0x)?({HEX_PATTERN})\s*(<.*>)?(.*)", comment_match[1])
    if imm_match is None:
        return None, ""

    abs_addr = int(imm_match[1], 16)
    target = None
    if imm_match[2]:
        target, _ = self.parse_target(imm_match[2])
    if target is None:
        # No hint, or a hint parse_target cannot read (it returns None then):
        # keep the absolute address instead of failing on ``None.abs``.
        target = Address(abs_addr)
    else:
        target.abs = abs_addr
    return target, imm_match[3]
def extract_information(self, str_input: str) -> dict[str, str]:
    """Extract the fields of the current-instruction line of a disassembly.

    The current instruction is the line carrying the ``"=> "`` marker; only the
    text between that marker and the next newline is matched against
    ``self.regex``. The group numbers come from ``self.regex_information``.

    Returns:
        A dict with the keys ``address``, ``location``, ``instr_h``,
        ``instr_d``, ``opcode`` and ``printable`` (the extracted line).
        ``location`` is the string ``"None"`` when the line has no ``<...>``
        part, because the unmatched group is passed through ``str()``.

    Raises:
        DisassemblerError: if the marker is missing, the line does not match
            ``self.regex``, or the instruction text is blank.
    """
    marker = "=> "
    if marker not in str_input:
        raise DisassemblerError("Line not processable: \n" + str(str_input))

    extracted_line = str_input.split(marker, 1)[1].split("\n", 1)[0]
    information = re.search(self.regex, extracted_line)
    if not information:
        raise DisassemblerError("Line not processable: \n" + str(extracted_line))

    groups = self.regex_information
    instr_d = str(information.group(groups["instruction_str"]))
    mnemonic = instr_d.split()
    if not mnemonic:
        # The instruction group ([\S ]+) also accepts blanks only; report that
        # as an unprocessable line instead of failing with a bare IndexError.
        raise DisassemblerError("Line not processable: \n" + str(extracted_line))

    return {
        "address": str(information.group(groups["address"])),
        "location": str(information.group(groups["location"])),
        "instr_h": str(information.group(groups["instruction_hex"])),
        "instr_d": instr_d,
        # NOTE(review): this has always been a 1-tuple (stray trailing comma?)
        # although the annotation promises str; kept so callers see no change.
        "opcode": (mnemonic[0],),
        "printable": extracted_line,
    }
def parse_encoding(self, line):
    """
    Split the raw byte encoding off the front of an objdump line.

    The encoding is one or more groups of 2 to 8 lowercase hex digits, each
    followed by at least one blank, e.g. the '31 c0' in
    '16bd3: 31 c0 xor %eax,%eax'
    or the ARM styles
    '4: e1a01000 mov r1, r0'
    '50: f7ff fffe bl 0 <__aeabi_dadd>'
    '54: 0002 movs r2, r0'

    Returns (Encoding, rest) on success and (None, line) when the line does
    not start with an encoding.
    """
    # Only blanks may separate the groups: objdump puts a tab in front of the
    # mnemonic, which stops the match before a hex-looking opcode is consumed.
    enc_match = re.match(r"^\s*((?:[0-9a-f]{2,8} +)+)(.*)", line)
    if enc_match is None:
        return None, line

    encoded, remainder = enc_match.groups()
    bites = [
        int(byte, 16)
        for chunk in encoded.strip().split(" ")
        for byte in self.split_nth(chunk, 2)
    ]
    return Encoding(bites), remainder
def parse_line(self, line: str, lineno, function_name: str) -> Instruction | Encoding | None:
    """Parse a single line of assembly.

    The line is consumed left to right: GDB's ``"=> "`` marker, address,
    byte encoding, body (opcode and operands), branch-target hint, comment.

    Returns:
        * an ``Instruction`` when the whole line was parsed,
        * the result of ``parse_encoding`` when nothing follows the encoding
          (an encoding-only continuation line of the previous instruction),
        * ``None`` when the line has no address, no opcode, or text is left
          over after the comment.
    """
    # Strip GDB prefix and leading whites
    line = line.removeprefix("=> ").lstrip()

    address, line = self.parse_address(line)
    if address is None:
        return None

    encoding, line = self.parse_encoding(line)
    if not line:
        return encoding

    original_line = line
    body, opcode, ops, line = self.parse_body(line)
    if opcode is None:
        return None

    target, line = self.parse_target(line)
    _, line = self.parse_comment(line)
    if line:
        # Expecting complete parse
        return None

    # Relative addresses are anchored to the enclosing function.
    if address.base is None:
        address.base = function_name
    if target is not None and target.base is None:
        target.base = function_name

    return Instruction(body, original_line.strip(), lineno, address, opcode, ops, target)
def extract_information(self, str_input: str) -> dict[str, str]:
    """Extract the fields of the current-instruction line of a disassembly.

    The current instruction is the line carrying the ``"=> "`` marker; only the
    text between that marker and the next newline is matched against
    ``self.regex``. The group numbers come from ``self.regex_information``.

    Returns:
        A dict with the keys ``address``, ``location``, ``instr_h``,
        ``instr_d``, ``opcode`` and ``printable`` (the extracted line).
        ``location`` is the string ``"None"`` when the line has no ``<...>``
        part, because the unmatched group is passed through ``str()``.

    Raises:
        DisassemblerError: if the marker is missing, the line does not match
            ``self.regex``, or the instruction text is blank.
    """
    marker = "=> "
    if marker not in str_input:
        raise DisassemblerError("Line not processable: \n" + str(str_input))

    extracted_line = str_input.split(marker, 1)[1].split("\n", 1)[0]
    information = re.search(self.regex, extracted_line)
    if not information:
        raise DisassemblerError("Line not processable: \n" + str(extracted_line))

    groups = self.regex_information
    instr_d = str(information.group(groups["instruction_str"]))
    mnemonic = instr_d.split()
    if not mnemonic:
        # The instruction group ([\S ]+) also accepts blanks only; report that
        # as an unprocessable line instead of failing with a bare IndexError.
        raise DisassemblerError("Line not processable: \n" + str(extracted_line))

    return {
        "address": str(information.group(groups["address"])),
        "location": str(information.group(groups["location"])),
        "instr_h": str(information.group(groups["instruction_hex"])),
        "instr_d": instr_d,
        # NOTE(review): this has always been a 1-tuple (stray trailing comma?)
        # although the annotation promises str; kept so callers see no change.
        "opcode": (mnemonic[0],),
        "printable": extracted_line,
    }
def split_nth(self, string: str, count: int) -> List[str]:
    """
    Cut *string* into consecutive pieces of *count* characters.

    The last piece is shorter when the length is not a multiple of *count*;
    an empty string yields an empty list.
    """
    pieces = []
    for start in range(0, len(string), count):
        pieces.append(string[start : start + count])
    return pieces
def parse_target(self, line: str) -> tuple[Address | None, str]:
    """Parse the optional ``<symbol+offset>`` branch-target hint.

    The symbol may contain letters, digits, ``_``, ``@`` and ``.`` (needed
    for names such as ``foo.part.0`` or ``.L3``; the other objdump
    disassemblers in this project accept it as well). The offset is optional
    and either ``0x``-prefixed hex or decimal.

    Returns:
        ``(address, rest)`` with a relative ``Address`` (no absolute part,
        offset 0 when none is given), or ``(None, line)`` when *line* does not
        start with a target hint.
    """
    target_match = re.match(r"\s*<([.a-zA-Z_@0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)", line)
    if target_match is None:
        return None, line

    symbol, offset, rest = target_match.groups()
    # Base 0 lets int() accept both the hex and the decimal offset form.
    return Address(None, symbol, int(offset or "+0", 0)), rest
+ # In particular this takes care of useless ARM comments like + # '82: 46c0 nop ; (mov r8, r8)' + return None, "" + abs_addr = int(imm_match[1], 16) + if imm_match[2]: + target, _ = self.parse_target(imm_match[2]) + target.abs = abs_addr + else: + target = Address(abs_addr) + return target, imm_match[3] + + def parse_line(self, line: str, lineno, function_name: str) -> Instruction | None: + """ + Parses a single line of assembly to create Instruction instance + """ + # Strip GDB prefix and leading whites + line = line.removeprefix("=> ") + line = line.lstrip() + + address, line = self.parse_address(line) + if address is None: + return None + + encoding, line = self.parse_encoding(line) + if not line: + return encoding + + original_line = line + body, opcode, ops, line = self.parse_body(line) + if opcode is None: + return None + + target, line = self.parse_target(line) + + _, line = self.parse_comment(line) + if line: + # Expecting complete parse + return None + + # Set base symbol for relative addresses + if address.base is None: + address.base = function_name + if target is not None and target.base is None: + target.base = function_name + + return Instruction( + body, + original_line.strip(), + lineno, + address, + opcode, + ops, + target, + ) + + def parse_jump_target(self, ops: List[str]) -> int | None: + # it assumes the first operand to contain the target address + return int(ops[-1], 16) diff --git a/ocgraph/configuration/logger.py b/ocgraph/configuration/logger.py new file mode 100644 index 0000000..badab2a --- /dev/null +++ b/ocgraph/configuration/logger.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: GTDGmbH +# Copyright 2024 by GTD GmbH. 
# fmt: off
# Handler setup per preset: whether a file / console handler is attached and
# the level each of them filters at.
logging_preset: dict[str, dict] = {
    "development": {
        "file_log": True,
        "file_level": logging.DEBUG,
        "console_log": True,
        "console_level": logging.DEBUG
    },

    "module": {
        "file_log": False,
        "file_level": logging.ERROR,
        "console_log": False,
        "console_level": logging.ERROR
    },

    "default": {
        "file_log": True,
        "file_level": logging.INFO,
        "console_log": True,
        "console_level": logging.INFO
    },
}
# fmt: on


class OCGraphLogger(logging.Logger):
    """Logger whose handlers are set up from a named entry of ``logging_preset``."""

    def __init__(self, name: str, preset="default", file=""):
        """Create the logger and attach the handlers the preset asks for.

        Args:
            name: logger name, passed on to ``logging.Logger``.
            preset: key of ``logging_preset``.
            file: path of the log file without the ``.log`` suffix; only used
                when the preset enables file logging.

        Raises:
            ValueError: if *preset* is not a key of ``logging_preset``.
        """
        super().__init__(name)
        try:
            log_config = logging_preset[preset]
        except KeyError:
            raise ValueError(
                f"Unknown logging preset {preset!r}; expected one of {sorted(logging_preset)}"
            ) from None

        if log_config["file_log"]:
            # FileHandler opens (and creates) the file right away.
            file_stream: logging.FileHandler = logging.FileHandler(file + ".log")
            file_stream.setLevel(log_config["file_level"])
            self.addHandler(file_stream)
        if log_config["console_log"]:
            console_stream: logging.StreamHandler = logging.StreamHandler()
            console_stream.setLevel(log_config["console_level"])
            self.addHandler(console_stream)
class BasicBlock:
    """CFG node: a run of instructions plus its (at most two) outgoing edges."""

    def __init__(self, key):
        self.key = key
        self.instructions: List[Instruction] = []
        # Keys of the successor blocks; None while the edge is not set.
        self.jump_edge = None
        self.no_jump_edge = None

    @staticmethod
    def _as_key(block_or_key):
        """Return the key of a BasicBlock, or the argument itself otherwise."""
        if isinstance(block_or_key, BasicBlock):
            return block_or_key.key
        return block_or_key

    def _listing(self) -> str:
        """Return the text of all instructions, one per line."""
        return "\n".join(instr.text for instr in self.instructions)

    def add_instruction(self, instruction):
        """Append an instruction to the end of this block."""
        self.instructions.append(instruction)

    def add_jump_edge(self, basic_block_key) -> None:
        """Record the block (or block key) reached when the branch is taken."""
        self.jump_edge = self._as_key(basic_block_key)

    def add_no_jump_edge(self, basic_block_key) -> None:
        """Record the block (or block key) reached when the branch is not taken."""
        self.no_jump_edge = self._as_key(basic_block_key)

    def __str__(self) -> str:
        return self._listing()

    def __repr__(self) -> str:
        return self._listing()
class Coverage(Enum):
    """Enumeration for coverage records"""

    MISS = "missed"
    """Indicates instruction is not executed"""
    LINE_TAKEN = "taken"
    """Indicates instruction is executed"""
    JUMP_PASS = "skipped"
    """Indicates the branch was only passed, never jumped"""
    JUMP_TAKEN = "jumped"
    """Indicates the branch was only jumped, never passed"""
    JUMP_BOTH = "both taken"
    """Indicates the branch was both jumped and passed"""


class Instruction:
    """A single assembly instruction with its operands, location and optional branch target."""

    body: str = None
    """Disassembled instruction code (without extra information)"""

    text: str = None
    """Complete Disassembled instruction code"""

    lineno: int = None
    """Line number in the file"""

    address: Address = None
    """Computed address of the instruction"""

    opcode: str = None
    """Disassembled opcode"""

    # No class-level list/set defaults: they would be shared by all instances.
    ops: List[str]
    """Disassembled operands of the instruction"""

    target: Address = None
    """Optional target of the instruction (branch)"""

    returns: set[int]
    """Stores addresses of sink instructions like return"""

    coverage: Coverage = Coverage.MISS
    """If line is executed on test"""

    branch_taken: bool = None

    def __init__(self, body, text, lineno, address, opcode, ops, target):
        self.body = body
        self.text = text
        self.lineno = lineno
        self.address = address
        self.opcode = opcode
        self.ops = ops
        self.target = target
        self.returns = set()
        self.coverage = Coverage.MISS

    def update_coverage(self, addresses: set[int], is_branch=False) -> None:
        """Update the coverage information of the instruction.

        Args:
            addresses: the addresses recorded for this instruction (stored in
                ``returns`` for non-branches, compared with the branch target
                for branches).
            is_branch: whether the instruction is to be treated as a branch.

        Raises:
            AssertionError: if a branch has coverage data that is neither a
                single address nor two addresses including the branch target.
        """
        if not is_branch:
            # exception for ret (target is None) or call (target.abs is None)
            self.coverage = Coverage.LINE_TAKEN
            self.returns = addresses
            return

        # A branch without a known target can never count as "jumped".
        target_abs = self.target.abs if self.target is not None else None
        jumped = target_abs in addresses
        if len(addresses) == 2 and jumped:
            self.coverage = Coverage.JUMP_BOTH
        elif len(addresses) == 1:
            self.coverage = Coverage.JUMP_TAKEN if jumped else Coverage.JUMP_PASS
        else:
            raise AssertionError(f"Invalid Coverage Information at {self.address}: {addresses}")

    def __str__(self):
        result = f"{self.address}: {self.opcode}"
        if self.ops:
            result += f" {self.ops}"
        if self.target is not None:
            result += " -> " + str(self.target)
        return result

    def __repr__(self) -> str:
        return self.__str__()
+ self.abs_sources: Dict[int, Address] = {} + self.rel_sources: Dict[int, Address] = {} + + # Addresses where jumps end inside the current function. + self.abs_destinations: Set[int] = set() + self.rel_destinations: Set[int] = set() + + # Iterate over the lines and collect jump targets and branching points. + for instr in instructions: + if instr is None or not self.config.architecture.is_direct_branch(instr): + continue + + self.abs_sources[instr.address.abs] = instr.target + self.abs_destinations.add(instr.target.abs) + + self.rel_sources[instr.address.offset] = instr.target + self.rel_destinations.add(instr.target.offset) + + def is_jump_target(self, addr: Address) -> bool: + """Return if address is a destination""" + if addr.abs is not None: + return addr.abs in self.abs_destinations + if addr.offset is not None: + return addr.offset in self.rel_destinations + return False + + def get_target(self, addr: Address): + """Return the target of a branch""" + if addr.abs is not None: + return self.abs_sources.get(addr.abs) + if addr.offset is not None: + return self.rel_sources.get(addr.offset) + return None diff --git a/ocgraph/interface/__init__.py b/ocgraph/interface/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ocgraph/interface/analyzer.py b/ocgraph/interface/analyzer.py new file mode 100755 index 0000000..2427109 --- /dev/null +++ b/ocgraph/interface/analyzer.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +"""Class for read and analyze the input string.""" + +import sys + +from ..data.address import Address +from ..data.basic_block import BasicBlock +from ..data.encoding import Encoding +from ..data.instruction import Instruction +from ..data.jump_table import JumpTable + +from ..configuration.configuration import OcGraphConfiguration, Disassembler + + +class Analyzer: + """Analyzer Class""" + + def __init__(self, config: OcGraphConfiguration): + self.configuration = config + self.logger = self.configuration.logger + self.parser: 
Disassembler = self.configuration.disassembler + + self.lines: list[str] = [] + self.function_name = None + self.instructions: list[Instruction] = [] + self.jump_table = None + self.basic_blocks: dict[int, BasicBlock] = {} + + def parse_file(self, file_path: str): + """Parse a assembler file""" + with open(file_path, "r", encoding="utf8") as asm_file: + lines = asm_file.readlines() + self.parse_lines(lines) + + def parse_lines(self, lines): + """Parse a list of assembly lines""" + self.lines = lines + self._parse_instructions() + self._compute_jump_targets() + self._create_jump_table() + self._create_basic_blocks() + + def _parse_instructions(self): + self.instructions = [] + for num, line in enumerate(self.lines, 1): + current_function_name = self.parser.parse_function_header(line) + if current_function_name is not None: + if self.function_name is not None: + raise RuntimeError("we handle only one function for now") + self.logger.info(f"New function {current_function_name}") + self.function_name = current_function_name + continue + + instruction_or_encoding = self.parser.parse_line(line, num, self.function_name) + if isinstance(instruction_or_encoding, Encoding): + # Partial encoding for previous instruction, skip it + continue + if instruction_or_encoding is not None: + self.instructions.append(instruction_or_encoding) + continue + + if line.startswith("End of assembler dump") or not line: + continue + + if line.strip() == "": + continue + + self.logger.error(f"Unexpected assembly at line {num}:\n {line}") + sys.exit(1) + + def _compute_jump_targets(self): + # Infer target address for jump instructions + for instr in self.instructions: + if ( + instr.target is None or instr.target.abs is None + ) and self.configuration.architecture.is_direct_branch(instr): + if instr.target is None: + instr.target = Address(0) + # parse the absolute target out of the operands + # (first hex address is assumed to be the target address) + instr.target.abs = 
self.parser.parse_jump_target(instr.ops) + + # Infer relative addresses (for objdump or stripped gdb) + start_address = self.instructions[0].address.abs + end_address = self.instructions[-1].address.abs + for instr in self.instructions: + for addr in (instr.address, instr.target): + if ( + addr is not None + and addr.offset is None + and start_address <= addr.abs <= end_address + ): + addr.offset = addr.abs - start_address + + self.logger.debug("Instructions:") + for instruction in self.instructions: + if instruction is not None: + self.logger.debug(f" {instruction}") + + def _create_jump_table(self): + self.jump_table = JumpTable(self.instructions, self.configuration) + + self.logger.debug("Absolute destinations:") + for dst in self.jump_table.abs_destinations: + self.logger.debug(f" {dst:#x}") + self.logger.debug("Relative destinations:") + for dst in self.jump_table.rel_destinations: + self.logger.debug(f" {dst:#x}") + self.logger.debug("Absolute branches:") + for key, addr in self.jump_table.abs_sources.items(): + self.logger.debug(f" {key:#x} -> {addr}") + self.logger.debug("Relative branches:") + for key, addr in self.jump_table.rel_sources.items(): + self.logger.debug(f" {key:#x} -> {addr}") + + def _create_basic_blocks(self) -> None: + """ + Now iterate over the assembly again and split it to basic blocks using the branching + information from earlier. 
+ """ + self.basic_blocks = {} + + curr_basic_block: BasicBlock | None = None + # Store last block if ending with branch opcode + prev_branch_block: BasicBlock | None = None + + # block completion flag (introduced for SPARC pipeline) + block_completion: int | None = 0 + + for instruction in self.instructions: + # if block completion is in progress + if block_completion is not None and block_completion > 0: + block_completion -= 1 + if block_completion > 0: + self.basic_blocks[curr_basic_block.key].add_instruction(instruction) + continue + curr_basic_block = None + + # Current program counter + pc_addr = instruction.address + # Get optional jump target + jump_target = self.jump_table.get_target(pc_addr) + is_unconditional = self.configuration.architecture.is_unconditional_branch(instruction) + + # Start new blocks if last ended + if curr_basic_block is None: + # Create new basic block + self.basic_blocks[pc_addr.abs] = curr_basic_block = BasicBlock(key=pc_addr.abs) + + # if previous basic block ended in branch instruction. Add the basic + # block what follows if the jump was not taken. 
+ if prev_branch_block is not None: + prev_branch_block.add_no_jump_edge(curr_basic_block) + prev_branch_block = None + # or if current address is a jump target + elif self.jump_table.is_jump_target(pc_addr): + closing_block = curr_basic_block + self.basic_blocks[pc_addr.abs] = curr_basic_block = BasicBlock(key=pc_addr.abs) + closing_block.add_no_jump_edge(pc_addr.abs) + + curr_basic_block.add_instruction(instruction) + + # End current block if current opcode is a jump/branch/sink + if jump_target: + curr_basic_block.add_jump_edge(jump_target.abs) + prev_branch_block = None if is_unconditional else curr_basic_block + block_completion = self.configuration.architecture.get_branch_delay(instruction) + elif self.configuration.architecture.is_sink(instruction): + block_completion = self.configuration.architecture.get_branch_delay(instruction) + prev_branch_block = None + + if prev_branch_block is not None: + # If last instruction of the function is jump/call, then add dummy + # block to designate end of the function. 
+ end_instruction = Instruction("", "end of function", 0, None, None, [], None) + end_block = BasicBlock("end_of_function") + end_block.add_instruction(end_instruction) + prev_branch_block.add_no_jump_edge(end_block.key) + self.basic_blocks[end_block.key] = end_block diff --git a/ocgraph/interface/coverage_reader.py b/ocgraph/interface/coverage_reader.py new file mode 100755 index 0000000..18dde93 --- /dev/null +++ b/ocgraph/interface/coverage_reader.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH +"""Class for read coverage input and update the instruction.""" +import ast +import csv + +from ..data.instruction import Instruction +from ..configuration.configuration import OcGraphConfiguration + + +class CoverageReader: # pylint: disable=too-few-public-methods + """CoverageReader Class""" + + def __init__(self, instructions: [Instruction], config: OcGraphConfiguration): + self.instructions = instructions + self.config = config + + def update_by_csv(self, file_path: str): + """Read coverage csv file and update""" + # Store for coverage information + coverage_info: dict[int, set[int]] = {} + + # read the csv file. 
expected values in address and branch_jumps + with open(file_path, "r", newline="", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + _temp = ast.literal_eval(row["branch_jumps"]) + coverage_info[int(row["address"], 0)] = {int(x, 0) for x in _temp} + # update instructions + for instr in self.instructions: + if coverage_info.get(instr.address.abs, None) is not None: + is_branch = self.config.architecture.is_branch(instr) + instr.update_coverage(coverage_info[instr.address.abs], is_branch=is_branch) diff --git a/ocgraph/interface/drawer.py b/ocgraph/interface/drawer.py new file mode 100755 index 0000000..e2ba199 --- /dev/null +++ b/ocgraph/interface/drawer.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH +"""Class for drawing the output.""" + +import tempfile + +from typing import Dict + +from graphviz import Digraph + +from ..configuration.configuration import OcGraphConfiguration +from ..data.basic_block import BasicBlock +from ..data.instruction import Coverage + +coverage_color = { + Coverage.MISS: "#f08080", # light coral + Coverage.LINE_TAKEN: "#90ee90", # light green + Coverage.JUMP_BOTH: "#90ee90", # light green + Coverage.JUMP_PASS: "#fdfd96", # pastel yellow + Coverage.JUMP_TAKEN: "#fdfd96", # pastel yellow +} + + +class Drawer: + """Drawer Class""" + + def __init__(self, config: OcGraphConfiguration, graph_options: dict = None) -> None: + self.config = config + self.graph_option = graph_options if graph_options else {} + + def set_graph_option(self, graph_options: dict = None) -> None: + """Set new graph options""" + self.graph_option = graph_options + + @staticmethod + def _escape(text: str) -> str: + """ + Escape used dot graph characters in given instruction so they will be + displayed correctly. 
+ """ + text = text.replace("<", r"<") + text = text.replace(">", r">") + text = text.replace("\t", " ") + return text + + def _create_label(self, basic_block: BasicBlock, line_coverage=False): + """Create annotated graph label""" + label = "" + returns = set() + + # start label + label += '< \n' + # for each instruction in block + for instr in basic_block.instructions: + bg_color = coverage_color[instr.coverage] if line_coverage else "white" + label += ( + "" + f'\n" + ) + if self.config.architecture.is_sink(instr): + returns = instr.returns + for return_addr in returns: + if isinstance(return_addr, int): + returns.remove(return_addr) + returns.add(f"0x{return_addr:x}") + + # add JUMP/NO JUMP cells with dot PORT navigation + cells = [basic_block.jump_edge, basic_block.no_jump_edge] + span = 3 - len([x for x in cells if x is not None]) # 3 - count of trues in cells + + label += "" + if basic_block.no_jump_edge: + label += f'' + if basic_block.jump_edge: + label += f'' + if not basic_block.jump_edge and not basic_block.no_jump_edge: + label += f'' + label += " \n
' + f"0x{instr.address.abs:0>8x}: {instr.opcode:<10} {Drawer._escape(text=instr.text.removeprefix(instr.opcode).strip())}" + "
NO JUMPJUMPRETURN targets: {str(returns) if returns else "--"}
>" + return label + + def _create_cfg(self, name: str, basic_blocks: Dict[int, BasicBlock], coverage=False): + """Create a cgf""" + dot = Digraph(name=name, comment=name, engine="dot") + dot.attr("graph", label=name) + dot.attr("graph", fontname="Mono") + dot.attr("node", fontname="Mono") + dot.attr("edge", fontname="Mono") + + # Create nodes in graph + for address, basic_block in basic_blocks.items(): + key = str(address) + label = self._create_label(basic_block, coverage) + dot.node(name=key, label=label, shape="plaintext", **self.graph_option) + + # Create edges in graph + for basic_block in basic_blocks.values(): + if basic_block.jump_edge: + dot.edge(f"{basic_block.key}:jump", str(basic_block.jump_edge)) + if basic_block.no_jump_edge: + dot.edge(f"{basic_block.key}:pass", str(basic_block.no_jump_edge)) + return dot + + def view_cfg(self, name: str, basic_blocks: Dict[int, BasicBlock]): + """view a function graph""" + dot = self._create_cfg(name, basic_blocks) + dot.format = "gv" + with tempfile.NamedTemporaryFile(mode="w+b", prefix=name) as filename: + dot.view(filename.name) + print(f"Opening a file {filename.name}.{dot.format} with default viewer.") + + def draw_cfg(self, name: str, basic_blocks: Dict[int, BasicBlock], output: str = None): + """Draw a function graph""" + dot = self._create_cfg(name, basic_blocks, coverage=True) + + filename = output if output else name + dot.format = "pdf" + dot.render(filename=filename, cleanup=True) + self.config.logger.info(f"Saved CFG to a file {name}.{dot.format}") + + dot.format = "gv" + dot.render(filename=filename, cleanup=True) diff --git a/ocrecord/__init__.py b/ocrecord/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/gdb_asm2cfg.py b/ocrecord/gdb_asm2cfg.py similarity index 57% rename from src/gdb_asm2cfg.py rename to ocrecord/gdb_asm2cfg.py index a40a712..b2a039a 100644 --- a/src/gdb_asm2cfg.py +++ b/ocrecord/gdb_asm2cfg.py @@ -4,13 +4,12 @@ For further information see 
https://sourceware.org/gdb/current/onlinedocs/gdb/Python.html#Python. """ - - import traceback import gdb -from asm2cfg import asm2cfg +from ..ocgraph.interface.drawer import Drawer +from ..ocgraph.interface.analyzer import Analyzer class SkipCalls(gdb.Parameter): @@ -20,25 +19,25 @@ class SkipCalls(gdb.Parameter): set skipcalls off """ - set_doc = 'Set whether savecfg and viewcfg commands will skip function calls from splitting CFG blocks' - show_doc = 'Set whether savecfg and viewcfg commands will skip function calls from splitting CFG blocks' + set_doc = "Set whether savecfg and viewcfg commands will skip function calls from splitting CFG blocks" + show_doc = "Set whether savecfg and viewcfg commands will skip function calls from splitting CFG blocks" def __init__(self): - super().__init__('skipcalls', gdb.COMMAND_DATA, gdb.PARAM_BOOLEAN) + super().__init__("skipcalls", gdb.COMMAND_DATA, gdb.PARAM_BOOLEAN) self.value = False def get_set_string(self): - return f'Commands savecfg and viewcfg will skip function calls \ - from splitting CFG blocks: {self.value_to_string()}' + return f"Commands savecfg and viewcfg will skip function calls \ + from splitting CFG blocks: {self.value_to_string()}" def get_show_string(self, _): - return f'Commands savecfg and viewcfg will skip function calls \ - from splitting CFG blocks: {self.value_to_string()}' + return f"Commands savecfg and viewcfg will skip function calls \ + from splitting CFG blocks: {self.value_to_string()}" def value_to_string(self): if self.value: - return 'on' - return 'off' + return "on" + return "off" class ViewCfg(gdb.Command): # pylint: disable=too-few-public-methods @@ -50,23 +49,25 @@ class ViewCfg(gdb.Command): # pylint: disable=too-few-public-methods """ def __init__(self): - super().__init__('viewcfg', gdb.COMMAND_USER) + super().__init__("viewcfg", gdb.COMMAND_USER) def invoke(self, _arg, _from_tty): # pylint: disable=bad-option-value,no-self-use - """ Called by GDB when viewcfg command is invoked 
""" + """Called by GDB when viewcfg command is invoked""" try: frame = gdb.selected_frame() arch = frame.architecture().name() - if arch.startswith('i386'): - target_name = 'x86' - elif arch.startswith('arm'): - target_name = 'arm' + if arch.startswith("i386"): + target_name = "x86" + elif arch.startswith("arm"): + target_name = "arm" + elif arch.startswith("sparc"): + target_name = "sparc" else: - raise RuntimeError(f'unknown platform: {arch}') - assembly_lines = gdb.execute('disassemble', from_tty=False, to_string=True).split('\n') - function_name, basic_blocks = asm2cfg.parse_lines(assembly_lines, gdb.parameter('skipcalls'), - target_name) - asm2cfg.draw_cfg(function_name, basic_blocks, view=True) + raise RuntimeError(f"unknown platform: {arch}") + assembly_lines = gdb.execute("disassemble", from_tty=False, to_string=True).split("\n") + analyzer = Analyzer(config=target_name + " GDB") + analyzer.parse_lines(assembly_lines) + Drawer(analyzer.configuration).view_cfg(analyzer.function_name, analyzer.basic_blocks) # Catch error coming from GDB side before other errors. 
except gdb.error as ex: raise gdb.GdbError(ex) @@ -83,15 +84,15 @@ class SaveCfg(gdb.Command): # pylint: disable=too-few-public-methods """ def __init__(self): - super().__init__('savecfg', gdb.COMMAND_USER) + super().__init__("savecfg", gdb.COMMAND_USER) def invoke(self, _arg, _from_tty): # pylint: disable=no-self-use - """ Called by GDB when savecfg command is invoked """ + """Called by GDB when savecfg command is invoked""" try: - assembly_lines = gdb.execute('disassemble', from_tty=False, to_string=True).split('\n') - function_name, basic_blocks = asm2cfg.parse_lines(assembly_lines, gdb.parameter('skipcalls'), - 'x86') - asm2cfg.draw_cfg(function_name, basic_blocks, view=False) + assembly_lines = gdb.execute("disassemble", from_tty=False, to_string=True).split("\n") + analyzer = Analyzer(config="x86 GDB") + analyzer.parse_lines(assembly_lines) + Drawer(analyzer.configuration).view_cfg(analyzer.function_name, analyzer.basic_blocks) # Catch error coming from GDB side before other errors. 
except gdb.error as ex: raise gdb.GdbError(ex) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a31f820 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +graphviz==0.9 diff --git a/scripts/batch_objdump.sh b/scripts/batch_objdump.sh new file mode 100755 index 0000000..0c1ff66 --- /dev/null +++ b/scripts/batch_objdump.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env python +# SPDX-License-Identifier: GTDGmbH + +function_array=( + acos + asin + atan + atan2 + cos + sin + tan + acosh + atanh + cosh + sinh + tanh + exp + exp2 + expm1 + frexp + ilogb + ldexp + log + log10 + log1p + log2 + logb + modf + scalbn + scalbln + cbrt + fabs + hypot + pow + sqrt + erf + erfc + lgamma + tgamma + ceil + floor + nearbyint + rint + lrint + llrint + round + lround + llround + trunc + fmod + remainder + remquo + copysign + nan + nextafter + fdim + fmax + fmin + fma + + acosf + asinf + atanf + atan2f + cosf + sinf + tanf + acoshf + asinhf + atanhf + coshf + sinhf + tanhf + expf + exp2f + expm1f + frexpf + ilogbf + ldexpf + logf + log10f + log1pf + log2f + logbf + modff + scalbnf + scalblnf + cbrtf + fabsf + hypotf + powf + sqrtf + erff + erfcf + lgammaf + tgammaf + ceilf + floorf + nearbyintf + rintf + lrintf + llrintf + roundf + lroundf + llroundf + truncf + fmodf + remainderf + remquof + copysignf + nanf + nextafterf + fdimf + fmaxf + fminf + fmaf +) + +objdump=objdump + +application=./qualification/code/app.exe +coverage_file=./qualification/code/app.exe.csv + +asm_folder=./qualification/asm +pdf_folder=./qualification/pdf + +mkdir -p $asm_folder +mkdir -p $pdf_folder + +for i in ${function_array[@]} +do + $objdump -d $application | sed -ne '/<'$i'>:/,/^$/p' > $asm_folder/$i.asm + python -m ocgraph -c $coverage_file -d 'OBJDUMP' -a sparc -f $asm_folder/$i.asm +done + +mv *.pdf $pdf_folder/ +rm -f *.gv diff --git a/scripts/update_examples.sh b/scripts/update_examples.sh index d60a970..92fb3dd 100755 --- a/scripts/update_examples.sh +++ 
b/scripts/update_examples.sh @@ -9,6 +9,6 @@ for asm in examples/*.asm; do if echo $asm | grep -q 'arm.asm'; then flags="$flags --target arm" fi - pdf=$(python3 -m src.asm2cfg $flags -c $asm | awk '/Saved CFG/{print $NF}') + pdf=$(python3 -m ocgraph $flags -c $asm | awk '/Saved CFG/{print $NF}') mv $pdf $(echo $asm | sed 's/\.asm/\.pdf/') done diff --git a/setup.cfg b/setup.cfg index c25a957..8917b85 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,5 +41,4 @@ logging-format-style = new expected-line-ending-format = LF include-naming-hint = yes ignored-modules = gdb # Ignore because of the GDB integration -notes= # disable warnings for TODO, FIXME etc. -disable=bad-option-value,missing-function-docstring,no-self-use,too-many-instance-attributes,too-many-arguments,too-many-locals,too-many-branches,too-many-statements +disable=duplicate-code,too-many-instance-attributes,too-many-arguments diff --git a/src/asm2cfg/__main__.py b/src/asm2cfg/__main__.py deleted file mode 100644 index e715118..0000000 --- a/src/asm2cfg/__main__.py +++ /dev/null @@ -1,9 +0,0 @@ -""" -Let this module to be executed from the command line with python -m src.asm2cfg -from root of the project -""" - -from . import command_line - -if __name__ == '__main__': - command_line.main() diff --git a/src/asm2cfg/asm2cfg.py b/src/asm2cfg/asm2cfg.py deleted file mode 100644 index 7a93bb4..0000000 --- a/src/asm2cfg/asm2cfg.py +++ /dev/null @@ -1,630 +0,0 @@ -""" -Module containing main building blocks to parse assembly and draw CFGs. -""" - -import re -import sys -import tempfile - -from graphviz import Digraph - - -# TODO: make this a command-line flag -VERBOSE = 0 - - -def escape(instruction): - """ - Escape used dot graph characters in given instruction so they will be - displayed correctly. 
- """ - instruction = instruction.replace('<', r'\<') - instruction = instruction.replace('>', r'\>') - instruction = instruction.replace('|', r'\|') - instruction = instruction.replace('{', r'\{') - instruction = instruction.replace('}', r'\}') - instruction = instruction.replace(' ', ' ') - return instruction - - -class BasicBlock: - """ - Class to represent a node in CFG with straight lines of code without jump - or calls instructions. - """ - - def __init__(self, key): - self.key = key - self.instructions = [] - self.jump_edge = None - self.no_jump_edge = None - - def add_instruction(self, instruction): - """ - Add instruction to this block. - """ - self.instructions.append(instruction) - - def add_jump_edge(self, basic_block_key): - """ - Add jump target block to this block. - """ - if isinstance(basic_block_key, BasicBlock): - self.jump_edge = basic_block_key.key - else: - self.jump_edge = basic_block_key - - def add_no_jump_edge(self, basic_block_key): - """ - Add no jump target block to this block. - """ - if isinstance(basic_block_key, BasicBlock): - self.no_jump_edge = basic_block_key.key - else: - self.no_jump_edge = basic_block_key - - def get_label(self): - """ - Return content of the block for dot graph. - """ - # Left align in dot. - label = r'\l'.join([escape(i.text) for i in self.instructions]) - # Left justify the last line too. - label += r'\l' - if self.jump_edge: - if self.no_jump_edge: - label += '|{No Jump|Jump}' - else: - label += '|{Jump}' - return '{' + label + '}' - - def __str__(self): - return '\n'.join([i.text for i in self.instructions]) - - def __repr__(self): - return '\n'.join([i.text for i in self.instructions]) - - -def print_assembly(basic_blocks): - """ - Debug function to print the assembly. - """ - for basic_block in basic_blocks.values(): - print(basic_block) - - -def read_lines(file_path): - """ Read lines from the file and return then as a list. 
""" - lines = [] - with open(file_path, 'r', encoding='utf8') as asm_file: - lines = asm_file.readlines() - return lines - - -# Common regexes -HEX_PATTERN = r'[0-9a-fA-F]+' -HEX_LONG_PATTERN = r'(?:0x0*)?' + HEX_PATTERN - - -class InputFormat: # pylint: disable=too-few-public-methods - """ - An enum which represents various supported input formats - """ - GDB = 'GDB' - OBJDUMP = 'OBJDUMP' - - -def parse_function_header(line): - """ - Return function name of memory range from the given string line. - - Match lines for non-stripped binaries: - 'Dump of assembler code for function test_function:' - lines for stripped binaries: - 'Dump of assembler code from 0x555555555faf to 0x555555557008:' - and lines for obdjdump disassembly: - '0000000000016bb0 <_obstack_allocated_p@@Base>:' - """ - - objdump_name_pattern = re.compile(fr'{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:') - function_name = objdump_name_pattern.search(line) - if function_name is not None: - return InputFormat.OBJDUMP, function_name[1] - - function_name_pattern = re.compile(r'function (\w+):$') - function_name = function_name_pattern.search(line) - if function_name is not None: - return InputFormat.GDB, function_name[1] - - memory_range_pattern = re.compile(fr'(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$') - memory_range = memory_range_pattern.search(line) - if memory_range is not None: - return InputFormat.GDB, f'{memory_range[1]}-{memory_range[2]}' - - return None, None - - -class Address: - """ - Represents location in program which may be absolute or relative - """ - def __init__(self, abs_addr, base=None, offset=None): - self.abs = abs_addr - self.base = base - self.offset = offset - - def is_absolute(self): - return self.base is None - - def is_relative(self): - return not self.is_absolute() - - def __str__(self): - if self.offset is not None: - return f'0x{self.abs:x} ({self.base}+{self.offset})' - return f'0x{self.abs}' - - def merge(self, other): - if self.abs is not None: - 
assert self.abs is None or self.abs == other.abs - self.abs = other.abs - if self.base is not None: - assert self.base is None or self.base == other.base - self.base = other.base - if self.offset is not None: - assert self.offset is None or self.offset == other.offset - self.offset = other.offset - - -class Encoding: - """ - Represents a sequence of bytes used for instruction encoding - e.g. the '31 c0' in - '16bd3: 31 c0 xor %eax,%eax' - """ - def __init__(self, bites): - self.bites = bites - - def size(self): - return len(self.bites) - - def __str__(self): - return ' '.join(map(lambda b: f'{b:#x}', self.bites)) - - -class X86TargetInfo: - """ - Contains instruction info for X86-compatible targets. - """ - - def __init__(self): - pass - - def comment(self): - return '#' - - def is_call(self, instruction): - # Various flavors of call: - # call *0x26a16(%rip) - # call 0x555555555542 - # addr32 call 0x55555558add0 - return 'call' in instruction.opcode - - def is_jump(self, instruction): - return instruction.opcode[0] == 'j' - - def is_unconditional_jump(self, instruction): - return instruction.opcode.startswith('jmp') - - def is_sink(self, instruction): - """ - Is this an instruction which terminates function execution e.g. return? - """ - return instruction.opcode.startswith('ret') - - -class ARMTargetInfo: - """ - Contains instruction info for ARM-compatible targets. - """ - - def __init__(self): - pass - - def comment(self): - return ';' - - def is_call(self, instruction): - # Various flavors of call: - # bl 0x19d90 <_IO_vtable_check> - # Note that we should be careful to not mix it with conditional - # branches like 'ble'. 
- return instruction.opcode.startswith('bl') \ - and instruction.opcode not in ('blt', 'ble', 'bls') - - def is_jump(self, instruction): - return instruction.opcode[0] == 'b' and not self.is_call(instruction) - - def is_unconditional_jump(self, instruction): - return instruction.opcode == 'b' - - def is_sink(self, instruction): - """ - Is this an instruction which terminates function execution e.g. return? - Detect various flavors of return like - bx lr - pop {r2-r6,pc} - Note that we do not consider conditional branches (e.g. 'bxle') to sink. - """ - return re.search(r'\bpop\b.*\bpc\b', instruction.body) \ - or (instruction.opcode == 'bx' and instruction.ops[0] == 'lr') \ - or instruction.opcode == 'udf' - - -class Instruction: - """ - Represents a single assembly instruction with it operands, location and - optional branch target - """ - def __init__(self, body, text, lineno, address, opcode, ops, target, imm, target_info): # noqa - self.body = body - self.text = text - self.lineno = lineno - self.address = address - self.opcode = opcode - self.ops = ops - self.target = target - self.info = target_info - if imm is not None and (self.is_jump() or self.is_call()): - if self.target is None: - self.target = imm - else: - self.target.merge(imm) - - def is_call(self): - return self.info.is_call(self) - - def is_jump(self): - return self.info.is_jump(self) - - def is_direct_jump(self): - return self.is_jump() and re.match(fr'{HEX_LONG_PATTERN}', self.ops[0]) - - def is_sink(self): - return self.info.is_sink(self) - - def is_unconditional_jump(self): - return self.info.is_unconditional_jump(self) - - def __str__(self): - result = f'{self.address}: {self.opcode}' - if self.ops: - result += f' {self.ops}' - return result - - -def parse_address(line): - """ - Parses leading address of instruction - """ - address_match = re.match(fr'^\s*(?:0x)?({HEX_PATTERN})\s*(?:<([+-][0-9]+)>)?:(.*)', line) - if address_match is None: - return None, line - address = 
Address(int(address_match[1], 16), None, int(address_match[2]) if address_match[2] else None) - return address, address_match[3] - - -def split_nth(string, count): - """ - Splits string to equally-sized chunks - """ - return [string[i:i+count] for i in range(0, len(string), count)] - - -def parse_encoding(line): - """ - Parses byte encoding of instruction for objdump disassemblies - e.g. the '31 c0' in - '16bd3: 31 c0 xor %eax,%eax' - In addition to X86 supports ARM encoding styles: - '4: e1a01000 mov r1, r0' - '50: f7ff fffe bl 0 <__aeabi_dadd>' - '54: 0002 movs r2, r0' - """ - # Encoding is separated from assembly mnemonic via tab - # so we allow whitespace separators between bytes - # to avoid accidentally matching the mnemonic. - enc_match = re.match(r'^\s*((?:[0-9a-f]{2,8} +)+)(.*)', line) - if enc_match is None: - return None, line - bites = [] - for chunk in enc_match[1].strip().split(' '): - bites.extend(int(byte, 16) for byte in split_nth(chunk, 2)) - return Encoding(bites), enc_match[2] - - -def parse_body(line, target_info): - """ - Parses instruction body (opcode and operands) - """ - comment_symbol = target_info.comment() - body_match = re.match(fr'^\s*([^{comment_symbol}<]+)(.*)', line) - if body_match is None: - return None, None, None, line - body = body_match[1].strip() - line = body_match[2] - opcode_match = re.match(r'^(\S*)\s*(.*)', body) - if opcode_match is None: - return None, None, None, line - opcode = opcode_match[1] - ops = opcode_match[2].split(',') if opcode_match[2] else [] - return body, opcode, ops, line - - -def parse_target(line): - """ - Parses optional instruction branch target hint - """ - target_match = re.match(r'\s*<([a-zA-Z_@.0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)', line) - if target_match is None: - return None, line - offset = target_match[2] or '+0' - address = Address(None, target_match[1], int(offset, 0)) - return address, target_match[3] - - -def parse_comment(line, target_info): - """ - Parses optional instruction 
comment - """ - comment_symbol = target_info.comment() - comment_match = re.match(fr'^\s*{comment_symbol}\s*(.*)', line) - if comment_match is None: - return None, line - comment = comment_match[1] - imm_match = re.match(fr'^(?:0x)?({HEX_PATTERN})\s*(<.*>)?(.*)', comment) - if imm_match is None: - # If no imm was found, ignore the comment. - # In particular this takes care of useless ARM comments like - # '82: 46c0 nop ; (mov r8, r8)' - return None, '' - abs_addr = int(imm_match[1], 16) - if imm_match[2]: - target, _ = parse_target(imm_match[2]) - target.abs = abs_addr - else: - target = Address(abs_addr) - return target, imm_match[3] - - -def parse_line(line, lineno, function_name, fmt, target_info): - """ - Parses a single line of assembly to create Instruction instance - """ - - # Strip GDB prefix and leading whites - if line.startswith('=> '): - # Strip GDB marker - line = line[3:] - line = line.lstrip() - - address, line = parse_address(line) - if address is None: - return None - - if fmt == InputFormat.OBJDUMP: - encoding, line = parse_encoding(line) - if not line: - return encoding - - original_line = line - body, opcode, ops, line = parse_body(line, target_info) - if opcode is None: - return None - - target, line = parse_target(line) - - imm, line = parse_comment(line, target_info) - if line: - # Expecting complete parse - return None - - # Set base symbol for relative addresses - if address.base is None: - address.base = function_name - if target is not None and target.base is None: - target.base = function_name - - return Instruction(body, original_line.strip(), lineno, address, opcode, ops, target, imm, target_info) - - -class JumpTable: - """ - Holds info about branch sources and destinations in asm function. - """ - - def __init__(self, instructions): - # Address where the jump begins and value which address - # to jump to. This also includes calls. 
- self.abs_sources = {} - self.rel_sources = {} - - # Addresses where jumps end inside the current function. - self.abs_destinations = set() - self.rel_destinations = set() - - # Iterate over the lines and collect jump targets and branching points. - for inst in instructions: - if inst is None or not inst.is_direct_jump(): - continue - - self.abs_sources[inst.address.abs] = inst.target - self.abs_destinations.add(inst.target.abs) - - self.rel_sources[inst.address.offset] = inst.target - self.rel_destinations.add(inst.target.offset) - - def is_destination(self, address): - if address.abs is not None: - return address.abs in self.abs_destinations - if address.offset is not None: - return address.offset in self.rel_destinations - return False - - def get_target(self, address): - if address.abs is not None: - return self.abs_sources.get(address.abs) - if address.offset is not None: - return self.rel_sources.get(address.offset) - return None - - -def parse_lines(lines, skip_calls, target_name): # noqa pylint: disable=unused-argument - if target_name == 'x86': - target_info = X86TargetInfo() - elif target_name == 'arm': - target_info = ARMTargetInfo() - else: - print(f'Unsupported platform {target_name}') - sys.exit(1) - - instructions = [] - current_function_name = current_format = None - for num, line in enumerate(lines, 1): - fmt, function_name = parse_function_header(line) - if function_name is not None: - assert current_function_name is None, 'we handle only one function for now' - if VERBOSE: - print(f'New function {function_name} (format {fmt})') - current_function_name = function_name - current_format = fmt - continue - - instruction_or_encoding = parse_line(line, num, current_function_name, current_format, target_info) - if isinstance(instruction_or_encoding, Encoding): - # Partial encoding for previous instruction, skip it - continue - if instruction_or_encoding is not None: - instructions.append(instruction_or_encoding) - continue - - if line.startswith('End 
of assembler dump') or not line: - continue - - if line.strip() == '': - continue - - print(f'Unexpected assembly at line {num}:\n {line}') - sys.exit(1) - - # Infer target address for jump instructions - for instruction in instructions: - if (instruction.target is None or instruction.target.abs is None) \ - and instruction.is_direct_jump(): - if instruction.target is None: - instruction.target = Address(0) - instruction.target.abs = int(instruction.ops[0], 16) - - # Infer relative addresses (for objdump or stripped gdb) - start_address = instructions[0].address.abs - end_address = instructions[-1].address.abs - for instruction in instructions: - for address in (instruction.address, instruction.target): - if address is not None \ - and address.offset is None \ - and start_address <= address.abs <= end_address: - address.offset = address.abs - start_address - - if VERBOSE: - print('Instructions:') - for instruction in instructions: - if instruction is not None: - print(f' {instruction}') - - jump_table = JumpTable(instructions) - - if VERBOSE: - print('Absolute destinations:') - for dst in jump_table.abs_destinations: - print(f' {dst:#x}') - print('Relative destinations:') - for dst in jump_table.rel_destinations: - print(f' {dst}') - print('Absolute branches:') - for src, dst in jump_table.abs_sources.items(): - print(f' {src:#x} -> {dst}') - print('Relative branches:') - for src, dst in jump_table.rel_sources.items(): - print(f' {src} -> {dst}') - - # Now iterate over the assembly again and split it to basic blocks using - # the branching information from earlier. - basic_blocks = {} - current_basic_block = None - previous_jump_block = None - for line, instruction in zip(lines, instructions): - if instruction is None: - continue - - # Current offset/address inside the function. 
- program_point = instruction.address - jump_point = jump_table.get_target(program_point) - is_unconditional = instruction.is_unconditional_jump() - - if current_basic_block is None: - current_basic_block = BasicBlock(program_point.abs) - basic_blocks[current_basic_block.key] = current_basic_block - # Previous basic block ended in jump instruction. Add the basic - # block what follows if the jump was not taken. - if previous_jump_block is not None: - previous_jump_block.add_no_jump_edge(current_basic_block) - previous_jump_block = None - elif jump_table.is_destination(program_point): - temp_block = current_basic_block - current_basic_block = BasicBlock(program_point.abs) - basic_blocks[current_basic_block.key] = current_basic_block - temp_block.add_no_jump_edge(current_basic_block) - - current_basic_block.add_instruction(instruction) - - if jump_point is not None: - current_basic_block.add_jump_edge(jump_point.abs) - previous_jump_block = None if is_unconditional else current_basic_block - current_basic_block = None - elif instruction.is_sink(): - previous_jump_block = current_basic_block = None - - if previous_jump_block is not None: - # If last instruction of the function is jump/call, then add dummy - # block to designate end of the function. 
- end_block = BasicBlock('end_of_function') - dummy_instruction = Instruction('', 'end of function', 0, None, None, [], None, None, target_info) - end_block.add_instruction(dummy_instruction) - previous_jump_block.add_no_jump_edge(end_block.key) - basic_blocks[end_block.key] = end_block - - return current_function_name, basic_blocks - - -def draw_cfg(function_name, basic_blocks, view): - dot = Digraph(name=function_name, comment=function_name, engine='dot') - dot.attr('graph', label=function_name) - for address, basic_block in basic_blocks.items(): - key = str(address) - dot.node(key, shape='record', label=basic_block.get_label()) - for basic_block in basic_blocks.values(): - if basic_block.jump_edge: - if basic_block.no_jump_edge is not None: - dot.edge(f'{basic_block.key}:s0', str(basic_block.no_jump_edge)) - dot.edge(f'{basic_block.key}:s1', str(basic_block.jump_edge)) - elif basic_block.no_jump_edge: - dot.edge(str(basic_block.key), str(basic_block.no_jump_edge)) - if view: - dot.format = 'gv' - with tempfile.NamedTemporaryFile(mode='w+b', prefix=function_name) as filename: - dot.view(filename.name) - print(f'Opening a file {filename.name}.{dot.format} with default viewer. Don\'t forget to delete it later.') - else: - dot.format = 'pdf' - dot.render(filename=function_name, cleanup=True) - print(f'Saved CFG to a file {function_name}.{dot.format}') diff --git a/src/asm2cfg/command_line.py b/src/asm2cfg/command_line.py deleted file mode 100644 index 1a5c8eb..0000000 --- a/src/asm2cfg/command_line.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Command-line usage support. -""" - -import argparse -from . import asm2cfg - - -def main(): - """ Command-line entry point to the program. """ - parser = argparse.ArgumentParser( - description='Program to draw dot control-flow graph from GDB disassembly for a function.', - epilog='If function CFG rendering takes too long, try to skip function calls with -c flag.' 
- ) - parser.add_argument('assembly_file', - help='File to contain one function assembly dump') - parser.add_argument('-c', '--skip-calls', action='store_true', - help='Skip function calls from dividing code to blocks') - parser.add_argument('--target', choices=['x86', 'arm'], default='x86', - help='Specify target platform for assembly') - parser.add_argument('-v', '--view', action='store_true', - help='View as a dot graph instead of saving to a file') - args = parser.parse_args() - print('If function CFG rendering takes too long, try to skip function calls with -c flag') - lines = asm2cfg.read_lines(args.assembly_file) - function_name, basic_blocks = asm2cfg.parse_lines(lines, args.skip_calls, args.target) - asm2cfg.draw_cfg(function_name, basic_blocks, args.view) diff --git a/examples/arm.asm b/test/examples/arm.asm similarity index 99% rename from examples/arm.asm rename to test/examples/arm.asm index a3e0416..e165038 100644 --- a/examples/arm.asm +++ b/test/examples/arm.asm @@ -48,4 +48,3 @@ b8: 00000000 .word 0x00000000 bc: 00000103 .word 0x00000103 c0: 00000107 .word 0x00000107 - diff --git a/examples/arm.pdf b/test/examples/arm.pdf similarity index 100% rename from examples/arm.pdf rename to test/examples/arm.pdf diff --git a/examples/att_syntax.asm b/test/examples/att_syntax.asm similarity index 99% rename from examples/att_syntax.asm rename to test/examples/att_syntax.asm index 64c9f1c..d2950d4 100644 --- a/examples/att_syntax.asm +++ b/test/examples/att_syntax.asm @@ -1,5 +1,5 @@ Dump of assembler code for function main: - 0x000000000002ebd0 <+0>: endbr64 + 0x000000000002ebd0 <+0>: endbr64 0x000000000002ebd4 <+4>: push %r15 0x000000000002ebd6 <+6>: push %r14 0x000000000002ebd8 <+8>: push %r13 @@ -16,7 +16,7 @@ Dump of assembler code for function main: 0x000000000002ec05 <+53>: mov %rax,0x128(%rsp) 0x000000000002ec0d <+61>: xor %eax,%eax 0x000000000002ec0f <+63>: callq 0x2eab0 <__sigsetjmp@plt> - 0x000000000002ec14 <+68>: endbr64 + 0x000000000002ec14 
<+68>: endbr64 0x000000000002ec18 <+72>: test %eax,%eax 0x000000000002ec1a <+74>: jne 0x2ec7c 0x000000000002ec1c <+76>: callq 0x42360 @@ -86,7 +86,7 @@ Dump of assembler code for function main: 0x000000000002ed58 <+392>: mov $0x1,%esi 0x000000000002ed5d <+397>: lea 0xf735c(%rip),%rdi # 0x1260c0 0x000000000002ed64 <+404>: callq 0x2eab0 <__sigsetjmp@plt> - 0x000000000002ed69 <+409>: endbr64 + 0x000000000002ed69 <+409>: endbr64 0x000000000002ed6d <+413>: test %eax,%eax 0x000000000002ed6f <+415>: je 0x2ed9d 0x000000000002ed71 <+417>: movl $0x0,0xf29bd(%rip) # 0x121738 @@ -157,7 +157,7 @@ Dump of assembler code for function main: 0x000000000002ef45 <+885>: lea 0xfbd14(%rip),%rdi # 0x12ac60 0x000000000002ef4c <+892>: movl $0x1,0xf27c2(%rip) # 0x121718 0x000000000002ef56 <+902>: callq 0x2eab0 <__sigsetjmp@plt> - 0x000000000002ef5b <+907>: endbr64 + 0x000000000002ef5b <+907>: endbr64 0x000000000002ef5f <+911>: test %eax,%eax 0x000000000002ef61 <+913>: jne 0x2ec7c 0x000000000002ef67 <+919>: mov 0x18(%rsp),%rax @@ -495,7 +495,7 @@ Dump of assembler code for function main: 0x000000000002f570 <+2464>: mov 0xf3982(%rip),%eax # 0x122ef8 0x000000000002f576 <+2470>: mov %eax,0x2c(%rsp) 0x000000000002f57a <+2474>: callq 0x2eab0 <__sigsetjmp@plt> - 0x000000000002f57f <+2479>: endbr64 + 0x000000000002f57f <+2479>: endbr64 0x000000000002f583 <+2483>: test %eax,%eax 0x000000000002f585 <+2485>: je 0x2f5b3 0x000000000002f587 <+2487>: sub $0x3,%eax diff --git a/examples/att_syntax.pdf b/test/examples/att_syntax.pdf similarity index 100% rename from examples/att_syntax.pdf rename to test/examples/att_syntax.pdf diff --git a/examples/huge.asm b/test/examples/huge.asm similarity index 99% rename from examples/huge.asm rename to test/examples/huge.asm index 409f5fe..ab9b1fa 100644 --- a/examples/huge.asm +++ b/test/examples/huge.asm @@ -274,7 +274,7 @@ Dump of assembler code for function main: 0x000055555556ff22 <+1490>: pop %r13 0x000055555556ff24 <+1492>: pop %r14 0x000055555556ff26 
<+1494>: pop %r15 - 0x000055555556ff28 <+1496>: ret + 0x000055555556ff28 <+1496>: ret 0x000055555556ff29 <+1497>: mov 0x180(%rsp),%rax 0x000055555556ff31 <+1505>: cmpb $0x0,0x2a5a8(%rip) # 0x55555559a4e0 0x000055555556ff38 <+1512>: movq $0x0,0x180(%rsp) diff --git a/examples/huge.pdf b/test/examples/huge.pdf similarity index 100% rename from examples/huge.pdf rename to test/examples/huge.pdf diff --git a/examples/objdump.asm b/test/examples/objdump.asm similarity index 82% rename from examples/objdump.asm rename to test/examples/objdump.asm index b41a5d6..eced5b5 100644 --- a/examples/objdump.asm +++ b/test/examples/objdump.asm @@ -1,5 +1,5 @@ 0000000000016bb0 <_obstack_allocated_p@@Base>: - 16bb0: f3 0f 1e fa endbr64 + 16bb0: f3 0f 1e fa endbr64 16bb4: 48 8b 47 08 mov 0x8(%rdi),%rax 16bb8: 48 85 c0 test %rax,%rax 16bbb: 74 29 je 16be6 <_obstack_allocated_p@@Base+0x36> @@ -12,11 +12,11 @@ 16bce: 48 85 c0 test %rax,%rax 16bd1: 75 ed jne 16bc0 <_obstack_allocated_p@@Base+0x10> 16bd3: 31 c0 xor %eax,%eax - 16bd5: c3 retq + 16bd5: c3 retq 16bd6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1) - 16bdd: 00 00 00 + 16bdd: 00 00 00 16be0: b8 01 00 00 00 mov $0x1,%eax - 16be5: c3 retq - 16be6: c3 retq + 16be5: c3 retq + 16be6: c3 retq 16be7: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1) - 16bee: 00 00 + 16bee: 00 00 diff --git a/examples/objdump.pdf b/test/examples/objdump.pdf similarity index 100% rename from examples/objdump.pdf rename to test/examples/objdump.pdf diff --git a/examples/stripped_function.asm b/test/examples/stripped_function.asm similarity index 100% rename from examples/stripped_function.asm rename to test/examples/stripped_function.asm diff --git a/examples/stripped_function.pdf b/test/examples/stripped_function.pdf similarity index 100% rename from examples/stripped_function.pdf rename to test/examples/stripped_function.pdf diff --git a/examples/stripped_objdump.asm b/test/examples/stripped_objdump.asm similarity index 87% rename from 
examples/stripped_objdump.asm rename to test/examples/stripped_objdump.asm index fd13a6c..c3f6c9a 100644 --- a/examples/stripped_objdump.asm +++ b/test/examples/stripped_objdump.asm @@ -1,5 +1,5 @@ 0000000000001000 <.text>: - 1000: f3 0f 1e fa endbr64 + 1000: f3 0f 1e fa endbr64 1004: 55 push %rbp 1005: 48 89 e5 mov %rsp,%rbp 1008: 89 7d fc mov %edi,-0x4(%rbp) @@ -11,4 +11,4 @@ 1019: 8b 45 fc mov -0x4(%rbp),%eax 101c: 0f af c0 imul %eax,%eax 101f: 5d pop %rbp - 1020: c3 retq + 1020: c3 retq diff --git a/examples/stripped_objdump.pdf b/test/examples/stripped_objdump.pdf similarity index 100% rename from examples/stripped_objdump.pdf rename to test/examples/stripped_objdump.pdf diff --git a/examples/test_function.asm b/test/examples/test_function.asm similarity index 99% rename from examples/test_function.asm rename to test/examples/test_function.asm index fbc06e9..75f817c 100644 --- a/examples/test_function.asm +++ b/test/examples/test_function.asm @@ -349,6 +349,6 @@ Dump of assembler code for function test_function: 0x00007ffff7fbf7cb <+1771>: pop %r14 0x00007ffff7fbf7cd <+1773>: pop %r15 0x00007ffff7fbf7cf <+1775>: pop %rbp - 0x00007ffff7fbf7d0 <+1776>: vzeroupper - 0x00007ffff7fbf7d3 <+1779>: ret + 0x00007ffff7fbf7d0 <+1776>: vzeroupper + 0x00007ffff7fbf7d3 <+1779>: ret End of assembler dump. 
diff --git a/examples/test_function.pdf b/test/examples/test_function.pdf similarity index 100% rename from examples/test_function.pdf rename to test/examples/test_function.pdf diff --git a/test/fixtures/simple_program/hello.c b/test/fixtures/simple_program/hello.c index 9869f83..45fd52e 100644 --- a/test/fixtures/simple_program/hello.c +++ b/test/fixtures/simple_program/hello.c @@ -1,5 +1,3 @@ #include -int main() { - printf("Hello World\n"); -} +int main() { printf("Hello World\n"); } diff --git a/test/templates/call.c b/test/templates/call.c index 14d1905..c587b36 100644 --- a/test/templates/call.c +++ b/test/templates/call.c @@ -5,6 +5,4 @@ __attribute__((visibility("hidden"))) void foo() { } -void bar() { - foo(); -} +void bar() { foo(); } diff --git a/test/templates/common.py b/test/templates/common.py index e1dd3cd..6103756 100644 --- a/test/templates/common.py +++ b/test/templates/common.py @@ -19,7 +19,7 @@ def error(msg): """ Print nicely-formatted error message and exit. """ - sys.stderr.write(f'{_ME}: error: {msg}\n') + sys.stderr.write(f"{_ME}: error: {msg}\n") sys.exit(1) @@ -29,13 +29,14 @@ def _run(cmd, stdin=None, verbose=0): """ if verbose: print(f"{_ME}: running command: {' '.join(cmd)}") - with subprocess.Popen(cmd, stdin=stdin, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) as process: + with subprocess.Popen( + cmd, stdin=stdin, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) as process: out, err = process.communicate() out = out.decode() err = err.decode() if process.returncode != 0: - cmds = ' '.join(cmd) + cmds = " ".join(cmd) error(f"'{cmds}' failed:\n{out}{err}") sys.stderr.write(err) return out @@ -45,7 +46,7 @@ def gcc(args): """ Run compiler with given arguments. """ - return _run(['gcc'] + args) + return _run(["gcc"] + args) def disasm(file, objdump_or_gdb, symbol, start, finish): @@ -53,11 +54,11 @@ def disasm(file, objdump_or_gdb, symbol, start, finish): Disassemble binary file. 
""" if objdump_or_gdb: - out = _run(['objdump', '-d', file]) + out = _run(["objdump", "-d", file]) elif symbol is not None: - out = _run(['gdb', '-batch', '-ex', f'disassemble {symbol}', file]) + out = _run(["gdb", "-batch", "-ex", f"disassemble {symbol}", file]) else: - out = _run(['gdb', '-batch', '-ex', f'disassemble {start},{finish}', file]) + out = _run(["gdb", "-batch", "-ex", f"disassemble {start},{finish}", file]) return out @@ -65,23 +66,23 @@ def strip_binary(file): """ Strip symbol info from binary file. """ - _run(['strip', '-s', file]) + _run(["strip", "-s", file]) def grep(text, regex): - lines = text.split('\n') + lines = text.split("\n") return list(filter(lambda s: re.search(regex, s), lines)) def find_address(file, name): - out = _run(['readelf', '-sW', file]) - lines = grep(out, fr'{name}$') - assert len(lines) >= 1, f'failed to locate symbol {name} in\n{out}' + out = _run(["readelf", "-sW", file]) + lines = grep(out, rf"{name}$") + assert len(lines) >= 1, f"failed to locate symbol {name} in\n{out}" line = lines[0] # Num: Value Size Type Bind Vis Ndx Name # 27: 0000000000001030 11 FUNC GLOBAL DEFAULT 9 foo line = line.strip() - words = re.split(r'\s+', line) + words = re.split(r"\s+", line) start = int(words[1], 16) size = int(words[2]) return start, start + size diff --git a/test/templates/gen_calls.py b/test/templates/gen_calls.py index 436026c..9fc1880 100755 --- a/test/templates/gen_calls.py +++ b/test/templates/gen_calls.py @@ -14,53 +14,58 @@ set_basename(os.path.basename(__file__)) -for gdb, pic, plt, direct, strip in itertools.product([False, True], - [False, True], # Do we need to test PIE too? - [False, True], - [False, True], - [False, True]): +for gdb, pic, plt, direct, strip in itertools.product( + [False, True], + [False, True], # Do we need to test PIE too? 
+ [False, True], + [False, True], + [False, True], +): # Print config - disasm_type = 'GDB' if gdb else 'objdump' - pic_type = 'position-INdependent' if pic else 'position-dependent' - call_type = 'Non-PIC-call' if direct else 'PIC-call' - strip_type = 'stripped' if strip else 'UNstripped' - plt_type = 'PLT' if plt else 'PLT-less' - print(f'Checking {disasm_type} {pic_type} {plt_type} {call_type} {strip_type}') + disasm_type = "GDB" if gdb else "objdump" + pic_type = "position-INdependent" if pic else "position-dependent" + call_type = "Non-PIC-call" if direct else "PIC-call" + strip_type = "stripped" if strip else "UNstripped" + plt_type = "PLT" if plt else "PLT-less" + print(f"Checking {disasm_type} {pic_type} {plt_type} {call_type} {strip_type}") # Generate object code - flags = ['call.c', '-o', 'a.out', - '-Wl,--defsym,_start=0', '-nostdlib', '-nostartfiles'] + flags = ["call.c", "-o", "a.out", "-Wl,--defsym,_start=0", "-nostdlib", "-nostartfiles"] # DLL or executable? if pic: - flags += ['-fPIC', '-shared'] + flags += ["-fPIC", "-shared"] # Use PLT? if not plt: - flags += ['-fno-plt'] + flags += ["-fno-plt"] # Force non-PLT call for PIC code? 
if direct and pic: - flags.append('-DHIDDEN') + flags.append("-DHIDDEN") gcc(flags) - caller = 'bar' - start, finish = find_address('a.out', caller) + caller = "bar" + start, finish = find_address("a.out", caller) if strip: - strip_binary('a.out') + strip_binary("a.out") caller = None # Generate disasm - out = disasm('a.out', not gdb, caller, start, finish) + out = disasm("a.out", not gdb, caller, start, finish) # Print snippets - headers = grep(out, r':|Dump of') - calls = grep(out, r'call') - print('''\ + headers = grep(out, r":|Dump of") + calls = grep(out, r"call") + print( + """\ headers: {0} calls: {1} -'''.format('\n '.join(headers), '\n '.join(calls))) +""".format( + "\n ".join(headers), "\n ".join(calls) + ) + ) diff --git a/test/templates/gen_funtable.py b/test/templates/gen_funtable.py index 05a52b1..f0a5971 100755 --- a/test/templates/gen_funtable.py +++ b/test/templates/gen_funtable.py @@ -11,45 +11,56 @@ set_basename(os.path.basename(__file__)) -for gdb, pic, strip in itertools.product([False, True], - [False, True], # Do we need to test PIE too? - [False, True]): +for gdb, pic, strip in itertools.product( + [False, True], [False, True], [False, True] # Do we need to test PIE too? +): # Print config - disasm_type = 'GDB' if gdb else 'objdump' - pic_type = 'position-INdependent' if pic else 'position-dependent' - stripped = 'stripped' if strip else 'UNstripped' - print(f'Checking {disasm_type} {pic_type} {stripped}') + disasm_type = "GDB" if gdb else "objdump" + pic_type = "position-INdependent" if pic else "position-dependent" + stripped = "stripped" if strip else "UNstripped" + print(f"Checking {disasm_type} {pic_type} {stripped}") # Generate object code - flags = ['funtable.c', '-o', 'a.out', - '-Wl,--defsym,_start=0', '-nostdlib', '-nostartfiles', '-O2'] + flags = [ + "funtable.c", + "-o", + "a.out", + "-Wl,--defsym,_start=0", + "-nostdlib", + "-nostartfiles", + "-O2", + ] # DLL or executable? 
if pic: - flags += ['-fPIC', '-shared'] + flags += ["-fPIC", "-shared"] # Include debuginfo? if not strip: - flags.append('-g') + flags.append("-g") gcc(flags) # Strip - caller = 'bar' - start, finish = find_address('a.out', caller) + caller = "bar" + start, finish = find_address("a.out", caller) if strip: - strip_binary('a.out') + strip_binary("a.out") caller = None # Generate disasm - out = disasm('a.out', not gdb, caller, start, finish) + out = disasm("a.out", not gdb, caller, start, finish) # Print snippets - jumps = grep(out, r'\bcall') - print('''\ + jumps = grep(out, r"\bcall") + print( + """\ table calls: {} -'''.format('\n '.join(jumps))) +""".format( + "\n ".join(jumps) + ) + ) diff --git a/test/templates/gen_jumps.py b/test/templates/gen_jumps.py index 2bd7f74..8580394 100755 --- a/test/templates/gen_jumps.py +++ b/test/templates/gen_jumps.py @@ -11,46 +11,48 @@ set_basename(os.path.basename(__file__)) -for gdb, pic, opt, strip in itertools.product([False, True], - [False, True], # Do we need to test PIE too? - [False, True], - [False, True]): +for gdb, pic, opt, strip in itertools.product( + [False, True], [False, True], [False, True], [False, True] # Do we need to test PIE too? 
+): # Print config - disasm_type = 'GDB' if gdb else 'objdump' - pic_type = 'position-INdependent' if pic else 'position-dependent' - opt_type = 'optimized' if opt else 'UNoptimized' - stripped = 'stripped' if strip else 'UNstripped' - print(f'Checking {disasm_type} {pic_type} {opt_type} {stripped}') + disasm_type = "GDB" if gdb else "objdump" + pic_type = "position-INdependent" if pic else "position-dependent" + opt_type = "optimized" if opt else "UNoptimized" + stripped = "stripped" if strip else "UNstripped" + print(f"Checking {disasm_type} {pic_type} {opt_type} {stripped}") # Generate object code - flags = ['jump.c', '-o', 'a.out', - '-Wl,--defsym,_start=0', '-nostdlib', '-nostartfiles'] + flags = ["jump.c", "-o", "a.out", "-Wl,--defsym,_start=0", "-nostdlib", "-nostartfiles"] # DLL or executable? if pic: - flags += ['-fPIC', '-shared'] + flags += ["-fPIC", "-shared"] if opt: - flags.append('-O2') + flags.append("-O2") gcc(flags) # Strip - caller = 'bar' - start, finish = find_address('a.out', caller) + caller = "bar" + start, finish = find_address("a.out", caller) if strip: - strip_binary('a.out') + strip_binary("a.out") caller = None # Generate disasm - out = disasm('a.out', not gdb, caller, start, finish) + out = disasm("a.out", not gdb, caller, start, finish) # Print snippets - jumps = grep(out, r'\bj') - print('''\ + jumps = grep(out, r"\bj") + print( + """\ jumps: {} -'''.format('\n '.join(jumps))) +""".format( + "\n ".join(jumps) + ) + ) diff --git a/test/templates/gen_jumptable.py b/test/templates/gen_jumptable.py index 3bd3710..92d853a 100755 --- a/test/templates/gen_jumptable.py +++ b/test/templates/gen_jumptable.py @@ -11,42 +11,53 @@ set_basename(os.path.basename(__file__)) -for gdb, pic, strip in itertools.product([False, True], - [False, True], # Do we need to test PIE too? - [False, True]): +for gdb, pic, strip in itertools.product( + [False, True], [False, True], [False, True] # Do we need to test PIE too? 
+): # Print config - disasm_type = 'GDB' if gdb else 'objdump' - pic_type = 'position-INdependent' if pic else 'position-dependent' - stripped = 'stripped' if strip else 'UNstripped' - print(f'Checking {disasm_type} {pic_type} {stripped}') + disasm_type = "GDB" if gdb else "objdump" + pic_type = "position-INdependent" if pic else "position-dependent" + stripped = "stripped" if strip else "UNstripped" + print(f"Checking {disasm_type} {pic_type} {stripped}") # Generate object code - flags = ['jumptable.c', '-o', 'a.out', - '-Wl,--defsym,_start=0', '-nostdlib', '-nostartfiles', '-O2'] + flags = [ + "jumptable.c", + "-o", + "a.out", + "-Wl,--defsym,_start=0", + "-nostdlib", + "-nostartfiles", + "-O2", + ] # DLL or executable? if pic: - flags += ['-fPIC', '-shared'] + flags += ["-fPIC", "-shared"] gcc(flags) # Strip - caller = 'bar' - start, finish = find_address('a.out', caller) + caller = "bar" + start, finish = find_address("a.out", caller) if strip: - strip_binary('a.out') + strip_binary("a.out") caller = None # Generate disasm - out = disasm('a.out', not gdb, caller, start, finish) + out = disasm("a.out", not gdb, caller, start, finish) # Print snippets - jumps = grep(out, r'\bjmp') - print('''\ + jumps = grep(out, r"\bjmp") + print( + """\ table jumps: {} -'''.format('\n '.join(jumps))) +""".format( + "\n ".join(jumps) + ) + ) diff --git a/test/test_gdb.py b/test/test_gdb.py index 13d6ead..06c19a9 100644 --- a/test/test_gdb.py +++ b/test/test_gdb.py @@ -7,67 +7,80 @@ def test_savecfg_help(): - result = execute_gdb_command('help savecfg') - assert 'Save an assembly control-flow graph (CFG)' in result.stdout + result = execute_gdb_command("help savecfg") + assert "Save an assembly control-flow graph (CFG)" in result.stdout def test_viewcfg_help(): - result = execute_gdb_command('help viewcfg') - assert 'Draw an assembly control-flow graph (CFG)' in result.stdout + result = execute_gdb_command("help viewcfg") + assert "Draw an assembly control-flow graph (CFG)" in 
result.stdout def test_help_set(): - result = execute_gdb_command('help set') - assert 'set skipcalls -- Set whether savecfg and viewcfg commands will skip function' in result.stdout + result = execute_gdb_command("help set") + assert ( + "set skipcalls -- Set whether savecfg and viewcfg commands will skip function" + in result.stdout + ) def test_help_set_skipcalls(): - result = execute_gdb_command('help set skipcalls') - assert 'Set whether savecfg and viewcfg commands will skip function' in result.stdout + result = execute_gdb_command("help set skipcalls") + assert "Set whether savecfg and viewcfg commands will skip function" in result.stdout def test_help_show_skipcalls(): - result = execute_gdb_command('help show skipcalls') - assert 'Set whether savecfg and viewcfg commands will skip function' in result.stdout + result = execute_gdb_command("help show skipcalls") + assert "Set whether savecfg and viewcfg commands will skip function" in result.stdout def test_show_skipcalls(): - result = execute_gdb_command('show skipcalls') - assert 'Commands savecfg and viewcfg will' in result.stdout + result = execute_gdb_command("show skipcalls") + assert "Commands savecfg and viewcfg will" in result.stdout def test_skipcalls_inital_value(): - result = execute_gdb_command('show skipcalls') - assert parse_option_value(result.stdout) == 'off' + result = execute_gdb_command("show skipcalls") + assert parse_option_value(result.stdout) == "off" def test_change_skipcalls_value(): - result = execute_gdb_command('set skipcalls') - assert parse_option_value(result.stdout) == 'on' + result = execute_gdb_command("set skipcalls") + assert parse_option_value(result.stdout) == "on" def test_savecfg(): result = execute_gdb_commands( - ['set confirm off', 'set breakpoint pending on', - 'file test/fixtures/simple_program/hello', 'b main', - 'run', 'savecfg'] + [ + "set confirm off", + "set breakpoint pending on", + "file test/fixtures/simple_program/hello", + "b main", + "run", + 
"savecfg", + ] ) - assert os.path.isfile('main.pdf'), result.stdout - assert 'Saved CFG to a file main.pdf' in result.stdout, result.stdout + assert os.path.isfile("main.pdf"), result.stdout + assert "Saved CFG to a file main.pdf" in result.stdout, result.stdout def test_viewcfg(): - stdout = '' + stdout = "" try: result = execute_gdb_commands( - ['set confirm off', 'set breakpoint pending on', - 'file test/fixtures/simple_program/hello', 'b main', - 'run', 'viewcfg'] + [ + "set confirm off", + "set breakpoint pending on", + "file test/fixtures/simple_program/hello", + "b main", + "run", + "viewcfg", + ] ) stdout = result.stdout except subprocess.TimeoutExpired as ex: stdout = str(ex.stdout) - viewcfg_pattern = re.compile(r'Opening a file (.*) with default viewer') + viewcfg_pattern = re.compile(r"Opening a file (.*) with default viewer") result = viewcfg_pattern.search(stdout) assert result is not None, stdout @@ -83,23 +96,28 @@ def execute_gdb_command(command): def execute_gdb_commands(commands): project_root_path = os.getcwd() - os.environ['PYTHONPATH'] = f'{project_root_path}/src/' - gdb_script_path = f'{project_root_path}/src/gdb_asm2cfg.py' - gdb_command = ['gdb', '-ex', f'source {gdb_script_path}'] + os.environ["PYTHONPATH"] = f"{project_root_path}/src/" + gdb_script_path = f"{project_root_path}/src/gdb_asm2cfg.py" + gdb_command = ["gdb", "-ex", f"source {gdb_script_path}"] for command in commands: - gdb_command.append('-ex') + gdb_command.append("-ex") gdb_command.append(command) - gdb_command.append('-ex') - gdb_command.append('q') + gdb_command.append("-ex") + gdb_command.append("q") result = subprocess.run( - gdb_command, stdout=subprocess.PIPE, stdin=None, - stderr=None, timeout=2, check=True, universal_newlines=True, + gdb_command, + stdout=subprocess.PIPE, + stdin=None, + stderr=None, + timeout=2, + check=True, + universal_newlines=True, ) return result def parse_option_value(gdb_output): - output_pattern = re.compile(r'blocks: (on|off)') + 
output_pattern = re.compile(r"blocks: (on|off)") result = output_pattern.search(gdb_output) if result: return result.group(1) diff --git a/test/test_parser.py b/test/test_parser.py index e5edeb1..dc4ca7c 100644 --- a/test/test_parser.py +++ b/test/test_parser.py @@ -22,15 +22,15 @@ def setUp(self): self.arm_target_info = asm2cfg.ARMTargetInfo() def test_simple_inst(self): - line = '0x000055555556f957 <+7>: push %r14' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.GDB, self.target_info) + line = "0x000055555556f957 <+7>: push %r14" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.GDB, self.target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'push %r14') + self.assertEqual(i.body, "push %r14") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x000055555556f957) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x000055555556F957) + self.assertIs(i.address.base, "main") self.assertEqual(i.address.offset, 7) self.assertIs(i.target, None) @@ -39,21 +39,21 @@ def test_simple_inst(self): self.assertFalse(i.is_unconditional_jump()) def test_jump(self): - line = '''\ + line = """\ 0x00007ffff7fbf26b <+395>: jmp 0x7ffff7fbf55d \ -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.GDB, self.target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.GDB, self.target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'jmp 0x7ffff7fbf55d') + self.assertEqual(i.body, "jmp 0x7ffff7fbf55d") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x00007ffff7fbf26b) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x00007FFFF7FBF26B) + self.assertIs(i.address.base, "main") self.assertEqual(i.address.offset, 395) self.assertIsNot(i.target, None) self.assertIs(i.target.abs, None) # FIXME - self.assertEqual(i.target.base, 'test_function') + 
self.assertEqual(i.target.base, "test_function") self.assertEqual(i.target.offset, 1149) self.assertFalse(i.is_call()) @@ -61,21 +61,21 @@ def test_jump(self): self.assertTrue(i.is_unconditional_jump()) def test_branch(self): - line = '''\ + line = """\ 0x00007ffff7fbf565 <+1157>: je 0x7ffff7fbf635 \ -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.GDB, self.target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.GDB, self.target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'je 0x7ffff7fbf635') + self.assertEqual(i.body, "je 0x7ffff7fbf635") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x00007ffff7fbf565) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x00007FFFF7FBF565) + self.assertIs(i.address.base, "main") self.assertEqual(i.address.offset, 1157) self.assertIsNot(i.target, None) self.assertIs(i.target.abs, None) # FIXME - self.assertEqual(i.target.base, 'test_function') + self.assertEqual(i.target.base, "test_function") self.assertEqual(i.target.offset, 1365) self.assertFalse(i.is_call()) @@ -83,21 +83,21 @@ def test_branch(self): self.assertFalse(i.is_unconditional_jump()) def test_call(self): - line = '''\ + line = """\ 0x000000000002ec0f <+63>: callq 0x2eab0 <__sigsetjmp@plt> -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.GDB, self.target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.GDB, self.target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'callq 0x2eab0') + self.assertEqual(i.body, "callq 0x2eab0") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x000000000002ec0f) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x000000000002EC0F) + self.assertIs(i.address.base, "main") self.assertEqual(i.address.offset, 63) self.assertIsNot(i.target, None) self.assertIs(i.target.abs, None) # FIXME - 
self.assertEqual(i.target.base, '__sigsetjmp@plt') + self.assertEqual(i.target.base, "__sigsetjmp@plt") self.assertEqual(i.target.offset, 0) self.assertTrue(i.is_call()) @@ -105,20 +105,20 @@ def test_call(self): self.assertFalse(i.is_unconditional_jump()) def test_call_stripped(self): - line = '''\ + line = """\ 0x000055555556f9b0 <+96>: call *0x2731a(%rip) # 0x555555596cd0 -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.GDB, self.target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.GDB, self.target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'call *0x2731a(%rip)') + self.assertEqual(i.body, "call *0x2731a(%rip)") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x000055555556f9b0) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x000055555556F9B0) + self.assertIs(i.address.base, "main") self.assertEqual(i.address.offset, 96) self.assertIsNot(i.target, None) - self.assertEqual(i.target.abs, 0x555555596cd0) + self.assertEqual(i.target.abs, 0x555555596CD0) self.assertIs(i.target.base, None) self.assertIs(i.target.offset, None) @@ -127,60 +127,60 @@ def test_call_stripped(self): self.assertFalse(i.is_unconditional_jump()) def test_objdump(self): - line = '''\ + line = """\ 16bbb: 74 29 je 16be6 <_obstack_allocated_p@@Base+0x36> -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.OBJDUMP, self.target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.OBJDUMP, self.target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'je 16be6') + self.assertEqual(i.body, "je 16be6") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x16bbb) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x16BBB) + self.assertIs(i.address.base, "main") self.assertIsNot(i.target, None) self.assertIs(i.target.abs, None) - 
self.assertEqual(i.target.base, '_obstack_allocated_p@@Base') + self.assertEqual(i.target.base, "_obstack_allocated_p@@Base") self.assertIs(i.target.offset, 0x36) self.assertTrue(i.is_jump()) self.assertFalse(i.is_unconditional_jump()) def test_arm_branch(self): - line = '''\ + line = """\ 1c: 0a000001 beq 28 -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.OBJDUMP, self.arm_target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.OBJDUMP, self.arm_target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'beq 28') + self.assertEqual(i.body, "beq 28") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x1c) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x1C) + self.assertIs(i.address.base, "main") self.assertIsNot(i.target, None) self.assertIs(i.target.abs, None) - self.assertEqual(i.target.base, 'check_one_fd') + self.assertEqual(i.target.base, "check_one_fd") self.assertIs(i.target.offset, 0x28) self.assertTrue(i.is_jump()) self.assertFalse(i.is_unconditional_jump()) def test_arm_jump(self): - line = '''\ + line = """\ 1c: 0a000001 b 28 -''' - i = asm2cfg.parse_line(line, 1, 'main', asm2cfg.InputFormat.OBJDUMP, self.arm_target_info) +""" + i = asm2cfg.parse_line(line, 1, "main", asm2cfg.InputFormat.OBJDUMP, self.arm_target_info) self.assertIsNot(i, None) - self.assertEqual(i.body, 'b 28') + self.assertEqual(i.body, "b 28") self.assertEqual(i.lineno, 1) self.assertIsNot(i.address, None) - self.assertEqual(i.address.abs, 0x1c) - self.assertIs(i.address.base, 'main') + self.assertEqual(i.address.abs, 0x1C) + self.assertIs(i.address.base, "main") self.assertIsNot(i.target, None) self.assertIs(i.target.abs, None) - self.assertEqual(i.target.base, 'check_one_fd') + self.assertEqual(i.target.base, "check_one_fd") self.assertIs(i.target.offset, 0x28) self.assertTrue(i.is_jump()) @@ -193,7 +193,7 @@ class ParseLinesTestCase(unittest.TestCase): """ 
def test_linear_sequence(self): - lines = '''\ + lines = """\ Dump of assembler code for function main: 0x000055555556f952 <+2>: mov $0x1,%ecx 0x000055555556f957 <+7>: push %r14 @@ -201,8 +201,10 @@ def test_linear_sequence(self): 0x000055555556f95b <+11>: push %r12 0x000055555556f95d <+13>: push %rbp 0x000055555556f95e <+14>: push %rbx\ -'''.split('\n') - _, blocks = asm2cfg.parse_lines(lines, False, 'x86') +""".split( + "\n" + ) + _, blocks = asm2cfg.parse_lines(lines, False, "x86") self.assertEqual(len(blocks), 1) _, block = blocks.popitem() @@ -211,13 +213,15 @@ def test_linear_sequence(self): self.assertIs(block.no_jump_edge, None) def test_unconditional(self): - lines = '''\ + lines = """\ Dump of assembler code for function main: 0x000055555556fffb <+1707>: jmp 0x555555570058 0x0000555555570058 <+1800>: mov 0xe0(%rsp),%rdi 0x0000555555570060 <+1808>: test %rdi,%rdi -'''.split('\n') - _, blocks = asm2cfg.parse_lines(lines, False, 'x86') +""".split( + "\n" + ) + _, blocks = asm2cfg.parse_lines(lines, False, "x86") self.assertEqual(len(blocks), 2) @@ -232,7 +236,7 @@ def test_unconditional(self): self.assertEqual(len(dst_block.instructions), 2) def test_conditional(self): - lines = '''\ + lines = """\ Dump of assembler code for function main: 0x000055555556fffb <+1707>: je 0x555555570058 0x000055555556fffd <+1709>: push %rbx @@ -240,8 +244,10 @@ def test_conditional(self): 0x000055555556fffe <+1710>: mov %r15,%r8 0x0000555555570058 <+1800>: mov 0xe0(%rsp),%rdi 0x0000555555570060 <+1808>: test %rdi,%rdi -'''.split('\n') - _, blocks = asm2cfg.parse_lines(lines, False, 'x86') +""".split( + "\n" + ) + _, blocks = asm2cfg.parse_lines(lines, False, "x86") self.assertEqual(len(blocks), 3) @@ -261,13 +267,15 @@ def test_conditional(self): self.assertEqual(len(dst_block.instructions), 2) def test_return(self): - lines = '''\ + lines = """\ Dump of assembler code for function main: 0x000055555556fffb <+1707>: retq 0x0000555555570058 <+1800>: mov 0xe0(%rsp),%rdi 
0x0000555555570060 <+1808>: test %rdi,%rdi -'''.split('\n') - _, blocks = asm2cfg.parse_lines(lines, False, 'x86') +""".split( + "\n" + ) + _, blocks = asm2cfg.parse_lines(lines, False, "x86") self.assertEqual(len(blocks), 2) @@ -285,7 +293,7 @@ def test_return(self): @unittest.expectedFailure def test_jumptables(self): - lines = '''\ + lines = """\ Dump of assembler code for function bar: 0x0000000000001070 <+0>: endbr64 0x0000000000001074 <+4>: cmp $0x9,%edi @@ -331,19 +339,23 @@ def test_jumptables(self): 0x0000000000001119 <+169>: retq 0x000000000000111a <+170>: nopw 0x0(%rax,%rax,1) 0x0000000000001120 <+176>: retq -'''.split('\n') - _, blocks = asm2cfg.parse_lines(lines, False, 'x86') +""".split( + "\n" + ) + _, blocks = asm2cfg.parse_lines(lines, False, "x86") # TODO: special block for indirect jumps self.assertEqual(len(blocks), 4) def test_dummy_block(self): - lines = '''\ + lines = """\ Dump of assembler code for function main: 0x000055555556fffb <+1709>: push %rbx 0x000055555556fffd <+1707>: je 0x000055555556fffb -'''.split('\n') - _, blocks = asm2cfg.parse_lines(lines, False, 'x86') +""".split( + "\n" + ) + _, blocks = asm2cfg.parse_lines(lines, False, "x86") self.assertEqual(len(blocks), 2) @@ -356,7 +368,7 @@ def test_dummy_block(self): self.assertIs(fall_block.jump_edge, None) self.assertIs(fall_block.no_jump_edge, None) self.assertEqual(len(fall_block.instructions), 1) - self.assertEqual(fall_block.instructions[0].text, 'end of function') + self.assertEqual(fall_block.instructions[0].text, "end of function") # TODO: # - functions (with and w/o calls) diff --git a/test/test_regex.py b/test/test_regex.py index 801d2a8..c896601 100644 --- a/test/test_regex.py +++ b/test/test_regex.py @@ -13,25 +13,25 @@ class FunctionHeaderTestCase(unittest.TestCase): """ def test_gdb_unstripped(self): - line = 'Dump of assembler code for function test_function:' + line = "Dump of assembler code for function test_function:" fmt, fun = asm2cfg.parse_function_header(line) 
self.assertEqual(fmt, asm2cfg.InputFormat.GDB) - self.assertEqual(fun, 'test_function') + self.assertEqual(fun, "test_function") def test_gdb_stripped(self): - line = 'Dump of assembler code from 0x555555555faf to 0x555555557008:' + line = "Dump of assembler code from 0x555555555faf to 0x555555557008:" fmt, fun = asm2cfg.parse_function_header(line) self.assertEqual(fmt, asm2cfg.InputFormat.GDB) - self.assertEqual(fun, '0x555555555faf-0x555555557008') + self.assertEqual(fun, "0x555555555faf-0x555555557008") def test_objdump(self): - line = '000000000000100b :' + line = "000000000000100b :" fmt, fun = asm2cfg.parse_function_header(line) self.assertEqual(fmt, asm2cfg.InputFormat.OBJDUMP) - self.assertEqual(fun, 'bar') + self.assertEqual(fun, "bar") class ParseAddressTestCase(unittest.TestCase): @@ -40,24 +40,24 @@ class ParseAddressTestCase(unittest.TestCase): """ def test_absolute(self): - line = '0x000055555557259c: XYZ' + line = "0x000055555557259c: XYZ" address, rest = asm2cfg.parse_address(line) self.assertIsNot(address, None) - self.assertEqual(address.abs, 0x55555557259c) + self.assertEqual(address.abs, 0x55555557259C) self.assertIs(address.base, None) self.assertIs(address.offset, None) - self.assertEqual(rest, ' XYZ') + self.assertEqual(rest, " XYZ") def test_relative(self): - line = '0x000055555557259c <+11340>: XYZ' + line = "0x000055555557259c <+11340>: XYZ" address, rest = asm2cfg.parse_address(line) self.assertIsNot(address, None) - self.assertEqual(address.abs, 0x55555557259c) + self.assertEqual(address.abs, 0x55555557259C) self.assertIs(address.base, None) self.assertEqual(address.offset, 11340) - self.assertEqual(rest, ' XYZ') + self.assertEqual(rest, " XYZ") class ParseBodyTestCase(unittest.TestCase): @@ -69,54 +69,54 @@ def setUp(self): self.target_info = asm2cfg.X86TargetInfo() def test_gdb_stripped_known(self): - line = ' call 0x55555558add0 <_Z19exportDebugifyStats>' + line = " call 0x55555558add0 <_Z19exportDebugifyStats>" body, opcode, ops, 
rest = asm2cfg.parse_body(line, self.target_info) self.assertIsNot(body, None) - self.assertEqual(body, 'call 0x55555558add0') - self.assertEqual(opcode, 'call') - self.assertEqual(ops, ['0x55555558add0']) - self.assertEqual(rest, '<_Z19exportDebugifyStats>') + self.assertEqual(body, "call 0x55555558add0") + self.assertEqual(opcode, "call") + self.assertEqual(ops, ["0x55555558add0"]) + self.assertEqual(rest, "<_Z19exportDebugifyStats>") def test_gdb_stripped_pic(self): - line = ' call *0x26a16(%rip) # 0x5555555967a8' + line = " call *0x26a16(%rip) # 0x5555555967a8" body, opcode, ops, rest = asm2cfg.parse_body(line, self.target_info) self.assertIsNot(body, None) - self.assertEqual(body, 'call *0x26a16(%rip)') - self.assertEqual(opcode, 'call') - self.assertEqual(ops, ['*0x26a16(%rip)']) - self.assertEqual(rest, '# 0x5555555967a8') + self.assertEqual(body, "call *0x26a16(%rip)") + self.assertEqual(opcode, "call") + self.assertEqual(ops, ["*0x26a16(%rip)"]) + self.assertEqual(rest, "# 0x5555555967a8") def test_gdb_plt(self): - line = ' callq 0x1020 ' + line = " callq 0x1020 " body, opcode, ops, rest = asm2cfg.parse_body(line, self.target_info) self.assertIsNot(body, None) - self.assertEqual(body, 'callq 0x1020') - self.assertEqual(opcode, 'callq') - self.assertEqual(ops, ['0x1020']) - self.assertEqual(rest, '') + self.assertEqual(body, "callq 0x1020") + self.assertEqual(opcode, "callq") + self.assertEqual(ops, ["0x1020"]) + self.assertEqual(rest, "") def test_gdb_stripped_nonpic(self): - line = ' call 0x555555555542' + line = " call 0x555555555542" body, opcode, ops, rest = asm2cfg.parse_body(line, self.target_info) self.assertIsNot(body, None) - self.assertEqual(body, 'call 0x555555555542') - self.assertEqual(opcode, 'call') - self.assertEqual(ops, ['0x555555555542']) - self.assertEqual(rest, '') + self.assertEqual(body, "call 0x555555555542") + self.assertEqual(opcode, "call") + self.assertEqual(ops, ["0x555555555542"]) + self.assertEqual(rest, "") def 
test_gdb_indirect_call(self): - line = ' callq *(%rsi)' + line = " callq *(%rsi)" body, opcode, ops, rest = asm2cfg.parse_body(line, self.target_info) self.assertIsNot(body, None) - self.assertEqual(body, 'callq *(%rsi)') - self.assertEqual(opcode, 'callq') - self.assertEqual(ops, ['*(%rsi)']) - self.assertEqual(rest, '') + self.assertEqual(body, "callq *(%rsi)") + self.assertEqual(opcode, "callq") + self.assertEqual(ops, ["*(%rsi)"]) + self.assertEqual(rest, "") class ParseTargetTestCase(unittest.TestCase): @@ -125,44 +125,44 @@ class ParseTargetTestCase(unittest.TestCase): """ def test_with_offset(self): - line = '<_Z19exportDebugifyStats+123>' + line = "<_Z19exportDebugifyStats+123>" address, rest = asm2cfg.parse_target(line) self.assertIsNot(address, None) self.assertIs(address.abs, None) - self.assertEqual(address.base, '_Z19exportDebugifyStats') + self.assertEqual(address.base, "_Z19exportDebugifyStats") self.assertEqual(address.offset, 123) - self.assertEqual(rest, '') + self.assertEqual(rest, "") def test_with_neg_offset(self): - line = '<_Z19exportDebugifyStats-123>' + line = "<_Z19exportDebugifyStats-123>" address, rest = asm2cfg.parse_target(line) self.assertIsNot(address, None) self.assertIs(address.abs, None) - self.assertEqual(address.base, '_Z19exportDebugifyStats') + self.assertEqual(address.base, "_Z19exportDebugifyStats") self.assertEqual(address.offset, -123) - self.assertEqual(rest, '') + self.assertEqual(rest, "") def test_without_offset(self): - line = '<_Z19exportDebugifyStats>' + line = "<_Z19exportDebugifyStats>" address, rest = asm2cfg.parse_target(line) self.assertIsNot(address, None) self.assertIs(address.abs, None) - self.assertEqual(address.base, '_Z19exportDebugifyStats') + self.assertEqual(address.base, "_Z19exportDebugifyStats") self.assertEqual(address.offset, 0) - self.assertEqual(rest, '') + self.assertEqual(rest, "") def test_with_dot(self): - line = '' + line = "" address, rest = asm2cfg.parse_target(line) 
self.assertIsNot(address, None) self.assertIs(address.abs, None) - self.assertEqual(address.base, 'stdin@GLIBC_2.2.5') + self.assertEqual(address.base, "stdin@GLIBC_2.2.5") self.assertEqual(address.offset, 0) - self.assertEqual(rest, '') + self.assertEqual(rest, "") class ParseCommentTestCase(unittest.TestCase): @@ -174,31 +174,31 @@ def setUp(self): self.target_info = asm2cfg.X86TargetInfo() def test_absolute(self): - line = '# 0x5555555967a8' + line = "# 0x5555555967a8" address, rest = asm2cfg.parse_comment(line, self.target_info) self.assertIsNot(address, None) - self.assertEqual(address.abs, 0x5555555967a8) + self.assertEqual(address.abs, 0x5555555967A8) self.assertIs(address.base, None) self.assertIs(address.offset, None) - self.assertEqual(rest, '') + self.assertEqual(rest, "") def test_symbolic(self): - line = '# 0x5555555967a8 ' + line = "# 0x5555555967a8 " address, rest = asm2cfg.parse_comment(line, self.target_info) self.assertIsNot(address, None) - self.assertEqual(address.abs, 0x5555555967a8) - self.assertEqual(address.base, 'foo') + self.assertEqual(address.abs, 0x5555555967A8) + self.assertEqual(address.base, "foo") self.assertIs(address.offset, 0) - self.assertEqual(rest, '') + self.assertEqual(rest, "") def test_complete(self): - line = '# 3ff8 ' + line = "# 3ff8 " address, rest = asm2cfg.parse_comment(line, self.target_info) self.assertIsNot(address, None) - self.assertEqual(address.abs, 0x3ff8) # FIXME: support hex offsets - self.assertEqual(address.base, 'foo') - self.assertEqual(address.offset, 0x2ff8) - self.assertEqual(rest, '') + self.assertEqual(address.abs, 0x3FF8) # FIXME: support hex offsets + self.assertEqual(address.base, "foo") + self.assertEqual(address.offset, 0x2FF8) + self.assertEqual(rest, "")