diff --git a/chb/app/CHVersion.py b/chb/app/CHVersion.py
index 62bec65c..ad754b54 100644
--- a/chb/app/CHVersion.py
+++ b/chb/app/CHVersion.py
@@ -1 +1 @@
-chbversion: str = "0.3.0-20251022"
+chbversion: str = "0.3.0-20260122"
diff --git a/chb/app/InstrXData.py b/chb/app/InstrXData.py
index 3017943d..b7ba00cd 100644
--- a/chb/app/InstrXData.py
+++ b/chb/app/InstrXData.py
@@ -432,7 +432,9 @@ def has_call_target(self) -> bool:
key = self.tags[0]
if key.startswith("a:"):
keyletters = key[2:]
- return len(self.args) == len(keyletters) + 1
+ return (
+ len(self.args) == len(keyletters) + 1
+ and self.args[-1] > 0)
else:
return False
elif len(self.tags) >= 2 and self.tags[1] == "call":
@@ -470,9 +472,9 @@ def has_indirect_call_target_exprs(self) -> bool:
return (len(self.tags) == 2 and self.tags[1] == "u" and len(self.args) > 1)
def call_target(self, ixd: "InterfaceDictionary") -> "CallTarget":
- if self.has_call_target() and self.is_bx_call:
+ if self.has_call_target() and self.is_bx_call and self.args[-5] > 0:
return ixd.call_target(self.args[-5])
- elif self.has_call_target():
+ elif self.has_call_target() and self.args[-1] > 0:
return ixd.call_target(self.args[-1])
else:
raise UF.CHBError(
diff --git a/chb/arm/opcodes/ARMPreloadData.py b/chb/arm/opcodes/ARMPreloadData.py
index 740564db..df7ba0ff 100644
--- a/chb/arm/opcodes/ARMPreloadData.py
+++ b/chb/arm/opcodes/ARMPreloadData.py
@@ -4,7 +4,7 @@
# ------------------------------------------------------------------------------
# The MIT License (MIT)
#
-# Copyright (c) 2021 Aarno Labs LLC
+# Copyright (c) 2021-2025 Aarno Labs LLC
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -30,9 +30,11 @@
from chb.app.InstrXData import InstrXData
from chb.arm.ARMDictionaryRecord import armregistry
-from chb.arm.ARMOpcode import ARMOpcode, simplify_result
+from chb.arm.ARMOpcode import ARMOpcode, ARMOpcodeXData, simplify_result
from chb.arm.ARMOperand import ARMOperand
+from chb.invariants.XXpr import XXpr
+
import chb.util.fileutil as UF
from chb.util.IndexedTable import IndexedTableValue
@@ -41,6 +43,29 @@
import chb.arm.ARMDictionary
+class ARMPreloadDataXData(ARMOpcodeXData):
+    """Data format:
+    - expressions:
+      0: xbase (value of base register)
+      1: xmem (value of memory location)
+    """
+
+    def __init__(self, xdata: InstrXData) -> None:
+        ARMOpcodeXData.__init__(self, xdata)
+
+    @property
+    def xbase(self) -> "XXpr":
+        return self.xpr(0, "xbase")
+
+    @property
+    def xmem(self) -> "XXpr":
+        return self.xpr(1, "xmem")
+
+    @property
+    def annotation(self) -> str:
+        return "Preload-data(" + str(self.xmem) + ")"
+
+
@armregistry.register_tag("PLDW", ARMOpcode)
@armregistry.register_tag("PLD", ARMOpcode)
class ARMPreloadData(ARMOpcode):
@@ -72,6 +97,5 @@ def annotation(self, xdata: InstrXData) -> str:
xprs[0]: value of base register
xprs[1]: value of memory location
"""
-
- rhs = str(xdata.xprs[1])
- return "Preload-data(" + rhs + ")"
+ xd = ARMPreloadDataXData(xdata)
+ return xd.annotation
diff --git a/chb/astinterface/ASTIProvenance.py b/chb/astinterface/ASTIProvenance.py
index 511049f7..a092cd5c 100644
--- a/chb/astinterface/ASTIProvenance.py
+++ b/chb/astinterface/ASTIProvenance.py
@@ -446,10 +446,17 @@ def resolve_reaching_defs(self) -> None:
# Allow for change of name of return value
if str(instr.lhs) == v or v == "R0" or v == "S0":
self.add_reaching_definition(xid, instrid)
+ elif instr.lhs is None:
+ chklogger.logger.info(
+ "Lhs variable %s is suppressed in call to "
+ "%s for reaching def address %s",
+ v, str(instr.tgt), addr)
+ self.add_reaching_definition(xid, instrid)
else:
chklogger.logger.warning(
- "Variable names don't match: %s vs %s",
- str(instr.lhs), v)
+ "Lhs variable names don't match: %s vs %s"
+ + " to %s for reaching def address %s",
+ str(instr.lhs), v, str(instr.tgt), addr)
else:
chklogger.logger.warning(
"Expression is defined by unknown instruction: "
diff --git a/chb/astinterface/ASTInterface.py b/chb/astinterface/ASTInterface.py
index 4b504572..4c3cdb93 100644
--- a/chb/astinterface/ASTInterface.py
+++ b/chb/astinterface/ASTInterface.py
@@ -1016,12 +1016,7 @@ def introduce_stack_variables(
stackvartypes: Dict[int, "BCTyp"]) -> None:
"""Creates stack variables/buffers for all stack offsets with types."""
- # local variable stack offsets from the type inference are positive,
- # so they must be negated here. For the same reason, to capture the
- # largest extent of every varinfo, offsets must be traversed in reverse
- # order.
- for (offset, bctype) in sorted(stackvartypes.items(), reverse=True):
- offset = -offset
+ for (offset, bctype) in sorted(stackvartypes.items()):
vtype = bctype.convert(self.typconverter)
self.mk_stack_variable_lval(offset, vtype=vtype)
@@ -1115,6 +1110,7 @@ def mk_stack_variable_lval(
if varinfo.vtype is None:
return lval
+ # create stack variables for all fields and array elements
if varinfo.vtype.is_compound:
structtyp = cast(AST.ASTTypComp, varinfo.vtype)
ckey = structtyp.compkey
@@ -1159,6 +1155,13 @@ def mk_stack_variable_lval(
self._stack_variables[elementoffset + cfoff] = fieldlval
elementoffset += elsize
+ else:
+ elementoffset = offset
+ for i in range(arraysize):
+ indexoffset = self.mk_scalar_index_offset(i)
+ elemlval = self.astree.mk_vinfo_lval(varinfo, offset=indexoffset)
+ self._stack_variables[elementoffset] = elemlval
+ elementoffset += elsize
return lval
diff --git a/chb/cmdline/chkx b/chb/cmdline/chkx
index 4ea0fab4..b0cf38be 100755
--- a/chb/cmdline/chkx
+++ b/chb/cmdline/chkx
@@ -881,6 +881,14 @@ def parse() -> argparse.Namespace:
resultsclassifyfunctions.add_argument(
"classification_file",
help="name of json classification file")
+ resultsclassifyfunctions.add_argument(
+ "--output", "-o",
+ required=True,
+ help="name of file to save results")
+ resultsclassifyfunctions.add_argument(
+ "--showapicalls",
+ action="store_true",
+ help="list classified functions individually in output file")
resultsclassifyfunctions.set_defaults(func=UCC.results_classifyfunctions)
# --- results functions ---
@@ -1202,6 +1210,13 @@ def parse() -> argparse.Namespace:
+ " source for callgraph path"))
report_calls.set_defaults(func=REP.report_calls_cmd)
+ # -- report arguments
+ report_arguments = reportparsers.add_parser("string_arguments")
+ report_arguments.add_argument("xname", help="name of executable")
+ report_arguments.add_argument(
+ "--output", "-o", required=True, help="name of json output file")
+ report_arguments.set_defaults(func=REP.report_string_arguments)
+
# -- report function api's
report_functionapis = reportparsers.add_parser("function_apis")
report_functionapis.add_argument("xname", help="name of executable")
diff --git a/chb/cmdline/commandutil.py b/chb/cmdline/commandutil.py
index 9d9dec60..d8dbc09c 100644
--- a/chb/cmdline/commandutil.py
+++ b/chb/cmdline/commandutil.py
@@ -930,6 +930,8 @@ def results_classifyfunctions(args: argparse.Namespace) -> NoReturn:
xname: str = str(args.xname)
classificationfile: str = str(args.classification_file)
+ showapicalls: bool = args.showapicalls
+ outputfilename: str = args.output
with open(classificationfile, "r") as fp:
classifier = json.load(fp)
@@ -953,44 +955,76 @@ def results_classifyfunctions(args: argparse.Namespace) -> NoReturn:
fns = app.appfunction_addrs
classification: Dict[str, Dict[str, int]] = {} # faddr -> libcat -> count
+ classificationapi: Dict[str, Dict[str, Dict[str, int]]] = {}
for faddr in fns:
- classification.setdefault(faddr, {})
+ if showapicalls:
+ classificationapi.setdefault(faddr, {})
+ else:
+ classification.setdefault(faddr, {})
f = app.function(faddr)
fcalls = f.call_instructions()
for baddr in fcalls:
for instr in fcalls[baddr]:
tgtname = instr.call_target.name
if tgtname in revclassifier:
- category = revclassifier[tgtname]
- classification[faddr].setdefault(category, 0)
- classification[faddr][category] += 1
+ if showapicalls:
+ category = revclassifier[tgtname]
+ classificationapi[faddr].setdefault(category, {})
+ classificationapi[faddr][category].setdefault(tgtname, 0)
+ classificationapi[faddr][category][tgtname] += 1
+ else:
+ category = revclassifier[tgtname]
+ classification[faddr].setdefault(category, 0)
+ classification[faddr][category] += 1
catfprevalence: Dict[str, int] = {}
catcprevalence: Dict[str, int] = {}
catstats: Dict[int, int] = {}
singlecat: Dict[str, int] = {}
doublecat: Dict[Tuple[str, str], int] = {}
- for faddr in classification:
- for cat in classification[faddr]:
- catfprevalence.setdefault(cat, 0)
- catcprevalence.setdefault(cat, 0)
- catfprevalence[cat] += 1
- catcprevalence[cat] += classification[faddr][cat]
-
- numcats = len(classification[faddr])
- catstats.setdefault(numcats, 0)
- catstats[numcats] += 1
- if numcats == 1:
- cat = list(classification[faddr].keys())[0]
- singlecat.setdefault(cat, 0)
- singlecat[cat] += 1
-
- if numcats == 2:
- cats = sorted(list(classification[faddr].keys()))
- cattuple = (cats[0], cats[1])
- doublecat.setdefault(cattuple, 0)
- doublecat[cattuple] += 1
+
+    if showapicalls:  # faddr -> category -> target -> count
+        for faddr in classificationapi:
+            for cat in classificationapi[faddr]:
+                catfprevalence.setdefault(cat, 0)
+                catcprevalence.setdefault(cat, 0)
+                catfprevalence[cat] += 1
+                catcprevalence[cat] += sum(classificationapi[faddr][cat].values())
+            numcats = len(classificationapi[faddr])
+            catstats.setdefault(numcats, 0)
+            catstats[numcats] += 1
+            if numcats == 1:
+                cat = list(classificationapi[faddr].keys())[0]
+                singlecat.setdefault(cat, 0)
+                singlecat[cat] += 1  # accumulate across functions
+
+            if numcats == 2:
+                cats = sorted(list(classificationapi[faddr].keys()))
+                cattuple = (cats[0], cats[1])
+                doublecat.setdefault(cattuple, 0)
+                doublecat[cattuple] += 1
+    else:  # faddr -> category -> count
+
+        for faddr in classification:
+            for cat in classification[faddr]:
+                catfprevalence.setdefault(cat, 0)
+                catcprevalence.setdefault(cat, 0)
+                catfprevalence[cat] += 1
+                catcprevalence[cat] += classification[faddr][cat]
+            numcats = len(classification[faddr])
+            catstats.setdefault(numcats, 0)
+            catstats[numcats] += 1
+            if numcats == 1:
+                cat = list(classification[faddr].keys())[0]
+                singlecat.setdefault(cat, 0)
+                singlecat[cat] += 1  # accumulate across functions
+
+            if numcats == 2:
+                cats = sorted(list(classification[faddr].keys()))
+                cattuple = (cats[0], cats[1])
+                doublecat.setdefault(cattuple, 0)
+                doublecat[cattuple] += 1
for (m, c) in sorted(catstats.items()):
print(str(m).rjust(5) + ": " + str(c).rjust(5))
@@ -1006,10 +1040,16 @@ def results_classifyfunctions(args: argparse.Namespace) -> NoReturn:
classificationresults: Dict[str, Any] = {}
classificationresults["catfprevalence"] = catfprevalence
classificationresults["catcprevalence"] = catcprevalence
- classificationresults["functions"] = classification
+ if showapicalls:
+ classificationresults["functions"] = classificationapi
+ else:
+ classificationresults["functions"] = classification
+
+ jresult = JU.jsonok("none", classificationresults)
+ jresult["meta"]["app"] = JU.jsonappdata(xinfo, includepath=False)
- with open("classification_results.json", "w") as fp:
- json.dump(classificationresults, fp, indent=2)
+ with open(outputfilename, "w") as fp:
+ json.dump(jresult, fp, indent=2)
exit(0)
diff --git a/chb/cmdline/jsonresultutil.py b/chb/cmdline/jsonresultutil.py
index 0b84d8f5..02e0ca49 100644
--- a/chb/cmdline/jsonresultutil.py
+++ b/chb/cmdline/jsonresultutil.py
@@ -77,9 +77,10 @@ def jsonok(schemaname: str, content: Dict[str, Any]) -> Dict[str, Any]:
return jresult
-def jsonappdata(xinfo: "XInfo") -> Dict[str, str]:
+def jsonappdata(xinfo: "XInfo", includepath=True) -> Dict[str, str]:
result: Dict[str, str] = {}
- result["path"] = xinfo.path
+ if includepath:
+ result["path"] = xinfo.path
result["file"] = xinfo.file
result["md5"] = xinfo.md5
result["arch"] = xinfo.architecture
diff --git a/chb/cmdline/reportcmds.py b/chb/cmdline/reportcmds.py
index 73fd9dde..df442494 100644
--- a/chb/cmdline/reportcmds.py
+++ b/chb/cmdline/reportcmds.py
@@ -72,6 +72,7 @@
from chb.app.AppAccess import AppAccess
from chb.app.BasicBlock import BasicBlock
from chb.app.Instruction import Instruction
+ from chb.invariants.XConstant import XIntConst
from chb.mips.MIPSInstruction import MIPSInstruction
from chb.models.BTerm import BTerm, BTermArithmetic
from chb.models.FunctionSummary import FunctionSummary
@@ -602,6 +603,59 @@ def report_calls_cmd(args: argparse.Namespace) -> NoReturn:
exit(1)
+def report_string_arguments(args: argparse.Namespace) -> NoReturn:
+    """Save the string-reference arguments passed at call sites to a json file.
+
+    Reads args.xname (executable) and args.output (json output filename).
+    For each function, every call argument that is a string reference is
+    recorded with its instruction address, callee name, 1-based argument
+    index, and string value. Exits with 1 if locating or checking the
+    analysis results raises CHBError, and with 0 after the file is written.
+    """
+    xname: str = args.xname
+    outputfilename: str = args.output
+
+    try:
+        (path, xfile) = UC.get_path_filename(xname)
+        UF.check_analysis_results(path, xfile)
+    except UF.CHBError as e:
+        print(str(e.wrap()))
+        exit(1)
+
+    xinfo = XI.XInfo()
+    xinfo.load(path, xfile)
+    app = UC.get_app(path, xfile, xinfo)
+    fns = app.functions
+
+    # faddr -> {"call-string-args": [argument records]}
+    argvals: Dict[str, Dict[str, Any]] = {}
+
+    for (faddr, fn) in fns.items():
+        for callinstrs in fn.call_instructions().values():
+            for instr in callinstrs:
+                callee = instr.call_target.name
+                for (argindex, arg) in enumerate(instr.call_arguments, start=1):
+                    if not arg.is_string_reference:
+                        continue
+                    strconst = cast(
+                        "XIntConst", cast("XprConstant", arg).constant)
+                    fnentry = argvals.setdefault(faddr, {})
+                    fnentry.setdefault("call-string-args", []).append({
+                        "iaddr": instr.iaddr,
+                        "callee": callee,
+                        "index": argindex,
+                        "value": strconst.string_reference()})
+
+    result: Dict[str, Any] = {"functions": argvals}
+    jresult = JU.jsonok("none", result)
+    jresult["meta"]["app"] = JU.jsonappdata(xinfo, includepath=False)
+
+    with open(outputfilename, "w") as fp:
+        json.dump(jresult, fp, indent=2)
+
+    exit(0)
+
+
def report_function_apis(args: argparse.Namespace) -> NoReturn:
# arguments
diff --git a/chb/invariants/FnVarDictionary.py b/chb/invariants/FnVarDictionary.py
index 5a044d5a..58842ffb 100644
--- a/chb/invariants/FnVarDictionary.py
+++ b/chb/invariants/FnVarDictionary.py
@@ -43,6 +43,7 @@
import chb.util.fileutil as UF
import chb.util.IndexedTable as IT
+from chb.util.loggingutil import chklogger
if TYPE_CHECKING:
from chb.api.InterfaceDictionary import InterfaceDictionary
@@ -201,4 +202,4 @@ def initialize(self, xnode: ET.Element) -> None:
t.reset()
t.read_xml(xtable, "n")
else:
- raise UF.CHBError("Var dictionary table " + t.name + " not found")
+ chklogger.logger.error("Var dictionary table %s not found", t.name)
diff --git a/chb/invariants/XXpr.py b/chb/invariants/XXpr.py
index 511ef2e5..b1836e3b 100644
--- a/chb/invariants/XXpr.py
+++ b/chb/invariants/XXpr.py
@@ -806,7 +806,7 @@ def stack_address_offset(self) -> int:
elif self.is_stack_address and self.is_addressof_var:
xvar = self.get_addressof_var
if xvar is not None:
- return xvar.denotation.offset.offsetvalue()
+ return xvar.denotation.offset.offsetconstant
raise UF.CHBError(
"Expression is not a stack address: " + str(self))
diff --git a/chb/invariants/XXprUtil.py b/chb/invariants/XXprUtil.py
index 093f0af8..855efea8 100644
--- a/chb/invariants/XXprUtil.py
+++ b/chb/invariants/XXprUtil.py
@@ -449,6 +449,8 @@ def memory_variable_to_lval_expression(
offset = cast("VMemoryOffsetFieldOffset", offset)
astoffset: AST.ASTOffset = field_offset_to_ast_offset(
offset, xdata, iaddr, astree, anonymous=anonymous)
+ elif offset.is_no_offset:
+ astoffset = nooffset
elif offset.is_array_index_offset:
offset = cast("VMemoryOffsetArrayIndexOffset", offset)
astoffset = array_offset_to_ast_offset(
@@ -460,6 +462,11 @@ def memory_variable_to_lval_expression(
return astree.mk_memref_expr(
astbase, offset=astoffset, anonymous=anonymous)
+ elif offset.is_no_offset:
+ astlval = xvariable_to_ast_def_lval_expression(
+ base.basevar, xdata, iaddr, astree, anonymous=anonymous)
+ return astree.mk_memref_expr(astlval, anonymous=anonymous)
+
elif (
offset.is_field_offset
or offset.is_array_index_offset
@@ -1615,6 +1622,12 @@ def stack_variable_to_ast_lval(
fldoffset, xdata, iaddr, astree, anonymous=anonymous)
return astree.mk_vinfo_lval(vinfo, offset=astoffset, anonymous=anonymous)
+ if offset.offset.is_array_index_offset:
+ idxoffset = cast("VMemoryOffsetArrayIndexOffset", offset.offset)
+ astoffset = array_offset_to_ast_offset(
+ idxoffset, xdata, iaddr, astree, anonymous=anonymous)
+ return astree.mk_vinfo_lval(vinfo, offset=astoffset, anonymous=anonymous)
+
if not anonymous:
chklogger.logger.warning(
"Stack variable with offset %s not yet supported at address %s",
@@ -1946,6 +1959,8 @@ def basevar_variable_to_ast_lval(
offset = cast("VMemoryOffsetArrayIndexOffset", offset)
astoffset = array_offset_to_ast_offset(
offset, xdata, iaddr, astree, anonymous=anonymous)
+ elif offset.is_no_offset:
+ astoffset = nooffset
elif offset.is_constant_value_offset:
astoffset = astree.mk_scalar_index_offset(offset.offsetvalue())
else:
diff --git a/doc/user-guide/userdata.md b/doc/user-guide/userdata.md
new file mode 100644
index 00000000..ccd322ac
--- /dev/null
+++ b/doc/user-guide/userdata.md
@@ -0,0 +1,77 @@
+# Userdata
+
+User data can improve analysis and decompilation. Userdata can be provided in two
+ways: json files and C header files. This section describes the json files; C
+header files are described here.
+
+
+## Add userdata
+
+Userdata files are passed to the analyzer via the command-line with the
+command-line option --hints. Multiple userdata files can be
+passed with this option. If data in multiple files conflict, the data from
+the last file passed is taken; previous versions of the same data are
+overwritten.
+
+Some commands that provide the --hints option include
+```
+> chkx analyze ... --hints ...
+> chkx results ast ... --hints ...
+> chkx relational prepare ... --hints ...
+...
+```
+
+## Userdata file layout
+
+Userdata format is json. The general layout of the json file is
+```
+{
+ "userdata": {
+ "<section-1>": { ... },
+ "<section-2>": { ... },
+ "<section-3>": { ... },
+ ....
+ "<section-n>": { ... }
+ }
+}
+```
+where section-i is the name of a particular kind of userdata that is
+supported. Each kind of userdata has its own format and meaning, as explained
+below. It is recommended to add some additional top-level properties to the file,
+such as a hash (e.g., md5 or sha256) to identify the binary to which the userdata
+applies, or the name and release date of the binary. These additional properties,
+however, are not enforced or used otherwise.
+
+**Caution** The section names must be exact. Sections with misspelled names are
+silently ignored. To check if a section was read correctly, inspect the file
+.ch/u/_system_u.xml after initiating the analysis, to verify
+the corresponding xml section that is passed to the back-end ocaml analyzer.
+
+
+## Kinds of userdata
+
+The kinds of userdata that can be passed to the analysis are varied and tend to
+grow/change over time. Below is a list of the kinds of userdata currently
+supported.
+
+- **ARM-Thumb switch points** ([arm-thumb](userdata/arm-thumb.md)):
+ A list of addresses where an ARM binary
+ switches from ARM representation to Thumb-2 and v.v.
+
+- **Call-back Tables** ([call-back-tables](userdata/call-back-tables.md)):
+ A table of addresses
+ mapped to the declared name of a call-back table in memory.
+
+- **Call Targets for Indirect Calls** ([call-targets](userdata/call-targets.md)):
+ A list of targets for indirect function calls.
+
+- **Data Regions within Code** ([data-blocks](userdata/data-blocks.md)):
+ A list of start and end addresses
+ of regions within the code section that contain data.
+
+- **Function Annotations** ([function-annotations](userdata/function-annotations.md)):
+ Annotations with the aim to improve the quality of a decompilation to C, including
+ names/types for register and stack variables.
+
+- **Function Entry Points** ([function-entry-points](userdata/function-entry-points.md)):
+ A list of addresses that are the start of a function.
\ No newline at end of file
diff --git a/doc/user-guide/userdata/arm-thumb.md b/doc/user-guide/userdata/arm-thumb.md
new file mode 100644
index 00000000..e1516122
--- /dev/null
+++ b/doc/user-guide/userdata/arm-thumb.md
@@ -0,0 +1,35 @@
+### Arm-Thumb switch points
+
+**Description**
+
+ARM binaries may mix the ARM and Thumb-2 representation for code. The analyzer
+supports both representations. In many binaries these switch points are indicated
+in the binary itself by the compiler (this is always the case for binaries
+compiled with debug, and often in other binaries as well). However, if the
+switch points are not explicitly present in the binary, the current version of
+the disassembler cannot automatically
+determine them. For these binaries the user has the option to manually indicate
+the switch points in the userdata.
+
+**Format**
+
+A list of addresses followed by a colon and the letter 'T' or 'A'
+that indicate starting addresses of Thumb-2 and ARM code representation regions.
+
+
+**Example**
+
+```
+{
+ "userdata": {
+ ....
+ "arm-thumb": [
+ "0x18638:A",
+ "0x18908:T",
+ "0x18950:A",
+ "0x18974:T",
+ "0x21210:A"
+ ]
+ }
+}
+```
diff --git a/doc/user-guide/userdata/call-back-tables.md b/doc/user-guide/userdata/call-back-tables.md
new file mode 100644
index 00000000..ca4ce0c5
--- /dev/null
+++ b/doc/user-guide/userdata/call-back-tables.md
@@ -0,0 +1,71 @@
+### Call-back Tables
+
+**Description**
+
+Call-back tables are arrays of structs in global memory that contain related
+function pointers, usually associated with some other identifying data.
+Common examples of call-back tables are in binaries that serve requests based
+on a particular keyword. In such systems the response to the request is often
+invoked by matching the key to the identifying key in the table and executing
+the associated function pointer.
+
+The userdata representation for such call-back tables consists of three elements:
+1. The definition of the table in C (in the C header file)
+2. The start address of the table in memory (in userdata)
+3. The addresses of the indirect calls into the table (in userdata)
+
+This section only shows the format for (2). The addresses of the indirect
+calls are specified in a separate section, described in
+[call-targets](call-targets.md).
+
+
+**Format**
+
+A table of virtual addresses in memory mapped to names of defined tables.
+
+
+**Example**
+
+```
+{
+ "userdata": {
+ ....
+ "call-back-tables": {
+ "0x4a5910": "request_table",
+ "0x4a5c30": "cgi_setobject_table"
+ }
+ }
+}
+```
+
+This section must be accompanied by a definition of the corresponding table
+in a header file that is passed to the analyzer at the same time. The
+corresponding header definition in this case could be something like:
+
+```
+struct _cbt_http_request {
+ char *formname;
+ char *filetype;
+ char *cachecontrol;
+ int (*cbp_request_12)(void *state, void *stream, int len);
+ int (*cbp_request_16)(char *filename, void *stream);
+ int (*cbp_request_20)(char *level);
+} cbt_http_request;
+
+
+struct _cbt_http_request *request_table;
+
+
+struct _cbt_cgi_setobject {
+ char *tag;
+ int num;
+ int (*cbp_cgi_setobject)(struct keyvaluepair_t *kvp, int len);
+} cbt_cgi_setobject;
+
+
+struct _cbt_cgi_setobject *cgi_setobject_table;
+```
+
+
+
+
\ No newline at end of file
diff --git a/doc/user-guide/userdata/call-targets.md b/doc/user-guide/userdata/call-targets.md
new file mode 100644
index 00000000..60aa7e53
--- /dev/null
+++ b/doc/user-guide/userdata/call-targets.md
@@ -0,0 +1,65 @@
+### Call targets
+
+**Description**
+
+In many cases the analyzer is able to resolve indirect function calls. For
+those cases where automatic resolution of targets fails the user can supply
+a list of targets explicitly in the userdata.
+
+A call target may be specified in a number of ways depending on the kind of
+target:
+- *application function:* app:\<function-address\>
+- *shared-object function:* so:\<function-name\>
+- *java native interface:* jni:\<jni-index\>
+- *call-back table function:* cba:\<table-address\>:\<offset\>
+
+**Format**
+
+A list of records of the following structure:
+```
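+ {"fa": <function-address>,
+ "ia": <instruction-address>,
+ "tgts": [
+ | {"app": <function-address>}
+ | {"so": <function-name>}
+ | {"jni": <jni-index>}
+ | {"cba": <table-address>:<offset>}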
+ ]
+ }
+```
+
+**Example**
+
+```
+{
+ "userdata": {
+ ...
+ "call-targets": [
+ {"ia": "0x40d5dc",
+ "fa": "0x40d510",
+ "tgts": [{"cba": "0x4a5c30:8"}]
+ },
+ {"ia": "0x40a6a4",
+ "fa": "0x409dd0",
+ "tgts": [{"cba": "0x4a5910:12"}]
+ },
+ {"ia": "0x40aba8",
+ "fa": "0x409dd0",
+ "tgts": [{"cba": "0x4a5910:16"}]
+ },
+ {"ia": "0x40afd8",
+ "fa": "0x409dd0",
+ "tgts": [{"cba": "0x4a5910:20"}]
+ },
+ {"ia": "0x40b304",
+ "fa": "0x40b288",
+ "tgts": [{"app": "0x401018"}, {"app": "0x403200"}]
+ },
+ {"ia": "0x40c800",
+ "fa": "0x40c780",
+ "tgts": [{"so": "memcpy"}]
+ }
+ ]
+ }
+}
+```
\ No newline at end of file
diff --git a/doc/user-guide/userdata/data-blocks.md b/doc/user-guide/userdata/data-blocks.md
new file mode 100644
index 00000000..31d2f9fc
--- /dev/null
+++ b/doc/user-guide/userdata/data-blocks.md
@@ -0,0 +1,34 @@
+### Data blocks
+
+**Description**
+
+Code sections may interleave code with data regions. This is particularly common
+in ARM binaries. Most of these data regions are detected automatically by the
+disassembler. For the cases where this fails the user can point out these data
+regions in the userdata with the data-blocks section.
+
+**Format**
+
+A list of records that specify the start (inclusive) and end (exclusive) address
+of a data region, where the record has the format:
+```
+ {"r": [<start-address>, <end-address>]}
+```
+
+
+**Example**
+
+```
+{
+ "userdata": {
+ ....
+ "data-blocks": [
+ {"r": ["0xa02425fc", "0xa0242674"]},
+ {"r": ["0xa0255e68", "0xa0255e94"]},
+ {"r": ["0xa03005d4", "0xa03005f8"]},
+ {"r": ["0xa0300a9e", "0xa0300ab0"]},
+ ...
+ ]
+ }
+}
+```
\ No newline at end of file
diff --git a/doc/user-guide/userdata/function-annotations.md b/doc/user-guide/userdata/function-annotations.md
new file mode 100644
index 00000000..e6e60d4e
--- /dev/null
+++ b/doc/user-guide/userdata/function-annotations.md
@@ -0,0 +1,190 @@
+### Function Annotations
+
+Function annotations can be used to improve the quality of a decompilation of
+a function to C code. A function annotation ranges from names and types for
+register and stack
+variables to corrections to reaching definitions and typing inference rules.
+
+**Format**
+
+The top-level format of function annotations is a list of individual function
+annotations:
+```
+{
+ "userdata": {
+ ...
+ {
+ "function-annotations": [
+ {
+ "faddr": <function-address>,
+ "register-variable-introductions": [
+ ...
+ ],
+ "stack-variable-introductions": [
+ ...
+ ],
+ "typing-rules": [
+ ...
+ ],
+ "remove-reaching-definitions": [
+ ...
+ ]
+ },
+ ...
+ }
+ }
+}
+```
+where all properties are optional except for the function address.
+
+**Format: register-variable-introductions**:
+
+The format for **register-variable introductions** is a list of individual
+register annotations
+```
+ [
+ {
+ "iaddr": <instruction-address>,
+ "name": <name>,
+ "typename": <type-name>,
+ "mods": [<modifications>]
+ },
+ {
+ ...
+
+ ]
+```
+The instruction address is the address of the instruction where the
+register to be renamed gets assigned, that is, the register is the
+left-hand side in an instruction (assignment or call). If a register
+gets assigned in multiple paths in parallel, the instruction address
+should be the lowest address. These introductions can be considered
+as ssa (static single assignment) locations.
+
+The chosen name is the name to be given to the register. The name will
+be used in the lifting as long as the register has the current definition.
+It is the user's responsibility to ensure that there are no name clashes
+with other variables.
+
+The type name is the name of the type of the register for that particular
+assignment (a register can have many types during its lifetime within a
+function). The type name is either a primitive C type (like int or
+unsigned short, etc.) or the name of a type for which a typedef is given
+in the header file. The reason for restricting the type name to simple
+names is that full-featured C parsing needs to be applied when reading
+in these files. For convenience, some modifications can be added to the
+mods property to modify the typename:
+- ptrto: indicating that the register type is a pointer to
+ the type indicated by the type name
+- cast: indicating that the type given should override the
+ type that may have been inferred by type inference. Adding cast
+ furthermore ensures that the assigning instruction will be exposed in
+ the lifting.
+
+*Note:* The name of the register itself does not have to be included in
+the record, as it is automatically inferred from the instruction address.
+At present the annotation is limited to instructions with a single LHS
+register. That is, instructions that assign to multiple registers such
+as the ARM instructions LDM or ARM call instructions that
+assign to both R0 and R1 are currently not
+handled.
+
+*Note:* The typename is optional. The analyzer performs its own type inference
+based on function signatures and other type information. Unless types are
+introduced that are not present in any function signatures or other type
+information it is often better to omit the typename initially and only add
+a typename if a typename is not inferred automatically.
+
+**Example: register-variable-introductions:**
+
+```
+ "register-variable-introductions": [
+ {
+ "iaddr": "0xe2b34",
+ "name": "t",
+ "typename": "EVP_PKEY_ASN1_METHOD",
+ "mods": ["ptrto", "cast"]
+ },
+ {
+ "iaddr": "0xe2b40",
+ "name": "flags",
+ "typename": "unsigned long"
+ },
+ {
+ "iaddr": "0xe2b88",
+ "name": "obj"
+ },
+ ...
+```
+
+**Format: stack-variable-introductions:**
+
+The format for **stack-variable-introductions** is a list of individual
+(local) stack variable annotations:
+```
+ [
+ {
+ "offset": <offset>,
+ "name": <name>,
+ "typename": <type-name>,
+ "mods": [<modifications>]
+ },
+ {
+ ...
+ ]
+```
+The offset is the offset *in bytes* where the stack variable is located, defined
+as
+```
+ <stack-pointer value at function entry> - <address of stack variable>
+```
+Note that this number must be positive as the stack grows down, and thus any
+local stack variable is located at an address that is less in value than the
+address of the stack-pointer at function entry.
+
+The name, typename, and mods are the same as for register-variable introductions
+with the exception that stack variables can have an additional type of modification
+expressed in the mods property:
+- array:\<n\>: indicating that the stack variable type is an array
+ of n elements of the type given.
+
+It is the user's responsibility to ensure that stack variables do not overlap and
+that names do not clash with each other or with register variables.
+
+
+**Example: stack-variable-introductions:**
+
+```
+ "stack-variable-introductions": [
+ {
+ "offset": 32,
+ "name": "md",
+ "typename": "unsigned char",
+ "mods": ["array:16"]
+ },
+ {
+ "offset": 56,
+ "name": "md_ctx",
+ "typename": "EVP_MD_CTX"
+ }
+ ]
+```
+
+**Format: remove-reaching-definitions:**
+
+The format for **remove-reaching-definitions** is a list of register variables
+associated with the reaching definitions to be removed:
+```
+ [
+ {
+ "var": <register-name>,
+ "uselocs": [ hex-addresses ],
+ "rdeflocs": [ hex-addresses ]
+ },
+ {
+ ...
+ ]
+```
+The var property holds the name of the register for which the
+addresses given in the rdeflocs property are to be removed
+from the instructions with addresses given in the uselocs property.
diff --git a/doc/user-guide/userdata/function-entry-points.md b/doc/user-guide/userdata/function-entry-points.md
new file mode 100644
index 00000000..a24b450f
--- /dev/null
+++ b/doc/user-guide/userdata/function-entry-points.md
@@ -0,0 +1,106 @@
+### Function Entry Points
+
+**Description**
+
+For most binaries the disassembler is able to determine all function entry points
+automatically. In some cases, however, some function entry points may be missed,
+and may be manually pointed out in the userdata.
+
+**Format**
+
+A list of addresses that are the starting address of a function.
+
+**Example**
+```
+{
+ "userdata": {
+ ...
+ "function-entry-points": [
+ "0xa0100044",
+ "0xa010011c",
+ "0xa0100292",
+ "0xa010029c",
+ "0xa0100710",
+ "0xa010072a",
+ ...
+ ]
+ }
+}
+```
+
+**Finding Function Entry Points**
+
+Low function coverage may be an indicator of function entry points missed.
+Function coverage is defined as the ratio of the number of instructions that
+are part of some function and the total number of instructions in the code
+sections (minus confirmed embedded data regions). Function coverage is
+displayed in the printed output when running the disassembler (without
+analysis):
+
+```
+> chkx analyze -d
+...
+Disassembly : 0.16
+Construct functions: 0.86
+Disassembly information:
+ Instructions : 32699
+ Unknown instructions : 0
+ Functions : 429 (coverage: 96.68%)
+ Function overlap : 993 (counting multiples: 993)
+ Jumptables : 16
+ Data blocks : 20
+...
+```
+
+To aid the identification of function entry points, the disassembler prints
+out a (text) file that contains a listing of all instructions not contained
+in functions. E.g.,
+```
+> chkx analyze -d
+...
+> more .cch/a/_orphan.log
+...
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Data block (size: 12 bytes)
+
+ 0x9870 Code:<0x295d4>
+ 0x9874 FAddr:<0x9914>
+ 0x9878 Code:<0x9300>
+================================================================================
+
+ 0x987c 08 40 2d e9 PUSH {R3,LR}
+ 0x9880 2c 30 9f e5 LDR R3, 0x98b4
+ 0x9884 00 30 d3 e5 LDRB R3, [R3]
+ 0x9888 00 00 53 e3 CMP R3, #0
+ 0x988c 08 80 bd 18 POPNE {R3,PC}
+ B 0x9890 20 30 9f e5 LDR R3, 0x98b8
+ 0x9894 00 00 53 e3 CMP R3, #0
+ 0x9898 01 00 00 0a BEQ 0x98a4
+ B 0x989c 18 00 9f e5 LDR R0, 0x98bc
+ 0x98a0 23 ff ff eb BL 0x9534
+ B 0x98a4 08 30 9f e5 LDR R3, 0x98b4
+ 0x98a8 01 20 a0 e3 MOV R2, #1
+ 0x98ac 00 20 c3 e5 STRB R2, [R3]
+ 0x98b0 08 80 bd e8 POP {R3,PC}
+ B 0x98b4 38 64 03 00 ANDEQ R6, R3, R8,LSR R4
+ 0x98b8 00 00 00 00 ANDEQ R0, R0, R0
+ 0x98bc cc dd 02 00 ANDEQ SP, R2, R12,ASR#27
+ 0x98c0 08 40 2d e9 PUSH {R3,LR}
+ 0x98c4 34 30 9f e5 LDR R3, 0x9900
+ 0x98c8 00 00 53 e3 CMP R3, #0
+ 0x98cc 02 00 00 0a BEQ 0x98dc
+ B 0x98d0 2c 00 9f e5 LDR R0, 0x9904
+ 0x98d4 2c 10 9f e5 LDR R1, 0x9908
+ 0x98d8 cc ff ff eb BL 0x9810
+ B 0x98dc 28 00 9f e5 LDR R0, 0x990c
+ 0x98e0 00 30 90 e5 LDR R3, [R0]
+ 0x98e4 00 00 53 e3 CMP R3, #0
+ 0x98e8 08 80 bd 08 POPEQ {R3,PC}
+ B 0x98ec 1c 30 9f e5 LDR R3, 0x9910
+ 0x98f0 00 00 53 e3 CMP R3, #0
+ 0x98f4 08 80 bd 08 POPEQ {R3,PC}
+ B 0x98f8 33 ff 2f e1 BLX R3
+ 0x98fc 08 80 bd e8 POP {R3,PC}
+...
+```
+Missing function entry points are easy to spot at 0x987c and 0x98c0.
\ No newline at end of file