from .base_tokenizer import Tokens
from .base_normalizer import DEFAULT_IMMEDIATE_THRESHOLD
from .norm_utils import imm_to_int, scan_for_token, DISPLACEMENT_IMMEDIATE_STR, IMMEDIATE_VALUE_STR, STRING_LITERAL_STR, \
JUMP_DESTINATION_STR, MEMORY_EXPRESSION_STR, FUNCTION_CALL_STR, MULTI_FUNCTION_CALL_STR, RECURSIVE_FUNCTION_CALL_STR, \
EXTERNAL_FUNCTION_CALL_STR, INTERNAL_FUNCTION_CALL_STR
from ..cfg.cfg_edge import EdgeType
import bincfg
[docs]
def ignore(self, state):
"""Ignores information (if using for rose info, then it will also ignore negatives)"""
pass
[docs]
def identity(self, state):
"""Returns the original token"""
return state.token
[docs]
def return_token(self, state):
"""Returns the original token"""
return state.token
[docs]
def return_dispmem(self, state):
"""Replaces memory addressing displacement values with the string 'dispmem'"""
return DISPLACEMENT_IMMEDIATE_STR
[docs]
def replace_string_literal(*args, replace_previous_immediate=False):
"""Builds a function that replaces string literal values with the string 'str'
This will return a function to be called as a part of a normalizer. This function takes no arguments and only 1 keyword
argument: whether to replace the previous immediate, or keep it and add in a 'str' string
NOTE: This is meant to be a higher-order function. But, just in case the user forgets that (or is too lazy to add in
two extra characters to call this function), if you pass multiple args then it will be assumed this is being called
as if it is the _repl_func() function below and will simply return the default result
Args:
args: args for this function. Ideally empty
replace_previous_immediate (bool): if True, then any previous immediate value will be replaced with the 'str'
string, otherwise the 'str' string will just be added
Returns:
Union[Callable[..., str], str]: either a function that will handle immediate strings (if this function was
called correctly), or a handled immediate string
"""
def _ret_string_imm(self, state):
# If we are not replacing any previous immediate, just return now
if not replace_previous_immediate:
return STRING_LITERAL_STR
# Scan for a previous immediate token, and if not present, just return the string now
idx = scan_for_token(state.line, type=Tokens.IMMEDIATE, ignore_type=[Tokens.SPACING], stop_unmatched=True, start=-1, increment=-1)
if idx is None:
return STRING_LITERAL_STR
# We have a previous immediate value, replace it with our string, and return None to not insert token
state.line = state.line[:idx] + [(Tokens.STRING_LITERAL, STRING_LITERAL_STR, state.orig_token)]
return None
return _ret_string_imm if len(args) == 0 else _ret_string_imm(*args)
[docs]
def replace_jump_destination(self, state):
"""Replaces the jump destination immediate with 'jmpdst' iff the jump destination is an immediate value, not a segment address
Args:
idx (int): the index in ``line`` of the 'jump' opcode
line (List[TokenTuple]): a list of (token_type, token) tuples. the current assembly line
Returns:
int: integer index in line of last handled token
"""
state.line[state.token_idx + 1] = (state.line[state.token_idx + 1][0], JUMP_DESTINATION_STR if state.line[state.token_idx + 1][0] == Tokens.IMMEDIATE else \
state.line[state.token_idx + 1][1], state.line[state.token_idx + 1][2])
return state.token_idx + 2
[docs]
def replace_memory_expression(*args):
"""Builds a function that replaces memory expressions with the given replacement string
This will return a function to be called as a part of a normalizer. This only takes one argument: the replacement string.
If no arguments are passed, then the replacement string will default to 'memexpr'
NOTE: This is meant to be a higher-order function. But, just in case the user forgets that (or is too lazy to add in
two extra characters to call this function), if you pass multiple args then it will be assumed this is being called
as if it is the _repl_func() function below and will simply return the default result
Args:
args: args for this function. Ideally either empty to use default memory expression string, or a string to replace
all memory expressions with.
Returns:
Union[Callable[..., None], None]: either a function that will handle memory expressions (if this function was
called correctly), or a handled memory expression
"""
replacement = args[0] if len(args) == 1 else MEMORY_EXPRESSION_STR
if not isinstance(replacement, str):
raise TypeError('Replacement must be str, instead got `%s`: %s' % (type(replacement).__name__, replacement))
def repl_mem(self, state):
"""Replace memory expressions with 'memexpr'"""
# Using a brand new token so it's not confused with anything else. Keep track of the old value as well
state.line[state.memory_start] = (Tokens.MEMORY_EXPRESSION, replacement, ' '.join([l[2] for l in state.line[state.memory_start + 1:]]))
del state.line[state.memory_start + 1:] # Delete the rest of the line after memory_start index
return repl_mem if len(args) <= 1 else repl_mem(*args)
[docs]
def special_function_call(self, state, ret_only_call_type=False):
"""Handles special function calls
Special external functions have their name kept. Recursive calls are replaced with 'self', other internal function
calls are replaced with 'internfunc', other external function calls are replaced with 'externfunc'. If a block has
multiple function calls out, then it will be replaced with 'multifunc'.
NOTE: This can all only happen if cfg and block information is passed. If it is not passed, then all function
calls will be replaced with 'func'
Args:
idx (int): the index in ``line`` of the 'call' opcode
line (List[TokenTuple]): a list of (token_type, token) tuples. the current assembly line
special_functions (Set[str]): a set of string special function names.
cfg (Union[CFG, MemCFG], optional): either a ``CFG`` or ``MemCFG`` object that these lines occur in. Used for
determining function calls to self, internal functions, and external functions. If not passed, then these
will not be used. Defaults to None.
block (Union[CFGBasicBlock, int], optional): either a ``CFGBasicBlock`` or integer block_idx in a ``MemCFG``
object. Used for determining function calls to self, internal functions, and external functions. If not
passed, then these will not be used. Defaults to None.
ret_only_call_type (bool): if True, will return only the call type being used as a string. This is only for testing
purposes and should likely not be used in normalization as this function already can handle the normalizing.
This will return a string if it is not a special function call (for the appropriate function call type), or
a tuple with one element for a special function call (the name of the special function).
Returns:
int: integer index in line of last handled token
"""
idx, line, block, cfg = state.token_idx, state.line, state.block, state.cfg
# If cfg is None, then we cannot determine intern/extern/self function calls. Just return 'func' for function call
if cfg is None:
line[idx + 1] = (Tokens.IMMEDIATE, FUNCTION_CALL_STR, line[idx + 1][2])
return idx + 2
# Check that we even need to replace the value. We should only replace immediate values, so check to make sure
# the old_token starts with a digit
if not ret_only_call_type and line[idx + 1][2][0] not in "0123456789":
return idx + 2
# Find the function call info in cfg
if isinstance(cfg, bincfg.MemCFG):
# Find the outgoing edges of this block. Normally the first edge is the function call edge, however we still need
# to check because there can be times where it only contains the normal return edge as the function call
# address couldn't be resolved to a known basic block
func_call_block_inds, edge_types = cfg.get_block_edges_out(block, ret_edge_types=True)
# Make sure the first value is in fact a function call
if len(edge_types) > 0 and edge_types[0] == EdgeType.FUNCTION_CALL.value:
# Check for a self call by comparing the two block's function indices
func_call_block_idx = func_call_block_inds[0]
self_call = cfg.block_func_idx[block] == cfg.block_func_idx[func_call_block_idx]
extern_func_name = cfg.get_block_function_name(func_call_block_idx) if cfg.is_block_extern_function(func_call_block_idx) else None
# Check for a multi-function call
multi_call = len(edge_types[edge_types == EdgeType.FUNCTION_CALL.value]) > 1
# Otherwise, we have no clue where the function call goes to, treat it as just some innerfunc
else:
self_call = False
extern_func_name = None
multi_call = False
# Otherwise this is a plain CFG. Get the call address from the next immediate value, and look up its info in the CFG
elif isinstance(cfg, bincfg.CFG):
fc_out = block.get_sorted_edges(edge_types='function', direction='out')[0]
if len(fc_out) > 1:
multi_call = True
else:
multi_call = False
# Need to check that the disassembler was able to figure out the call location
if len(fc_out) == 0:
self_call = False
extern_func_name = None
else:
func = cfg.get_block(fc_out[0].to_block).parent_function
self_call = func.address == block.parent_function.address
extern_func_name = func.name if func.is_extern_function else None
else:
raise TypeError("Unknown cfg type for special_function_call normalization: %s" % repr(type(cfg)))
# Determine what function call strings to return
# A multi-function call
if multi_call:
if ret_only_call_type: return MULTI_FUNCTION_CALL_STR
line[idx + 1] = (line[idx + 1][0], MULTI_FUNCTION_CALL_STR, line[idx + 1][2])
# A recursive function call
elif self_call:
if ret_only_call_type: return RECURSIVE_FUNCTION_CALL_STR
line[idx + 1] = (line[idx + 1][0], RECURSIVE_FUNCTION_CALL_STR, line[idx + 1][2])
# An external function call
elif extern_func_name is not None:
extern_func_name = extern_func_name.split('@')[0]
if extern_func_name in self.special_functions:
if ret_only_call_type: return (extern_func_name,)
line[idx + 1] = (line[idx + 1][0], '{' + extern_func_name + '}', line[idx + 1][2])
else:
if ret_only_call_type: return EXTERNAL_FUNCTION_CALL_STR
line[idx + 1] = (line[idx + 1][0], EXTERNAL_FUNCTION_CALL_STR, line[idx + 1][2])
# An internal function call
else:
if ret_only_call_type: return INTERNAL_FUNCTION_CALL_STR
line[idx + 1] = (line[idx + 1][0], INTERNAL_FUNCTION_CALL_STR, line[idx + 1][2])
return idx + 2