Source code for bincfg.normalization.norm_utils

"""
An assortment of helper/utility functions for tokenization/normalization.
"""

import bincfg
import re
import traceback
import json
from functools import reduce


# Marker character placed at both ends of every special token, so that special tokens are recognizable as such and
#   cannot be confused with ordinary tokens
SPECIAL_TOKEN_DESIGNATOR = '#'

# Special token inserted at the start of each instruction for opcode-level tokenization
INSTRUCTION_START_TOKEN = f'{SPECIAL_TOKEN_DESIGNATOR}start_instr{SPECIAL_TOKEN_DESIGNATOR}'

# Special token that identifies the start of a split immediate token
SPLIT_IMMEDIATE_TOKEN = f'{SPECIAL_TOKEN_DESIGNATOR}split_imm{SPECIAL_TOKEN_DESIGNATOR}'

# Constant string values for normalized tokens
STRING_LITERAL_STR = f'{SPECIAL_TOKEN_DESIGNATOR}str{SPECIAL_TOKEN_DESIGNATOR}'
IMMEDIATE_VALUE_STR = f'{SPECIAL_TOKEN_DESIGNATOR}immval{SPECIAL_TOKEN_DESIGNATOR}'
FUNCTION_CALL_STR = f'{SPECIAL_TOKEN_DESIGNATOR}func{SPECIAL_TOKEN_DESIGNATOR}'
RECURSIVE_FUNCTION_CALL_STR = f'{SPECIAL_TOKEN_DESIGNATOR}self{SPECIAL_TOKEN_DESIGNATOR}'
INTERNAL_FUNCTION_CALL_STR = f'{SPECIAL_TOKEN_DESIGNATOR}innerfunc{SPECIAL_TOKEN_DESIGNATOR}'
EXTERNAL_FUNCTION_CALL_STR = f'{SPECIAL_TOKEN_DESIGNATOR}externfunc{SPECIAL_TOKEN_DESIGNATOR}'
MULTI_FUNCTION_CALL_STR = f'{SPECIAL_TOKEN_DESIGNATOR}multifunc{SPECIAL_TOKEN_DESIGNATOR}'
JUMP_DESTINATION_STR = f'{SPECIAL_TOKEN_DESIGNATOR}jmpdst{SPECIAL_TOKEN_DESIGNATOR}'
MEMORY_EXPRESSION_STR = f'{SPECIAL_TOKEN_DESIGNATOR}memexpr{SPECIAL_TOKEN_DESIGNATOR}'
GENERAL_REGISTER_STR = f'{SPECIAL_TOKEN_DESIGNATOR}reg{SPECIAL_TOKEN_DESIGNATOR}'
SEGMENT_STR = f'{SPECIAL_TOKEN_DESIGNATOR}seg{SPECIAL_TOKEN_DESIGNATOR}'
SEGMENT_ADDRESS_STR = f'{SPECIAL_TOKEN_DESIGNATOR}segaddr{SPECIAL_TOKEN_DESIGNATOR}'
MEM_SIZE_TOKEN_STR = f'{SPECIAL_TOKEN_DESIGNATOR}memptr{SPECIAL_TOKEN_DESIGNATOR}'
DISPLACEMENT_IMMEDIATE_STR = f'{SPECIAL_TOKEN_DESIGNATOR}dispmem{SPECIAL_TOKEN_DESIGNATOR}'
LARGE_BLOCK_STR = f'{SPECIAL_TOKEN_DESIGNATOR}large_block{SPECIAL_TOKEN_DESIGNATOR}'


# Regular-expression building blocks

# Immediate values: hexadecimal, octal, binary and decimal. NOTE: the alternatives must be tried in exactly this order
#   so that the decimal pattern does not swallow the leading '0' of a '0x'/'0o'/'0b' prefixed value.
RE_IMM_HEX = r'-?0x[0-9a-f]+'
RE_IMM_OCT = r'-?0o[0-7]+'
RE_IMM_BIN = r'-?0b[01]+'
RE_IMM_INT = r'-?[0-9]+'
RE_IMMEDIATE = '(?:' + '|'.join((RE_IMM_HEX, RE_IMM_OCT, RE_IMM_BIN, RE_IMM_INT)) + ')'

# String literals delimited by either single or double quotes. A quote inside the string can be escaped with \' or \",
#   and the escape character itself can be escaped with \\
# The special '#str#' token is also accepted as a string literal
RE_STRING_LITERAL = (
    r'(?:"[^"\\]*(?:\\.[^"\\]*)*"'
    r'|\'[^\'\\]*(?:\\.[^\'\\]*)*\')'
    '|' + STRING_LITERAL_STR
)

# Individual symbol characters
RE_PLUS_SIGN = r'\+'
RE_TIMES_SIGN = r'\*'
RE_OPEN_BRACKET = r'\['
RE_CLOSE_BRACKET = r'\]'
RE_COLON = r':'

# Spacing and newline characters
RE_SPACING = r'[, \t.]+'
RE_NEWLINE = r'[|\n]'

# Extra information emitted by disassemblers
# Greedily matching from the first '<' to the last '>' on a line would only be correct if there were never more than
#   one chunk of disassembler info per line, which is not guaranteed. Instead the pattern below is used, which tries
#   string literals first (so a '>' inside a string does not end the info) and supports a bounded amount of nesting.
# Rules for well-formed disassembler info:
#   1. All info is enclosed in '<' ... '>'
#   2. Nested '<...>' pairs are allowed inside the info as long as every '>' closes a previously opened '<', up to a
#       nesting depth of _DIS_INFO_MAX_REC_DEPTH
#   3. Any number of unclosed '<' characters may appear
#   4. '<' and '>' inside well-formed string literals have no effect; they are just part of the string
# Valid examples:
#   * <>
#   * <info>
#   * <lots of <<<<<<< things>
#   * <string="this is <> weird >>>>> string <<<<<">
#   * <nested <brackets> within data <are <ok> up to a certain <depth>>
# Invalid examples (these will break things):
#   * <
#   * <data>>
#   * <super<deep<nested<...<thing>>...>>
# NOTE: the depth limit exists because 1. python's re module has no recursive patterns and 2. even if it did they would
#   likely be slower, and that much depth is probably never needed
# NOTE: the deepest nesting seen so far is 1, from something like: "invokespecial 0x0001<java/lang/object::<init>>"
DISINFO_START = '<'
DISINFO_END = '>'
_DIS_INFO_MAX_REC_DEPTH = 2  # 0 means no nesting, +1 per nesting level. EG: 2 matches up to "<0 <1 <2> 1> 0>"

# Built inside-out: start from the non-nesting pattern and wrap it once per allowed nesting level, then fill in the
#   string-literal pattern and the bracket characters
RE_DISASSEMBLER_INFO = reduce(
    lambda inner, wrapper: wrapper % inner,
    [r'{ds}(?:{strlit}|%s|[^{de}])*{de}'] * _DIS_INFO_MAX_REC_DEPTH,
    r'{ds}(?:{strlit}|[^{de}])*{de}',
).format(strlit=RE_STRING_LITERAL, ds=DISINFO_START, de=DISINFO_END)


# Unique sentinel used as the default `on_err` of imm_to_int(), so that any value (including None) can be passed by the
#   caller as an "on error" return value
_RAISE_ERR = object()

[docs]
def imm_to_int(token, on_err=_RAISE_ERR):
    """Return the integer value of an immediate token

    Non-string tokens are returned unchanged. String tokens are parsed with ``int(token, 0)``, so the base is taken
        from the token's prefix (hexadecimal '0x', octal '0o', binary '0b', or plain decimal).

    Args:
        token (Union[str, int]): the immediate token to convert to integer
        on_err (Optional[Any]): if passed, this value is returned when the token cannot be parsed. If not passed, the
            parsing error is raised as normal

    Returns:
        int: integer value of given token (or `on_err` if it was passed and parsing failed)
    """
    # Anything that is not a string is handed back untouched
    if not isinstance(token, str):
        return token

    try:
        return int(token, 0)
    except Exception:
        # Only swallow the error when the caller explicitly supplied a fallback value
        if on_err is not _RAISE_ERR:
            return on_err
        raise




[docs]
def scan_for_token(token_list, type=None, token=None, stop_on_type=None, stop_on_token=None, ignore_type=None, ignore_token=None, 
                   stop_unmatched=False, match_re=False, ignore_re_case=True, start=0, increment=1, wrap=True, max_matches=1, 
                   ret_list=False, ret='index', on_no_match=None):
    """Scans the given token list looking for a specific token(s) or token type(s)

    Returns `on_no_match` (None by default) if no match is found.

    Each visited token is classified in the order:

        1. 'ignore' tokens
        2. 'stop' tokens
        3. accepted tokens (from `type` or `token` parameters)
    
    So, if one passes multiple parameters that conflict with one another, the above ordering is what takes precedence.

    All of the matcher arguments (`type`, `token`, `stop_on_type`, `stop_on_token`, `ignore_type`, `ignore_token`) accept
    None, a single string, a single compiled ``re.Pattern``, or an iterable of strings/compiled patterns. Compiled
    patterns are always matched with fullmatch(); plain strings are compared with ``==`` unless `match_re` is True.
    NOTE: with regular expressions you can match "not X" by using a negative lookahead: r'(?![X]).*', where '[X]' is
    the thing to not match
    
    Args:
        token_list (List[Tuple[str, str, ...]]): the list of tokens. Each element should be a tuple of (token_type, token, ...).
            The first element is the type of the token, second is the string token, and anything else is ignored. This means
            this function can work with either the 2-tuple token lists from Tokenizer() objects as well as the 3-tuple
            token lists from Normalizer() objects. Must support len() and integer indexing.
        type (Optional[Union[str, Iterable[str]]]): the type or types of tokens to accept. If `token` is also given,
            then an accepted token must match both.
        token (Optional[Union[str, Iterable[str]]]): the token string or strings to accept. If `type` is also given,
            then an accepted token must match both.
        stop_on_type (Optional[Union[str, Iterable[str]]]): if a token with one of these types is found, then we
            immediately stop searching and return whatever we currently have.
        stop_on_token (Optional[Union[str, Iterable[str]]]): if one of these token strings is found, then we immediately
            stop searching and return whatever we currently have.
        ignore_type (Optional[Union[str, Iterable[str]]]): token types to skip over. These tokens are never returned.
            Since these are checked before 'stop' tokens, this overrides `stop_on_type`/`stop_on_token` for any token
            that matches both.
        ignore_token (Optional[Union[str, Iterable[str]]]): token strings to skip over. These tokens are never returned.
            Since these are checked before 'stop' tokens, this overrides `stop_on_type`/`stop_on_token` for any token
            that matches both.
        stop_unmatched (bool): if True, will stop on the first unmatched token. IE: a token that was not ignored, was not
            already stopped on, and was not considered a token to keep
        match_re (bool): if True, every string in every matcher argument is compiled into a regular expression and
            matched with fullmatch()
        ignore_re_case (bool): if True, will pass re.IGNORECASE as a flag when compiling the regular expressions
        start (int): the index to start at within token_list
        increment (int): the increment to use when searching for tokens. Set to a negative number to move backwards through the list
        wrap (bool): if True, then the initial `start` index will be wrapped (modulo) to the length of the `token_list`.
            If False, then an initial `start` index that is out of bounds of the `token_list` will immediately stop.
        max_matches (Union[int, None]): the maximum number of matches to find. If 1, a single value is returned. If >1,
            up to `max_matches` matching tokens are collected and their `ret` values returned as a list. If None, then
            all matches found will be returned as a list.
            NOTE: lists of matches are always in the order the tokens appear in `token_list`, regardless of the sign
            of `increment`
        ret_list (bool): if True, a result that is not already a list is wrapped in a one-element list. This includes
            the `on_no_match` value when nothing matched.
        ret (Union[str, Iterable[str]]): what value(s) to return. Can be a single string to return a single value, or an 
            iterable of strings to return multiple values as a tuple in the order they were passed. Valid strings
            (case-insensitive):

            - 'index' (or 'ind', 'idx', 'loc'): return the index in `token_list` of the matched token
            - 'type' (or 'token_type'): return the token type of the matched token
            - 'token' (or 'token_string'): return the string token that was matched
            - 'all': return all of the above as (index, type, token). If in a passed list, the other values in the
              list are not used for the result.
        
        on_no_match (Optional[Any]): value to return if there were no matches found (including when `token_list` is
            empty). Defaults to None
    
    Raises:
        ValueError: if neither `type` nor `token` is given, if `increment` is 0, if `max_matches` is < 1, or if `ret`
            contains an unknown value
    
    Returns:
        Union[None, int, str, Tuple, List]: `on_no_match` if no match is found, or one of the return types designated by
            the `ret` argument, or a tuple of multiple return values if user passed multiple values in `ret`, or a list
            of one of the previous if collecting matches for multiple tokens.
    """
    re_flags = re.IGNORECASE if ignore_re_case else 0

    def _as_matchers(val):
        """Normalizes a matcher argument into a list of strings and/or compiled patterns"""
        if val is None:
            return []
        # A lone compiled pattern is a single matcher, just like a lone string (it is not iterable)
        vals = [val] if isinstance(val, (str, re.Pattern)) else list(val)
        if not match_re:
            return vals
        return [v if isinstance(v, re.Pattern) else re.compile(v, flags=re_flags) for v in vals]

    def _matches_any(value, matchers):
        """True if `value` fullmatches any compiled pattern or equals any plain string in `matchers`"""
        return any((m.fullmatch(value) is not None) if isinstance(m, re.Pattern) else (value == m) for m in matchers)

    def _finalize(value):
        """Applies `ret_list`: wraps anything that is not already a list when a list was requested"""
        return value if isinstance(value, list) or not ret_list else [value]

    accept_types, accept_tokens, stop_types, stop_tokens, skip_types, skip_tokens = (
        _as_matchers(v) for v in (type, token, stop_on_type, stop_on_token, ignore_type, ignore_token))

    # Make sure the user passed at least one type or token
    if len(accept_types) == 0 and len(accept_tokens) == 0:
        raise ValueError("Must pass at least one `type` or `token` to match to")
    
    # Make sure some integer values are good
    if increment == 0:
        raise ValueError("`increment` cannot be 0")
    if max_matches is not None and max_matches < 1:
        raise ValueError("`max_matches` must be >= 1")
    
    # Translate the requested return values into positions within an (index, type, token) triple
    ret_positions = {'ind': 0, 'idx': 0, 'index': 0, 'loc': 0, 'type': 1, 'token_type': 1, 'token': 2, 'token_string': 2}
    ret_names = [ret] if isinstance(ret, str) else list(ret)
    ret_names = [r.lower() for r in ret_names]
    fields = []
    for r in ret_names:
        if r == 'all':
            fields = [0, 1, 2]
            break
        if r not in ret_positions:
            raise ValueError("Unknown return type: %s" % repr(r))
        fields.append(ret_positions[r])
    
    # Nothing can match in an empty list (and `start % 0` below would fail), so report "no match" right away
    n_tokens = len(token_list)
    if n_tokens == 0:
        return _finalize(on_no_match)
    
    # Walk the token_list collecting matched indices until running off either end, hitting a stop, or finding enough
    idx = (start % n_tokens) if wrap else start
    matched_inds = []
    while 0 <= idx < n_tokens:
        # Add in the increment now so every `continue` below still advances
        curr_idx = idx
        idx += increment

        tok_type, tok_str = token_list[curr_idx][0], token_list[curr_idx][1]

        # 1. Ignored tokens are skipped entirely
        if _matches_any(tok_type, skip_types) or _matches_any(tok_str, skip_tokens):
            continue
        
        # 2. Stop tokens end the search
        if _matches_any(tok_type, stop_types) or _matches_any(tok_str, stop_tokens):
            break
        
        # 3. A token is unmatched if it fails any of the accept criteria that were actually given
        if (accept_types and not _matches_any(tok_type, accept_types)) \
            or (accept_tokens and not _matches_any(tok_str, accept_tokens)):
            if stop_unmatched:
                break
            continue

        # Otherwise, this matches! Record it, and check if we have found our max number of matches
        matched_inds.append(curr_idx)
        if max_matches is not None and len(matched_inds) >= max_matches:
            break
    
    def _make_ret(i):
        """Builds the requested return value for the token at index `i`"""
        triple = (i, token_list[i][0], token_list[i][1])
        return triple[fields[0]] if len(fields) == 1 else tuple(triple[f] for f in fields)

    if len(matched_inds) == 0:
        result = on_no_match
    elif max_matches == 1:
        result = _make_ret(matched_inds[0])
    else:
        result = [_make_ret(i) for i in matched_inds]
        # Matches found while moving backwards are flipped so they come out in list order
        if increment < 0:
            result.reverse()

    return _finalize(result)




[docs]
def get_normalizer(normalizer):
    """Returns the normalizer being used.

    String names are case-insensitive and are parsed as ``[isa][_]name[_normalizer|_norm][-|_ tokenization_level]``:

        - an optional trailing ``-`` or ``_`` followed by one of the names in the ``bincfg.TokenizationLevel`` values
          selects the tokenization level (defaults to 'auto')
        - an optional ``_normalizer`` or ``_norm`` suffix is stripped
        - an optional leading ISA name (``x86`` or ``java``), optionally followed by ``_``, selects the ISA

    The 'innereye', 'deepbindiff', 'safe' and 'deepsemantic' normalizers are x86-only and do not need the ISA prefix.
    The 'unnormalized', 'compressed_stats' and 'hpc_data' normalizers require the ISA prefix (IE: 'x86_unnormalized').

    Args:
        normalizer (Union[str, Normalizer, type]): either a ``Normalizer`` object (IE: has a callable 'normalize' function), 
            or a string name of a built-in normalizer to use, or a type of a normalizer to instantiate with no args/kwargs
            passed. Accepted strings include: 'innereye', 'deepbindiff', 'safe', 'deepsemantic', 'unnormalized', 
            'compressed_stats', 'hpc_data'

    Raises:
        ValueError: for an unknown string name of normalizer, a missing/incompatible ISA prefix, or when a normalizer
            could not be built from a passed type (the original error is attached as the cause)
        NotImplementedError: for the java 'compressed_stats' and 'hpc_data' normalizers
        TypeError: if `normalizer` was not a string, type, or ``Normalizer`` object

    Returns:
        Normalizer: a ``Normalizer`` object
    """
    
    if isinstance(normalizer, str):
        norm_str = normalizer.lower()

        # You can specify the opcode/instruction level tokenization as a trailing '-<level>' or '_<level>'
        tl_names = bincfg.TokenizationLevel.AUTO.value + bincfg.TokenizationLevel.INSTRUCTION.value + bincfg.TokenizationLevel.OPCODE.value
        matched = re.fullmatch(r'(.*)[\-_](%s)' % '|'.join(tl_names), norm_str)
        tl = 'auto'
        if matched is not None:
            norm_str, tl_name = matched.groups()
            if tl_name in bincfg.TokenizationLevel.OPCODE.value:
                tl = 'op'
            elif tl_name in bincfg.TokenizationLevel.INSTRUCTION.value:
                tl = 'inst'

        # Strip an optional '_normalizer' / '_norm' suffix
        for suffix in ('_normalizer', '_norm'):
            if norm_str.endswith(suffix):
                norm_str = norm_str[:-len(suffix)]
                break

        # Strip an optional leading ISA name. Stop at the first one found so that only one prefix is ever consumed
        known_isa = None
        for isa in ('x86', 'java'):
            if norm_str.startswith(isa):
                known_isa, norm_str = isa, norm_str[len(isa):]
                break
        
        if norm_str.startswith('_'):
            norm_str = norm_str[1:]
        
        def _check_isa(needed, allow_none=False):
            """Raises a ValueError if the parsed ISA is missing (unless allowed) or is not one of `needed`"""
            if not allow_none and known_isa is None:
                raise ValueError("Ambiguous normalizer string: %s, must pass ISA name in front (IE: 'x86_%s')" % (repr(norm_str), norm_str))
            
            if isinstance(needed, str):
                needed = (needed,)
            
            if known_isa is not None and known_isa not in needed:
                raise ValueError("Normalizer %s can only be used with %s ISA's, not %s" % (repr(norm_str), repr(needed), repr(known_isa)))

        # x86-only normalizers: the ISA prefix is optional, but must be 'x86' if given
        if norm_str in ['innereye', 'inner', 'innereyenormalizer']:
            _check_isa('x86', allow_none=True)
            return bincfg.normalization.X86InnerEyeNormalizer(tokenization_level=tl)
        elif norm_str in ['deepbindiff', 'bindiff', 'deepbin', 'deepbindiffnormalizer']:
            _check_isa('x86', allow_none=True)
            return bincfg.normalization.X86DeepBinDiffNormalizer(tokenization_level=tl)
        elif norm_str in ['safe', 'safenormalizer']:
            _check_isa('x86', allow_none=True)
            return bincfg.normalization.X86SafeNormalizer(tokenization_level=tl)
        elif norm_str in ['deepsem', 'deepsemantic', 'semantic', 'deepsemanticnormalizer']:
            _check_isa('x86', allow_none=True)
            return bincfg.normalization.X86DeepSemanticNormalizer(tokenization_level=tl)
        
        # Normalizers that exist for multiple ISA's: the ISA prefix is required
        elif norm_str in ['none', 'unnorm', 'unnormalized', 'base', 'basenormalizer']:
            _check_isa(('x86', 'java'), allow_none=False)
            if known_isa == 'x86':
                return bincfg.normalization.X86BaseNormalizer(tokenization_level=tl)
            return bincfg.normalization.JavaBaseNormalizer(tokenization_level=tl)
        elif norm_str in ['compressed', 'stats', 'comp_stats', 'compressed_stats', 'statistics', 'compressedstats', 'compressedstatsnormalizer']:
            _check_isa(('x86', 'java'), allow_none=False)
            if known_isa == 'java':
                raise NotImplementedError("Need to implement the java compressed stats normalizer")
            # _check_isa() guarantees the ISA is 'x86' at this point
            return bincfg.normalization.X86CompressedStatsNormalizer(tokenization_level=tl)
        elif norm_str in ['hpc', 'hpc_data', 'hpcdata', 'hpcdatanormalizer', 'hpcdatanorm', 'hpcnorm', 'hpcnormalizer']:
            _check_isa(('x86', 'java'), allow_none=False)
            if known_isa == 'java':
                raise NotImplementedError("Need to implement the java hpcdata normalizer")
            # _check_isa() guarantees the ISA is 'x86' at this point
            return bincfg.normalization.X86HPCDataNormalizer(tokenization_level=tl)
        else:
            raise ValueError("Unknown normalization string: '%s'" % normalizer)
    
    elif isinstance(normalizer, type):
        # Instantiate with no arguments, then validate the instance through the normal path
        try:
            return get_normalizer(normalizer())
        except Exception as e:
            raise ValueError("Could not build a default normalizer from type: %s\nError Message: %s\n Traceback:%s"
                             % (repr(normalizer.__name__), e, traceback.format_exc())) from e
    
    elif hasattr(normalizer, 'normalize') and callable(normalizer.normalize):
        return normalizer
    
    else:
        # Wrap in a 1-tuple so that tuple arguments are formatted as a single value instead of being unpacked by '%'
        raise TypeError("Unknown normalizer type: '%s'" % (normalizer,))




[docs]
def parse_disinfo_json(string):
    """Attempts to parse the contents of a disassembler info token as JSON
    
    The `DISINFO_START` and `DISINFO_END` delimiters are assumed to have been stripped from the string already.

    Args:
        string (str): the string to attempt to parse into json
    
    Returns:
        Union[None, JSONObject]: the parsed JSON object, or None if the string could not be parsed as JSON
    """
    # Best-effort parse: any failure at all simply means "this is not JSON"
    try:
        parsed = json.loads(string)
    except Exception:
        parsed = None
    return parsed