Source code for bincfg.normalization.multi_normalizer

"""Class that can use multiple normalization methods"""

from .base_tokenizer import _USE_DEFAULT_NT
from ..utils import hash_obj



[docs]
class MultiNormalizer:
    """A normalizer that dispatches to one of several sub-normalizers based on architecture

    This does not inherit from BaseNormalizer, and thus you cannot modify or call most normalizer functions from this
    normalizer itself. It essentially just acts as a wrapper around multiple different normalizers.

    Parameters
    ----------
    normalizers: `BaseNormalizer`
        One or more normalizers to use together. May only use one per architecture. The architecture of each
        normalizer is read from its ``tokenizer.ARCHITECTURE`` attribute, which must not be None.

    Raises
    ------
    ValueError
        If a normalizer's ``tokenizer.ARCHITECTURE`` is None, or if two of the passed normalizers share the same
        architecture.
    """
    def __init__(self, *normalizers):
        # Maps architecture -> normalizer, in the order the normalizers were passed
        self._arch_to_norm = {}

        for n in normalizers:
            arch = n.tokenizer.ARCHITECTURE
            if arch is None:
                raise ValueError("Cannot have a None architecture for a MultiNormalizer `normalizer`: %s" % repr(str(n)))
            if arch in self._arch_to_norm:
                raise ValueError("Found multiple tokenizers for the same architecture: %s" % repr(arch))
            self._arch_to_norm[arch] = n

    def normalize(self, *strings, cfg=None, block=None, newline_tup=_USE_DEFAULT_NT, match_instruction_address=True):
        """Normalizes the given strings using the sub-normalizer registered for ``cfg.architecture``.

        All arguments are forwarded unchanged to the selected sub-normalizer (which is invoked by calling it).

        Args:
            strings (str): arbitrary number of strings to normalize
            cfg (Union[CFG, MemCFG]): either a ``CFG`` or ``MemCFG`` object that these lines occur in. Required
                here (despite the None default) since its ``architecture`` attribute selects which sub-normalizer
                to use. Also passed along to the sub-normalizer, where it is used for determining function calls
                to self, internal functions, and external functions.
            block (Union[CFGBasicBlock, int], optional): either a ``CFGBasicBlock`` or integer block_idx in a ``MemCFG``
                object. Used for determining function calls to self, internal functions, and external functions. If not
                passed, then these will not be used. Defaults to None.
            newline_tup (Tuple[str, str], optional): the tuple to insert inbetween each passed string, or None to not
                insert anything. Defaults to a sentinel that tells the selected sub-normalizer to use its own default
                newline tuple.
            match_instruction_address (bool, optional): if True, will match instruction addresses. If there is an immediate
                value at the start of a line (IE: start of a string in `strings`, or immediately after a Tokens.NEWLINE
                or Tokens.INSTRUCTION_START [ignoring any Tokens.SPACING]), then that token will be converted into a
                Tokens.INSTRUCTION_ADDRESS token. If there is a Tokens.COLON immediately after that token (again, ignoring
                any Tokens.SPACING), then that first Tokens.COLON match will be appended (along with any inbetween Tokens.SPACING)
                to that Tokens.INSTRUCTION_ADDRESS token. For example, using the x86 tokenization scheme:

                    - "0x1234: add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234:'), ...]
                    - "  0x1234     : add rax rax" -> [(Tokens.SPACING, '  '), (Tokens.INSTRUCTION_ADDRESS, '0x1234     :'), ...]
                    - "0x1234 add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234'), ...]

        Returns:
            List[str]: a list of normalized string instruction lines (whatever the selected sub-normalizer returns)

        Raises:
            ValueError: if ``cfg`` is None
            KeyError: if there is no sub-normalizer registered for ``cfg.architecture``
        """
        if cfg is None:
            raise ValueError("Must pass the cfg being used when calling the MultiNormalizer")

        arch = cfg.architecture
        try:
            normalizer = self._arch_to_norm[arch]
        except KeyError:
            # Still a KeyError (as a plain dict lookup would raise), but one that says what went wrong
            raise KeyError("No normalizer registered for architecture %s, available architectures: %s"
                           % (repr(arch), repr(list(self._arch_to_norm)))) from None

        return normalizer(*strings, cfg=cfg, block=block, newline_tup=newline_tup,
                          match_instruction_address=match_instruction_address)

    def __call__(self, *strings, cfg=None, block=None, newline_tup=_USE_DEFAULT_NT, match_instruction_address=True):
        """Normalizes the given strings. Equivalent to calling ``normalize()`` with the same arguments.

        Args:
            strings (str): arbitrary number of strings to normalize
            cfg (Union[CFG, MemCFG]): the ``CFG`` or ``MemCFG`` object that these lines occur in. Required: its
                ``architecture`` attribute selects which sub-normalizer to use.
            block (Union[CFGBasicBlock, int], optional): either a ``CFGBasicBlock`` or integer block_idx in a ``MemCFG``
                object. Defaults to None.
            newline_tup (Tuple[str, str], optional): the tuple to insert inbetween each passed string, or None to not
                insert anything. Defaults to a sentinel that tells the selected sub-normalizer to use its own default
                newline tuple.
            match_instruction_address (bool, optional): if True, will match instruction addresses (see ``normalize()``
                for the full description). Defaults to True.

        Returns:
            List[str]: a list of normalized string instruction lines (whatever the selected sub-normalizer returns)

        Raises:
            ValueError: if ``cfg`` is None
            KeyError: if there is no sub-normalizer registered for ``cfg.architecture``
        """
        return self.normalize(*strings, cfg=cfg, block=block, newline_tup=newline_tup, match_instruction_address=match_instruction_address)

    def __eq__(self, other):
        """Two MultiNormalizers are equal iff they are the same type and hold equal (architecture, normalizer)
        pairs in the same order."""
        if type(self) is not type(other):
            return False

        # Compare as lists so a length mismatch is detected (a bare zip() would silently truncate). The comparison
        # is kept order-sensitive so that equal objects are built from the same sequence that __hash__ is given.
        return list(self._arch_to_norm.items()) == list(other._arch_to_norm.items())

    def __hash__(self):
        """Hash of the architecture -> normalizer mapping, computed with ``hash_obj``"""
        return hash_obj(self._arch_to_norm, return_int=True)

    def __repr__(self):
        """Class name followed by the comma-separated ``repr()`` of each sub-normalizer"""
        return "%s(%s)" % (self.__class__.__name__, ', '.join([repr(n) for n in self._arch_to_norm.values()]))

    def __str__(self):
        """Lowercased class name and the ``str()`` of each sub-normalizer, joined with '-'"""
        return '-'.join([self.__class__.__name__.lower()] + [str(n) for n in self._arch_to_norm.values()])