Source code for bincfg.normalization.base_tokenizer

"""Class for tokenizing assembly lines, as well as other tokenization constants"""

import re
from ..utils import ParameterSaver, eq_obj, hash_obj
from .norm_utils import *
from enum import Enum
from ..utils.type_utils import *


# Token type names

[docs]
class Tokens:
    """String names of the token types used by the tokenizers in this module

    Each constant is used as the `token_type` half of the (token_type, token) tuples built by `BaseTokenizer`.
    Names that are paired with a regex in the tokenizer's token list also become the named groups of the compiled
    tokenizer regex (see `BaseTokenizer._init_tokenizer`).
    """
    # Tokens tied to the tokenization process itself. In the visible code, INSTRUCTION_ADDRESS is assigned by
    #   BaseTokenizer._check_instruction_address() rather than being listed with a regex of its own
    INSTRUCTION_ADDRESS = 'inst_addr'
    INSTRUCTION_START = 'inst_start'
    SPLIT_IMMEDIATE = 'split_imm'
    DISASSEMBLER_INFO = 'disassembler_info'
    NEWLINE = 'newline'
    SPACING = 'spacing'

    # Single-symbol tokens, all routed to BaseTokenizer.token_all_symbols() by default
    OPEN_BRACKET = 'open_bracket'
    CLOSE_BRACKET = 'close_bracket'
    PLUS_SIGN = 'plus_sign'
    TIMES_SIGN = 'times_sign'
    COLON = 'colon'

    # Core pieces of an instruction
    INSTRUCTION_PREFIX = 'prefix'
    OPCODE = 'opcode'
    REGISTER = 'register'
    IMMEDIATE = 'immediate'

    # NOTE(review): MEMORY_EXPRESSION and SEGMENT_ADDRESS have no default handler registered in
    #   BaseTokenizer.tokenize() - presumably they are only used by subclasses/normalizers; confirm
    MEMORY_SIZE = 'memory_size'
    MEMORY_EXPRESSION = 'memory_expression'
    BRANCH_PREDICTION = 'branch_prediction'
    STRING_LITERAL = 'string_literal'
    SEGMENT_ADDRESS = 'segment_address'

    # Catch-all token: paired with the regex '.' in SPECIAL_TOKENS_END
    MISMATCH = 'mismatch'




[docs]
class TokenizationLevel(Enum):
    """Different levels to perform tokenization"""
    # Each member's value is the list of string aliases accepted for it by parse_tokenization_level(), which
    #   lowercases its input and replaces '-' with '_' before looking it up in these lists
    OPCODE = ['op', 'opcode', 'operand', 'opcodes', 'operands']
    INSTRUCTION = ['inst', 'instruction', 'line', 'instructions', 'lines']
    # AUTO is a placeholder: parse_tokenization_level() swaps it for the caller-supplied `auto_tl`
    AUTO = ['auto', 'automatic', 'default']




[docs]
class Architectures(Enum):
    """Known (but not necessarily supported) architectures"""
    # Each member's value is the list of string aliases accepted for it by get_architecture(), which lowercases
    #   its input and replaces '-' with '_' before comparing against these lists
    X86 = ['x86', 'i686', 'x86_64']
    JAVA = ['java', 'java_bytecode']




[docs]
def get_architecture(arch: 'Union[str, Architectures]') -> 'Architectures':
    """Resolves `arch` to a member of the ``Architectures`` enum

    Args:
        arch (Union[str, Architectures]): either an ``Architectures`` member (returned unchanged), or a string alias
            of one. Strings are lowercased and have any '-' replaced with '_' before being compared against the
            alias list stored as each member's value.

    Returns:
        Architectures: the matching enum member

    Raises:
        ValueError: if `arch` is a string that matches no known alias
        TypeError: if `arch` is neither a string nor an ``Architectures`` member
    """
    # Already an enum member, nothing to resolve
    if isinstance(arch, Architectures):
        return arch

    if not isinstance(arch, str):
        raise TypeError("Cannot get architecture from object of type: %s" % repr(type(arch).__name__))

    # Normalize the string, then take the first member (in definition order) that lists it as an alias
    normalized = arch.lower().replace('-', '_')
    found = next((member for member in Architectures if normalized in member.value), None)
    if found is None:
        raise ValueError("Unknown architecture string: %s" % repr(normalized))
    return found




[docs]
class TokenMismatchError(Exception):
    """Exception type for token mismatches found while tokenizing

    NOTE(review): the raise sites are not in the visible part of this module - presumably raised by the handler
    for `Tokens.MISMATCH` tokens (`BaseTokenizer.token_mismatch`); confirm.
    """
    pass



[docs]
class UnknownTokenError(Exception):
    """Exception type for token types that have no known handler

    NOTE(review): the raise sites are not in the visible part of this module - presumably raised by
    `BaseTokenizer.token_unknown`, which `tokenize()` calls for token types with no registered handler; confirm.
    """
    pass



# Special tokens to insert into tokenizer by default
# Each entry is a (token_name, regex) pair. When `insert_special_tokens` is True, BaseTokenizer.__init__ places
#   the SPECIAL_TOKENS_START entries before the user/ISA tokens and the SPECIAL_TOKENS_END entries after them,
#   skipping any whose name also appears in the user's token list. The order here is significant: all pairs are
#   joined into a single regex alternation, so earlier entries are tried first at each position in the string.
SPECIAL_TOKENS_START = [
    (Tokens.STRING_LITERAL, RE_STRING_LITERAL),
    (Tokens.DISASSEMBLER_INFO, RE_DISASSEMBLER_INFO),
    (Tokens.INSTRUCTION_START, INSTRUCTION_START_TOKEN),
    (Tokens.SPLIT_IMMEDIATE, SPLIT_IMMEDIATE_TOKEN),
    (Tokens.PLUS_SIGN, RE_PLUS_SIGN),
    (Tokens.TIMES_SIGN, RE_TIMES_SIGN),
    (Tokens.OPEN_BRACKET, RE_OPEN_BRACKET),
    (Tokens.CLOSE_BRACKET, RE_CLOSE_BRACKET),
    (Tokens.COLON, RE_COLON),
    (Tokens.SPACING, RE_SPACING),
    (Tokens.NEWLINE, RE_NEWLINE),
    (Tokens.IMMEDIATE, RE_IMMEDIATE),
]
# The mismatch token matches any single character, so it must come last: it only fires when nothing else matched
SPECIAL_TOKENS_END = [
    (Tokens.MISMATCH, r'.'),
]


# An indication that the default newline tuple should be used when tokenizing/normalizing
# A unique sentinel object is used (instead of None) because None is itself a meaningful `newline_tup` value
#   in BaseTokenizer.tokenize(), where it means "do not insert any newline tuple"
_USE_DEFAULT_NT = object()



[docs]
def parse_tokenization_level(tokenization_level, auto_tl):
    """Resolves `tokenization_level` to a concrete (non-AUTO) member of the TokenizationLevel enum

    Args:
        tokenization_level (Union[bincfg.TokenizationLevel, str]): either a TokenizationLevel member, or a string
            alias of one. Strings are lowercased and have any '-' replaced with '_' before being looked up in the
            alias list stored as each member's value.
        auto_tl (bincfg.TokenizationLevel): the level to return when `tokenization_level` resolves to
            TokenizationLevel.AUTO. Must itself be a TokenizationLevel other than AUTO.

    Returns:
        bincfg.TokenizationLevel: the resolved level, with AUTO replaced by `auto_tl`

    Raises:
        TypeError: if `auto_tl` is not a TokenizationLevel or is TokenizationLevel.AUTO, or if `tokenization_level`
            is neither a string nor a TokenizationLevel
        ValueError: if `tokenization_level` is a string that matches no known alias
    """
    # The fallback is validated first, even if it ends up not being needed
    auto_tl_ok = isinstance(auto_tl, TokenizationLevel) and auto_tl != TokenizationLevel.AUTO
    if not auto_tl_ok:
        raise TypeError("`auto_tl` must be a bincfg.TokenizationLevel, and cannot be bincfg.TokenizationLevel.AUTO. Got: %s" % repr(auto_tl))

    if isinstance(tokenization_level, TokenizationLevel):
        level = tokenization_level
    elif isinstance(tokenization_level, str):
        # First member (in definition order) that lists the normalized string as an alias
        alias = tokenization_level.lower().replace('-', '_')
        level = next((member for member in TokenizationLevel if alias in member.value), None)
        if level is None:
            raise ValueError("Unknown tokenization_level string: '%s'" % tokenization_level)
    else:
        raise TypeError("Unknown tokenization_level type: '%s'" % type(tokenization_level))

    # AUTO is only a placeholder for the caller-supplied default
    return auto_tl if level is TokenizationLevel.AUTO else level




[docs]
class BaseTokenizer(metaclass=ParameterSaver):
    """A default class to tokenize instructions

    Should be subclassed once for each instruction set, providing the tokens being used.

    Many functions may be overridden to change tokenization behavior. These functions all start with the name `token\_...`
    and take as input a single state dictionary and return either a string for the next token to append to the current
    line being tokenized, or None to not add anything to the line. The state dictionary contains the following:

        - 'tokenizer' (BaseTokenizer): this tokenizer 
        - 'kwargs' (Dict[str, Any]): dictionary of extra kwargs passed to the initial call to the `tokenize` function
        - 'all_strings' (List[str]): list of input strings (args) passed to the initial call to the `tokenize` function
        - 'token_handlers' (Dict[str, Callable[]]): dictionary mapping token types to the function that handles that token
        - 'sentence' (List[Tuple[str, str]]): list of processed token tuples to return, each a tuple of (token\_name, token)
        - 'newline_tup' (Union[None, Tuple[str, str]]): token tuple to add at the end of each line to indicate a new line
        - 'match_instruction_address' (bool): whether or not we are matching instruction addresses
        - 'split_imm' (bool): whether or not we are currently handling an immediate token that was split 
        - 'line' (List[Tuple[str, str]]): the current line of tokens we are working on
        - 'string' (str): the current string being tokenized
        - 'token_type' (str): the type of the 'token', should be from `bincfg.normalization.base_tokenizer.Tokens`
        - 'token' (str): the currently matched token string
        - 'match' (re.Match): the re match object that matched this token
    
    Some extra functions are available for overriding including:

        - handle_line(): called at the end of each line being tokenized (an individual string passed to the tokenizer)
        - handle_sentence(): called at the end of each sentence being tokenized (aggregation of all lines passed to the tokenizer)
    
    Each instruction set architecture (ISA) should have its own ``Tokenizer`` class that inherits from ``BaseTokenizer``. The 
    tokenization process uses python's ``re`` module to perform tokenization, converting strings into streams of 
    (token\_name, token\_string) tuples. For more information on how to use regex to create tokenizers, see: 
    https://docs.python.org/3/library/re.html#writing-a-tokenizer
    
    TOKENIZATION PROCESS
    
        1. Clean the incoming instruction strings using the passed `clean_instruction_func`
        2. Iterate through the strings finding all tokens
        
           a. Each token is sent to its corresponding token handler function
           b. At the end of each 'line' (EG: end of a passed `string`, reaching Tokens.NEWLINE token, etc.), that line
              is handled with the `handle_line()` function
           c. All tokens are added to the same return 'sentence', even if multiple strings in `strings` were passed

        3. After all strings have been tokenized and lines handled, the final return 'sentence' is sent to `handle_sentence()`
    
    SPECIAL TOKENS

    There are some 'special tokens' that are assumed to exist for all ISA's as they are a part of the tokenization
    process itself. These tokens will be inserted into the passed `tokens` parameter at the beginning of the list
    (IE: they are the first tokens searched for), except for the 'mismatch' token which is inserted at the end,
    and are inserted in the following order:

        1. String literals (Tokens.STRING_LITERAL) - matches strings which can start/end with matching single or double 
           quotes, and can escape inner quotes with \\' or \\", and can escape the escape character with \\\\. Any extra
           escape characters (not behind a ' or " or \\) will be left as-is.
        2. Disassembler information (Tokens.DISASSEMBLER_INFO) - matches disassembler information of the form "<...>". 
           This info must be within open/close angle brackets. It is also possible to nest angle brackets within the 
           disassembler info up to a maximum current depth of 3. IE: we can match the following:
        
            * "<no angle brackets inside>" - depth of 1
            * "<angle <brackets> depth <2>>" - depth of 2
            * "<level <3 angle <bracket>> depth>" - depth of 3
        
           We also do not check that every open has a matching close, just that every close has a matching open. So, the
           following could still be matched:

            * "<lots of <<<<<<< things>"
        
           However, missing or unmatched ending angle brackets will fail, as well as very deep nesting:

            - "<" : no matching '>' only for the first occurrence of '<'
            - "<data>>" : no matching '<' for both of the '>' brackets
            - "<super<deep<nested<...<thing>>...>>" : too large nesting depth

           String literals are checked first within the disassembler info so that any end brackets '>' within the strings
           won't affect the parsing of the disassembler info.
        
           This limit on nesting depth is present due to the inability for python's re engine to handle recursive matching
           of nested brackets, and I can't think of any way to implement it entirely within re's (which is needed in order
           to continue using the python re tokenization method). I don't see any reason why this would be needed as we
           already go down to a depth of 3 to handle more than what I would expect as output from disassemblers, and if
           the user is inserting information themselves, they could simply input the information within the brackets
           using a different delimiter and parse it themselves by overriding things like `token_disassembler_info()`
           and `handle_disassembler_info` in the `Tokenizer` and `Normalizer` classes respectively. If a larger
           depth is needed, one can manually alter the `_DIS_INFO_MAX_REC_DEPTH` variable at the top of this file. It will
           increase the valid nesting depth at the cost of slower regular expression matching for disassembler info.
        3. Instruction start token "#start_instr#" (Tokens.INSTRUCTION_START) - used to determine when instructions 
           start/stop when using an op-level tokenization scheme. When tokenizing, we need to know when a new instruction 
           is started to decide if an immediate value found should be considered an instruction address or just a plain 
           immediate. New instructions occur whenever we reach a newline token, an instruction start token, or the start 
           of a new string passed in the args of the `tokenize()` method. This instruction start token is removed when 
           found, and won't appear during normalization.
        4. Split immediate token "#split_imm#" (Tokens.SPLIT_IMMEDIATE) - used to designate a split immediate value. This 
           is useful for reducing the number of unique tokens present while keeping full immediate information. When using 
           split immediates during normalization, immediate values with more digits than some threshold will be split into multiple
           immediate tokens and placed one after the other, prepended with this "#split_imm#" token. In order to keep
           that output as renormalizable, the tokenizer, when finding one of these split immediate tokens, will concatenate
           all of the following immediate tokens until reaching some non-immediate (and, non-spacing) token to rebuild
           the original immediate token. This split immediate token is removed when found, and won't appear during normalization
        5. Plus sign (Tokens.PLUS_SIGN) - '+'
        6. Times sign (Tokens.TIMES_SIGN) - '*'
        7. Open bracket (Tokens.OPEN_BRACKET) - '['
        8. Close bracket (Tokens.CLOSE_BRACKET) - ']'
        9. Colon (Tokens.COLON) - ':'
        10. Spacing (Tokens.SPACING) - One or more space ' ', comma ',', or tab '\\t' characters in a row
        11. Newline (Tokens.NEWLINE) - Either the newline character '\\n' or a pipe character '|'
        12. Immediate values (Tokens.IMMEDIATE) - any integer immediate value in hex, decimal, octal, or binary. Hex values
            must start with '0x', octal with '0o', and binary with '0b'
        13. Mismatch token (Tokens.MISMATCH) - matches any character. Inserted at the very end of `tokens` and is used 
            to designate the start of an unknown token or character so that can be handled (by default, an error is raised)

    If you wish to keep some of the above tokens, but overwrite others, you can set that token's regex in the passed
    `tokens` parameter, and that will overwrite these special tokens. You may also set it to None to not insert it at all.

    INSTRUCTION ADDRESSES

    If match_instruction_address=True when tokenizing, the tokenizer will attempt to match instruction addresses at the
    beginning of each line. If there is an immediate value at the start of a line (IE: start of a string in `strings`, 
    or immediately after a Tokens.NEWLINE or Tokens.INSTRUCTION_START [ignoring any Tokens.SPACING]), then that token
    will be converted into a Tokens.INSTRUCTION_ADDRESS token. If there is a Tokens.COLON immediately after that token 
    (again, ignoring any Tokens.SPACING), then that first Tokens.COLON match will be appended to that Tokens.INSTRUCTION_ADDRESS 
    token, removing any Tokens.SPACING inbetween them. For example, using the x86 tokenization scheme:

        - "0x1234: add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234:'), ...]
        - "  0x1234     : add rax rax" -> [(Tokens.SPACING, '  '), (Tokens.INSTRUCTION_ADDRESS, '0x1234:'), ...]
        - "0x1234 add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234'), ...]
    
           
    Parameters
    ----------
    tokens: `Optional[List[Tuple[str, str]]]`
        the tokens to use. Should be a list of 2-tuples. Each tuple is a pair of (name, regex) where
        name is the string name of the token, and regex is a regular expression to find that token. These
        tuples should be ordered in the preferred order to search for tokens. If None, then this will default to 
        self.DEFAULT_TOKENS (which should be set when defining the class)
    token_handlers: `Optional[Dict[str, Callable[[Dict[str, Any]], Union[None, str]]]]`
        optional dictionary mapping token type strings to functions to handle those token types when tokenizing. This is
        intended to be used when you wish to add entirely new token types not present in `bincfg.normalization.base_tokenizer.Tokens`.
        If you wish to change the behavior of handling an already-present token type, just override that token handler function.
        These will override the default token handlers.
    insert_special_tokens: `bool`
        by default, some special tokens will be inserted at the front of `tokens` (see the 'special tokens' listed above).
        If you wish to stop this from happening, you can set `insert_special_tokens` to False
    case_sensitive: `bool`
        If True, then regular expressions will be matched exactly as they appear. If False, then the re.IGNORECASE flag
        will be passed when compiling the regular expressions
    """

    DEFAULT_NEWLINE_TUPLE = (Tokens.NEWLINE, '\n')
    """The default (token_type, token) tuple to use for newlines"""

    ARCHITECTURE = None
    """The architecture this tokenizer works on"""

    def __init__(self, tokens=None, token_handlers=None, insert_special_tokens=True, case_sensitive=False):
        self.tokens = tokens if tokens is not None else self.DEFAULT_TOKENS
        self.case_sensitive = case_sensitive

        # Insert the special tokens, make sure user values override regex's and positions of default tokens, and remove
        #   any tokens that the user has set to None
        if insert_special_tokens:
            user_tokens = set(t[0] for t in self.tokens)
            self.tokens = [t for t in SPECIAL_TOKENS_START if t[0] not in user_tokens] + self.tokens + \
                [t for t in SPECIAL_TOKENS_END if t[0] not in user_tokens]
            self.tokens = [t for t in self.tokens if t[1] is not None]

        self.token_handlers = token_handlers if token_handlers is not None else {}
        self._init_tokenizer()
    
    def _init_tokenizer(self):
        """Initializes the tokenizer from self.tokens"""
        flags = (re.M|re.UNICODE)
        flags = (flags|re.IGNORECASE) if not self.case_sensitive else flags
        self.tokenizer = re.compile('|'.join([("(?P<%s>%s)" % pair) for pair in self.tokens]), flags=flags)
    

[docs]
    def tokenize(self, *strings, newline_tup=_USE_DEFAULT_NT, match_instruction_address=True, **kwargs):
        """Tokenizes the input
        
        Subclasses should override any self.token_* methods they wish to inject behavior into. Each one of those functions
        takes in a 'state' dictionary as input and should return either a new string token or None to use the old token.

        See the docs for :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info on how tokenization
        works, how to create subclasses, etc.

        Args:
            strings (str): arbitrary number of strings to tokenize.
            newline_tup (Optional[Tuple[str, str]]): the tuple to insert inbetween each passed string, or None to not 
                insert anything. Defaults to `self.__class__.DEFAULT_NEWLINE_TUPLE`.
            match_instruction_address (bool, optional): if True, will match instruction addresses. If there is an immediate
                value at the start of a line (IE: start of a string in `strings`, or immediately after a Tokens.NEWLINE
                or Tokens.INSTRUCTION_START [ignoring any Tokens.SPACING]), then that token will be converted into a
                Tokens.INSTRUCTION_ADDRESS token. If there is a Tokens.COLON immediately after that token (again, ignoring
                any Tokens.SPACING), then that first Tokens.COLON match will be appended to that Tokens.INSTRUCTION_ADDRESS 
                token, removing any Tokens.SPACING inbetween them. For example, using the x86 tokenization scheme:

                    - "0x1234: add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234:'), ...]
                    - "  0x1234     : add rax rax" -> [(Tokens.SPACING, '  '), (Tokens.INSTRUCTION_ADDRESS, '0x1234:'), ...]
                    - "0x1234 add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234'), ...]
                
            kwargs (Any): extra kwargs to store in the tokenizer state, for use in child classes

        Returns:
            List[Tuple[str, str]]: list of (token_type, token) tuples
        """
        if len(strings) == 0:
            return []
        
        # Replace the newline_tup if using
        newline_tup = self.DEFAULT_NEWLINE_TUPLE if newline_tup is _USE_DEFAULT_NT else newline_tup
        
        # If the first element starts with an `INSTRUCTION_START` token, then concatenate all of the strings together
        if strings[0].strip().startswith(INSTRUCTION_START_TOKEN):
            strings = [' '.join(strings)]
        
        # Dictionary mapping token types to self functions to handle those tokens. Only have to add tokens to handle
        token_handler_dict = {
            Tokens.INSTRUCTION_ADDRESS: self.token_instruction_address,
            #Tokens.INSTRUCTION_START: None,
            #Tokens.SPLIT_IMMEDIATE: None,
            Tokens.DISASSEMBLER_INFO: self.token_disassembler_info,
            Tokens.NEWLINE: self.token_newline,
            Tokens.SPACING: self.token_spacing,

            Tokens.OPEN_BRACKET: self.token_all_symbols,
            Tokens.CLOSE_BRACKET: self.token_all_symbols,
            Tokens.PLUS_SIGN: self.token_all_symbols,
            Tokens.TIMES_SIGN: self.token_all_symbols,
            Tokens.COLON: self.token_all_symbols,

            Tokens.INSTRUCTION_PREFIX: self.token_instruction_prefix,
            Tokens.OPCODE: self.token_opcode,
            Tokens.REGISTER: self.token_register,
            Tokens.IMMEDIATE: self.token_immediate,

            Tokens.MEMORY_SIZE: self.token_memory_size,
            #Tokens.MEMORY_EXPRESSION: 'memory_expression'
            Tokens.BRANCH_PREDICTION: self.token_branch_prediction,
            Tokens.STRING_LITERAL: self.token_string_literal,

            Tokens.MISMATCH: self.token_mismatch,
        }
        token_handler_dict.update(self.token_handlers)
        
        token_state = {'kwargs': kwargs, 'all_strings': strings, 'token_handlers': token_handler_dict, 'sentence': [], 
                       'newline_tup': newline_tup, 'match_instruction_address': match_instruction_address, 'tokenizer': self}
        
        for string in strings:
            token_state.update({'previous_newline': True, 'split_imm': False, 'line': [], 'string': string, 'matched_ia_colon': False})

            for mo in self.tokenizer.finditer(string):
                token_state['token_type'], token_state['token'], token_state['match'] = mo.lastgroup, mo.group(), mo
                
                # Handle all the various tokens. Keep all tokens
                new_token = token_state['token_handlers'][token_state['token_type']](token_state) if token_state['token_type'] in token_state['token_handlers'] \
                    else token_state['token'] if token_state['token_type'] in [Tokens.SPLIT_IMMEDIATE, Tokens.INSTRUCTION_START] \
                    else self.token_unknown(token_state)
                        
                if new_token is not None:
                    token_state['line'].append((token_state['token_type'], new_token))
            
            # If there are split immediates, parse them out now in line
            token_state['line'] = self._merge_split_immediates(token_state['line'])

            # Check for instruction addresses
            if token_state['match_instruction_address']:
                token_state['line'] = self._check_instruction_address(token_state['line'])

            # Handle the new line and add on the newline_tup if using, and we don't have an empty sentence
            new_line = self.handle_line(token_state)
            token_state['sentence'] += new_line if new_line is not None else token_state['line']
            if token_state['newline_tup'] is not None and len(token_state['sentence']) > 0:
                token_state['sentence'].append(token_state['newline_tup'])
        
        return self.handle_sentence(token_state)

    
    def _merge_split_immediates(self, line):
        """Merges any split immediates into one, ignoring any spacing inbetween them"""
        ret_line, idx = [], 0
        while idx < len(line):
            # If this isn't a split immediate, just go ahead to the next token
            if line[idx][0] not in [Tokens.SPLIT_IMMEDIATE]:
                ret_line.append(line[idx])
                idx += 1
                continue

            # This is a split immediate token, scan ahead for all immediate tokens (including instruction address), 
            #   ignoring spacing, until reaching a non-immediate token. Concatenate these into a single immediate value
            imm_inds = scan_for_token(line, type=Tokens.IMMEDIATE, ignore_type=Tokens.SPACING, stop_unmatched=True,
                                      start=idx + 1, max_matches=None, wrap=False, on_no_match=[])
            if len(imm_inds) > 0:
                ret_line.append((line[imm_inds[0]][0], ''.join([line[i][1] for i in imm_inds])))
            idx = max(imm_inds + [idx]) + 1
        
        return ret_line
    
    def _check_instruction_address(self, line):
        """Scans through the current line checking for possible instruction address locations
        
        A candidate location is the very first token of `line`, or the position right after any Tokens.NEWLINE or 
        Tokens.INSTRUCTION_START token. If `scan_for_token` finds a Tokens.IMMEDIATE there (ignoring Tokens.SPACING)
        whose integer value is greater than 0, it is replaced with a Tokens.INSTRUCTION_ADDRESS token. A Tokens.COLON 
        found directly after it (again ignoring Tokens.SPACING) has its string appended to that token.

        Args:
            line (List[Tuple[str, str]]): list of (token_type, token) tuples
        
        Returns:
            List[Tuple[str, str]]: new list of (token_type, token) tuples
        """
        _newlines = [Tokens.NEWLINE, Tokens.INSTRUCTION_START]
        ret_line, idx = [], 0
        while idx < len(line):
            token_tup, token_type = line[idx], line[idx][0]

            # If we are at the start of a line, check for instruction address
            if idx == 0 or token_type in _newlines:
                # Start the scan after the current token when it is a newline/instruction start, otherwise (first
                #   token of the line) the current token is itself a candidate
                # NOTE(review): presumably `scan_for_token` returns a list of indices into `line` - confirm
                inst_addr_inds = scan_for_token(line, type=Tokens.IMMEDIATE, ignore_type=Tokens.SPACING, stop_unmatched=True, 
                                                start=(idx+1) if token_type in _newlines else idx, wrap=False, on_no_match=[], ret_list=True)
                
                # Immediates whose value is not greater than 0 are left as plain immediates
                if len(inst_addr_inds) > 0 and imm_to_int(line[inst_addr_inds[0]][1]) > 0:
                    # Check for possible colon token
                    inst_addr_inds += scan_for_token(line, type=Tokens.COLON, ignore_type=Tokens.SPACING, stop_unmatched=True, 
                                                     start=max(inst_addr_inds)+1, wrap=False, on_no_match=[], ret_list=True)
                    
                    # Build the instruction address token. Insert any tokens before the start if present
                    # The range starts at idx (keeping the current token) when the current token is a newline,
                    #   instruction start, or spacing. Otherwise the current token is the immediate itself and
                    #   the range is empty
                    for i in range(idx if token_type in _newlines + [Tokens.SPACING] else (idx + 1), min(inst_addr_inds)):
                        ret_line.append(line[i])
                    # Only the matched immediate/colon strings are joined, so any spacing between them is dropped
                    token_tup = (Tokens.INSTRUCTION_ADDRESS, ''.join([line[i][1] for i in inst_addr_inds]))
                    # Jump to the last consumed token, the increment below then moves past it
                    idx = max(inst_addr_inds)

            # Add in this token and increment the index
            ret_line.append(token_tup)
            idx += 1

        return ret_line
    

[docs]
    def handle_line(self, state):
        """Hook called once per string passed to the tokenizer, after all of its tokens have been handled

        The line may itself contain newline tokens, but no newline_tup has been added to it yet.

        Subclasses may override this to post-process a line. The default implementation returns the line untouched.

        Args:
            state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            List[Tuple[str, str]]: list of (token_type, token) tuples for this line
        """
        current_line = state['line']
        return current_line

    

[docs]
    def handle_sentence(self, state):
        """Hook called once per `tokenize()` call, after every string has been tokenized and its line handled

        By this point the sentence already contains any newline_tup's that were added (if using).

        Subclasses may override this to post-process the full result. The default implementation returns the
        sentence untouched.

        Args:
            state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            List[Tuple[str, str]]: the final list of tokens
        """
        full_sentence = state['sentence']
        return full_sentence



[docs]
    def token_branch_prediction(self, state):
        """Token handler for branch_prediction tokens

        Subclasses may override this for more functionality. The default keeps the matched token string as-is.

        Args:
            state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            Union[str, None]: the string to append to the current line for this token, or None to append nothing
        """
        matched = state['token']
        return matched



[docs]
    def token_immediate(self, state):
        """Token handler for immediate tokens

        Subclasses may override this for more functionality. The default keeps the matched token string as-is.

        Args:
            state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            Union[str, None]: the string to append to the current line for this token, or None to append nothing
        """
        matched = state['token']
        return matched



[docs]
    def token_instruction_prefix(self, state):
        """Token handler for instruction_prefix tokens

        Subclasses may override this for more functionality. The default keeps the matched token string as-is.

        Args:
            state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            Union[str, None]: the string to append to the current line for this token, or None to append nothing
        """
        matched = state['token']
        return matched



[docs]
    def token_memory_size(self, state):
        """Token handler for memory_size tokens

        Subclasses may override this for more functionality. The default keeps the matched token string as-is.

        Args:
            state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            Union[str, None]: the string to append to the current line for this token, or None to append nothing
        """
        matched = state['token']
        return matched



[docs]
    def token_opcode(self, state):
        """Token handler for opcode tokens

        Subclasses may override this for more functionality. The default keeps the matched token string as-is.

        Args:
            state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            Union[str, None]: the string to append to the current line for this token, or None to append nothing
        """
        matched = state['token']
        return matched



[docs]
    def token_register(self, state):
        """Token handler for register tokens

        Subclasses may override this for more functionality. The default keeps the matched token string as-is.

        Args:
            state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            Union[str, None]: the string to append to the current line for this token, or None to append nothing
        """
        matched = state['token']
        return matched



[docs]
    def token_disassembler_info(self, state):
        """Token handler for disassembler information tokens

        Subclasses may override this for more functionality. The default keeps the matched token string as-is.

        Args:
            state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            Union[str, None]: the string to append to the current line for this token, or None to append nothing
        """
        matched = state['token']
        return matched

    

[docs]
    def token_string_literal(self, state):
        """Token handler for string literal tokens

        Subclasses may override this for more functionality. The default keeps the matched token string as-is.

        Args:
            state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            Union[str, None]: the string to append to the current line for this token, or None to append nothing
        """
        matched = state['token']
        return matched

    

[docs]
    def token_spacing(self, state):
        """Token handler for spacing tokens

        Subclasses may override this for more functionality. The default keeps the matched token string as-is.

        Args:
            state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            Union[str, None]: the string to append to the current line for this token, or None to append nothing
        """
        matched = state['token']
        return matched

    

[docs]
    def token_newline(self, state):
        """Process a newline token

        Hook for subclasses that want to transform newline tokens. The base implementation passes the matched token
        through unchanged.

        Args:
            state (Dict): dictionary of current state; only ``state['token']`` is read here. See 
                :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            Union[str, None]: the string token to append to the line. Overriding implementations may return None to 
                not append anything
        """
        newline_token = state['token']
        return newline_token

    

[docs]
    def token_instruction_address(self, state):
        """Process an instruction address token

        Hook for subclasses that want to transform instruction addresses. The base implementation passes the matched
        token through unchanged.

        Args:
            state (Dict): dictionary of current state; only ``state['token']`` is read here. See 
                :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            Union[str, None]: the string token to append to the line. Overriding implementations may return None to 
                not append anything
        """
        address_token = state['token']
        return address_token

    

[docs]
    def token_all_symbols(self, state):
        """Process a symbol token ('+', '*', '[', ']', ':')

        Hook for subclasses that want to transform symbol tokens. The base implementation passes the matched token
        through unchanged for every symbol, colons included.

        NOTE(review): the merging of a ':' into a preceding Tokens.INSTRUCTION_ADDRESS token (described under the
        `match_instruction_address` argument of ``__call__``) is not performed by this method; presumably it happens 
        in the tokenize loop -- confirm there.

        Args:
            state (Dict): dictionary of current state; only ``state['token']`` is read here. See 
                :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Returns:
            Union[str, None]: the string token to append to the line. Overriding implementations may return None to 
                not append anything
        """
        symbol_token = state['token']
        return symbol_token

    

[docs]
    def token_mismatch(self, state):
        """Handle text in the input string that matched no known token

        Subclasses can override this for more functionality, but the default is to raise a ``TokenMismatchError``
        describing the offending token, where it starts, and the string being tokenized.

        Args:
            state (Dict): dictionary of current state. This implementation reads ``state['token']``, ``state['match']``
                (its ``.start()`` gives the reported index) and ``state['string']``. See 
                :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Raises:
            TokenMismatchError: by default
        """
        # Collect the pieces in the same order they appear in the message
        bad_token = state['token']
        mismatch_index = state['match'].start()
        source_repr = repr(state['string'])

        message = "Mismatched token '%s' at index %d in string: %s" % (bad_token, mismatch_index, source_repr)
        raise TokenMismatchError(message)

    

[docs]
    def token_unknown(self, state):
        """Handle a token whose type has no known handler

        Subclasses can override this for more functionality, but the default is to raise an ``UnknownTokenError``
        reporting the unrecognized token type and the token itself.

        Args:
            state (Dict): dictionary of current state. This implementation reads ``state['token_type']`` and 
                ``state['token']``. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info

        Raises:
            UnknownTokenError: by default
        """
        type_repr = repr(state['token_type'])
        token_repr = repr(state['token'])

        message = "Unknown token type %s, token: %s" % (type_repr, token_repr)
        raise UnknownTokenError(message)


    def __call__(self, *strings, newline_tup=_USE_DEFAULT_NT, match_instruction_address=True, **kwargs):
        """Tokenize the given strings by delegating to :meth:`tokenize`

        Calling the tokenizer object is shorthand for calling ``self.tokenize`` with exactly the same arguments.

        To customize behavior, subclasses override whichever self.token_* methods they need. Each of those methods 
        receives the 'state' dictionary and returns the string token to emit (see each handler's documentation for 
        what a None return means).

        See the docs for :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info on how tokenization
        works, how to create subclasses, etc.

        Args:
            strings (str): arbitrary number of strings to tokenize.
            newline_tup (Optional[Tuple[str, str]]): the tuple to insert in between each passed string, or None to not 
                insert anything. Defaults to `self.__class__.DEFAULT_NEWLINE_TUPLE`.
            match_instruction_address (bool, optional): if True, will match instruction addresses. An immediate value
                at the start of a line (IE: the start of a string in `strings`, or immediately after a Tokens.NEWLINE
                or Tokens.INSTRUCTION_START, ignoring any Tokens.SPACING) is converted into a 
                Tokens.INSTRUCTION_ADDRESS token. If a Tokens.COLON immediately follows that token (again ignoring any
                Tokens.SPACING), that first colon, along with any Tokens.SPACING in between, is appended to the 
                Tokens.INSTRUCTION_ADDRESS token. For example, using the x86 tokenization scheme:

                    - "0x1234: add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234:'), ...]
                    - "  0x1234     : add rax rax" -> [(Tokens.SPACING, '  '), (Tokens.INSTRUCTION_ADDRESS, '0x1234     :'), ...]
                    - "0x1234 add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234'), ...]

            kwargs (Any): extra kwargs to store in the tokenizer state, for use in child classes

        Returns:
            List[Tuple[str, str]]: list of (token_type, token) tuples
        """
        tokenized = self.tokenize(
            *strings,
            newline_tup=newline_tup,
            match_instruction_address=match_instruction_address,
            **kwargs
        )
        return tokenized
    
    def __repr__(self) -> str:
        """Return ``ClassName(...)`` with a short description of the token set

        The parentheses are empty when ``self.tokens`` equals ``self.DEFAULT_TOKENS`` (per ``eq_obj``), contain a 
        literal ``...`` when the repr of the tokens is longer than 30 characters, and otherwise contain 
        ``tokens=<repr of tokens>``.
        """
        if eq_obj(self.tokens, self.DEFAULT_TOKENS):
            shown = ''
        else:
            tokens_repr = repr(self.tokens)
            # Long token sets are elided so the repr stays on one short line
            shown = '...' if len(tokens_repr) > 30 else 'tokens=' + tokens_repr
        return "%s(%s)" % (self.__class__.__name__, shown)
    
    def __str__(self) -> str:
        return self.__class__.__name__
    
    def __eq__(self, other):
        return type(self) == type(other) and eq_obj(self.tokens, other.tokens) and self.case_sensitive == other.case_sensitive \
            and eq_obj(self.token_handlers, other.token_handlers)
    
    def __hash__(self):
        """Return an integer hash of this tokenizer

        The hash is computed by ``hash_obj`` (with ``return_int=True``) over the class name, ``tokens``, 
        ``case_sensitive`` and ``token_handlers``, the same attributes that ``__eq__`` compares.
        """
        identity = [
            type(self).__name__,
            self.tokens,
            self.case_sensitive,
            self.token_handlers,
        ]
        return hash_obj(identity, return_int=True)