Source code for bincfg.normalization.base_tokenizer

"""Class for tokenizing assembly lines, as well as other tokenization constants"""

import re
from ..utils import ParameterSaver, eq_obj, hash_obj
from .norm_utils import *
from enum import Enum
from ..utils.type_utils import *


# Token type names
class Tokens:
    """Namespace holding the string name of every token type

    These names are used as the regex group names when the tokenizer pattern is compiled, and as the first element
    of each (token_type, token) tuple produced while tokenizing.
    """

    # Tokens that only exist as part of the tokenization process itself
    INSTRUCTION_ADDRESS = 'inst_addr'
    INSTRUCTION_START = 'inst_start'
    SPLIT_IMMEDIATE = 'split_imm'
    DISASSEMBLER_INFO = 'disassembler_info'

    # Whitespace and line structure
    NEWLINE = 'newline'
    SPACING = 'spacing'

    # Single-character symbols
    OPEN_BRACKET = 'open_bracket'
    CLOSE_BRACKET = 'close_bracket'
    PLUS_SIGN = 'plus_sign'
    TIMES_SIGN = 'times_sign'
    COLON = 'colon'

    # Pieces of an instruction
    INSTRUCTION_PREFIX = 'prefix'
    OPCODE = 'opcode'
    REGISTER = 'register'
    IMMEDIATE = 'immediate'
    MEMORY_SIZE = 'memory_size'
    MEMORY_EXPRESSION = 'memory_expression'
    BRANCH_PREDICTION = 'branch_prediction'
    STRING_LITERAL = 'string_literal'
    SEGMENT_ADDRESS = 'segment_address'

    # Catch-all for text that matched nothing else
    MISMATCH = 'mismatch'
class TokenizationLevel(Enum):
    """Different levels to perform tokenization

    The value of each member is the list of lowercase string aliases that select it.
    """

    OPCODE = [
        'op', 'opcode', 'operand', 'opcodes', 'operands',
    ]
    INSTRUCTION = [
        'inst', 'instruction', 'line', 'instructions', 'lines',
    ]
    AUTO = [
        'auto', 'automatic', 'default',
    ]
class Architectures(Enum):
    """Known (but not necessarily supported) architectures

    The value of each member is the list of lowercase string aliases that identify it.
    """

    X86 = [
        'x86', 'i686', 'x86_64',
    ]
    JAVA = [
        'java', 'java_bytecode',
    ]
def get_architecture(arch: 'Union[str, Architectures]') -> 'Architectures':
    """Resolves an architecture name (or enum member) to an ``Architectures`` member

    Strings are lowercased and have '-' replaced with '_' before being compared against each member's aliases.

    Args:
        arch (Union[str, Architectures]): an ``Architectures`` member (returned unchanged), or a string alias

    Returns:
        Architectures: the matching architecture

    Raises:
        ValueError: if `arch` is a string that is not an alias of any known architecture
        TypeError: if `arch` is neither a string nor an ``Architectures`` member
    """
    # Already resolved, nothing to do
    if isinstance(arch, Architectures):
        return arch

    if not isinstance(arch, str):
        raise TypeError("Cannot get architecture from object of type: %s" % repr(type(arch).__name__))

    normalized = arch.lower().replace('-', '_')
    for candidate in Architectures:
        if normalized in candidate.value:
            return candidate

    raise ValueError("Unknown architecture string: %s" % repr(normalized))
class TokenMismatchError(Exception):
    """Raised by default when part of a string matches no known token (see ``BaseTokenizer.token_mismatch``)"""
class UnknownTokenError(Exception):
    """Raised by default when a token type has no handler (see ``BaseTokenizer.token_unknown``)"""
# Special tokens to insert into tokenizer by default SPECIAL_TOKENS_START = [ (Tokens.STRING_LITERAL, RE_STRING_LITERAL), (Tokens.DISASSEMBLER_INFO, RE_DISASSEMBLER_INFO), (Tokens.INSTRUCTION_START, INSTRUCTION_START_TOKEN), (Tokens.SPLIT_IMMEDIATE, SPLIT_IMMEDIATE_TOKEN), (Tokens.PLUS_SIGN, RE_PLUS_SIGN), (Tokens.TIMES_SIGN, RE_TIMES_SIGN), (Tokens.OPEN_BRACKET, RE_OPEN_BRACKET), (Tokens.CLOSE_BRACKET, RE_CLOSE_BRACKET), (Tokens.COLON, RE_COLON), (Tokens.SPACING, RE_SPACING), (Tokens.NEWLINE, RE_NEWLINE), (Tokens.IMMEDIATE, RE_IMMEDIATE), ] SPECIAL_TOKENS_END = [ (Tokens.MISMATCH, r'.'), ] # An indication that the default newline tuple should be used when tokenizing/normalizing _USE_DEFAULT_NT = object()
def parse_tokenization_level(tokenization_level, auto_tl):
    """Resolves a tokenization level (string alias or enum member) to a concrete ``TokenizationLevel``

    String inputs are lowercased and have '-' replaced with '_' before being compared against each member's aliases.

    Args:
        tokenization_level (Union[bincfg.TokenizationLevel, str]): either a string tokenization level, or a member
            of the ``bincfg.TokenizationLevel`` enum
        auto_tl (bincfg.TokenizationLevel): the tokenization level to return if `tokenization_level` resolves to
            ``TokenizationLevel.AUTO``. Cannot itself be ``TokenizationLevel.AUTO``

    Returns:
        bincfg.TokenizationLevel: a member of the ``bincfg.TokenizationLevel`` enum (never ``AUTO``)

    Raises:
        TypeError: if `auto_tl` is not a non-AUTO ``TokenizationLevel``, or if `tokenization_level` is neither a
            string nor a ``TokenizationLevel``
        ValueError: if `tokenization_level` is a string that is not an alias of any level
    """
    # The fallback is validated first, even if it ends up not being needed
    if not isinstance(auto_tl, TokenizationLevel) or auto_tl == TokenizationLevel.AUTO:
        raise TypeError("`auto_tl` must be a bincfg.TokenizationLevel, and cannot be bincfg.TokenizationLevel.AUTO. Got: %s" % repr(auto_tl))

    if isinstance(tokenization_level, str):
        wanted = tokenization_level.lower().replace('-', '_')
        level = next((lvl for lvl in TokenizationLevel if wanted in lvl.value), None)
        if level is None:
            raise ValueError("Unknown tokenization_level string: '%s'" % tokenization_level)
    elif isinstance(tokenization_level, TokenizationLevel):
        level = tokenization_level
    else:
        raise TypeError("Unknown tokenization_level type: '%s'" % type(tokenization_level))

    # Check for auto
    return auto_tl if level is TokenizationLevel.AUTO else level
class BaseTokenizer(metaclass=ParameterSaver):
    """A default class to tokenize instructions

    Should be subclassed once for each instruction set, providing the tokens being used.

    Many functions may be overridden to change tokenization behavior. These functions all start with the name
    `token\\_...` and take as input a single state dictionary and return either a string for the next token to append
    to the current line being tokenized, or None to not add anything to the line. The state dictionary contains the
    following:

        - 'tokenizer' (BaseTokenizer): this tokenizer
        - 'kwargs' (Dict[str, Any]): dictionary of extra kwargs passed to the initial call to the `tokenize` function
        - 'all_strings' (List[str]): list of input strings (args) passed to the initial call to the `tokenize`
          function
        - 'token_handlers' (Dict[str, Callable[]]): dictionary mapping token types to the function that handles that
          token
        - 'sentence' (List[Tuple[str, str]]): list of processed token tuples to return, each a tuple of
          (token\\_name, token)
        - 'newline_tup' (Union[None, Tuple[str, str]]): token tuple to add at the end of each line to indicate a new
          line
        - 'match_instruction_address' (bool): whether or not we are matching instruction addresses
        - 'split_imm' (bool): whether or not we are currently handling an immediate token that was split
        - 'line' (List[Tuple[str, str]]): the current line of tokens we are working on
        - 'string' (str): the current string being tokenized
        - 'token_type' (str): the type of the 'token', should be from `bincfg.normalization.base_tokenizer.Tokens`
        - 'token' (str): the currently matched token string
        - 'match' (re.Match): the re match object that matched this token

    Some extra functions are available for overriding including:

        - handle_line(): called at the end of each line being tokenized (an individual string passed to the
          tokenizer)
        - handle_sentence(): called at the end of each sentence being tokenized (aggregation of all lines passed to
          the tokenizer)

    Each instruction set architecture (ISA) should have its own ``Tokenizer`` class that inherits from
    ``BaseTokenizer``. The tokenization process uses python's ``re`` module to perform tokenization, converting
    strings into streams of (token\\_name, token\\_string) tuples. For more information on how to use regex to create
    tokenizers, see: https://docs.python.org/3/library/re.html#writing-a-tokenizer


    TOKENIZATION PROCESS

    1. Clean the incoming instruction strings using the passed `clean_instruction_func`
    2. Iterate through the strings finding all tokens

        a. Each token is sent to its corresponding token handler function
        b. At the end of each 'line' (EG: end of a passed `string`, reaching Tokens.NEWLINE token, etc.), that line
           is handled with the `handle_line()` function
        c. All tokens are added to the same return 'sentence', even if multiple strings in `strings` were passed

    3. After all strings have been tokenized and lines handled, the final return 'sentence' is sent to
       `handle_sentence()`


    SPECIAL TOKENS

    There are some 'special tokens' that are assumed to exist for all ISA's as they are a part of the tokenization
    process itself. These tokens will be inserted into the passed `tokens` parameter at the beginning of the list
    (IE: they are the first tokens searched for), except for the 'mismatch' token which is inserted at the end, and
    are inserted in the following order:

    1. String literals (Tokens.STRING_LITERAL) - matches strings which can start/end with matching single or double
       quotes, and can escape inner quotes with \\' or \\", and can escape the escape character with \\\\. Any extra
       escape characters (not behind a ' or " or \\) will be left as-is.
    2. Disassembler information (Tokens.DISASSEMBLER_INFO) - matches disassembler information of the form "<...>".
       This info must be within open/close angle brackets. It is also possible to nest angle brackets within the
       disassembler info up to a maximum current depth of 3. IE: we can match the following:

        * "<no angle brackets inside>" - depth of 1
        * "<angle <brackets> depth <2>>" - depth of 2
        * "<level <3 angle <bracket>> depth>" - depth of 3

       We also do not check that every open has a matching close, just that every close has a matching open. So, the
       following could still be matched:

        * "<lots of <<<<<<< things>"

       However, missing or unmatched ending angle brackets will fail, as well as very deep nesting:

        - "<" : no matching '>' only for the first occurrence of '<'
        - "<data>>" : no matching '<' for both of the '>' brackets
        - "<super<deep<nested<...<thing>>...>>" : too large nesting depth

       String literals are checked first within the disassembler info so that any end brackets '>' within the
       strings won't affect the parsing of the disassembler info.

       This limit on nesting depth is present due to the inability for python's re engine to handle recursive
       matching of nested brackets, and I can't think of any way to implement it entirely within re's (which is
       needed in order to continue using the python re tokenization method). I don't see any reason why this would
       be needed as we already go down to a depth of 3 to handle more than what I would expect as output from
       disassemblers, and if the user is inserting information themselves, they could simply input the information
       within the brackets using a different delimiter and parse it themselves by overriding things like
       `token_disassembler_info()` and `handle_disassembler_info` in the `Tokenizer` and `Normalizer` classes
       respectively.

       If a larger depth is needed, one can manually alter the `_DIS_INFO_MAX_REC_DEPTH` variable (it is not defined
       in this module; presumably it lives alongside the imported `RE_*` patterns in `norm_utils`). It will increase
       the valid nesting depth at the cost of slower regular expression matching for disassembler info.
    3. Instruction start token "#start_instr#" (Tokens.INSTRUCTION_START) - used to determine when instructions
       start/stop when using an op-level tokenization scheme. When tokenizing, we need to know when a new
       instruction is started to decide if an immediate value found should be considered an instruction address or
       just a plain immediate. New instructions occur whenever we reach a newline token, an instruction start token,
       or the start of a new string passed in the args of the `tokenize()` method. This instruction start token is
       removed when found, and won't appear during normalization.
    4. Split immediate token "#split_imm#" (Tokens.SPLIT_IMMEDIATE) - used to designate a split immediate value.
       This is useful for reducing the number of unique tokens present while keeping full immediate information.
       When using split immediates during normalization, immediate values with more digits than some threshold will
       be split into multiple immediate tokens and placed one after the other, prepended with this "#split_imm#"
       token. In order to keep that output as renormalizable, the tokenizer, when finding one of these split
       immediate tokens, will concatenate all of the following immediate tokens until reaching some non-immediate
       (and, non-spacing) token to rebuild the original immediate token. This split immediate token is removed when
       found, and won't appear during normalization
    5. Plus sign (Tokens.PLUS_SIGN) - '+'
    6. Times sign (Tokens.TIMES_SIGN) - '*'
    7. Open bracket (Tokens.OPEN_BRACKET) - '['
    8. Close bracket (Tokens.CLOSE_BRACKET) - ']'
    9. Colon (Tokens.COLON) - ':'
    10. Spacing (Tokens.SPACING) - One or more space ' ', comma ',', or tab '\\t' characters in a row
    11. Newline (Tokens.NEWLINE) - Either the newline character '\\n' or a pipe character '|'
    12. Immediate values (Tokens.IMMEDIATE) - any integer immediate value in hex, decimal, octal, or binary. Hex
        values must start with '0x', octal with '0o', and binary with '0b'
    13. Mismatch token (Tokens.MISMATCH) - matches any character. Inserted at the very end of `tokens` and is used
        to designate the start of an unknown token or character so that can be handled (by default, an error is
        raised)

    If you wish to keep some of the above tokens, but overwrite others, you can set that token's regex in the passed
    `tokens` parameter, and that will overwrite these special tokens. You may also set it to None to not insert it
    at all.


    INSTRUCTION ADDRESSES

    If match_instruction_address=True when tokenizing, the tokenizer will attempt to match instruction addresses at
    the beginning of each line. If there is an immediate value at the start of a line (IE: start of a string in
    `strings`, or immediately after a Tokens.NEWLINE or Tokens.INSTRUCTION_START [ignoring any Tokens.SPACING]),
    then that token will be converted into a Tokens.INSTRUCTION_ADDRESS token. If there is a Tokens.COLON
    immediately after that token (again, ignoring any Tokens.SPACING), then that first Tokens.COLON match will be
    appended to that Tokens.INSTRUCTION_ADDRESS token, removing any Tokens.SPACING inbetween them.

    For example, using the x86 tokenization scheme:

        - "0x1234: add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234:'), ...]
        - " 0x1234 : add rax rax" -> [(Tokens.SPACING, ' '), (Tokens.INSTRUCTION_ADDRESS, '0x1234:'), ...]
        - "0x1234 add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234'), ...]


    Parameters
    ----------
    tokens: `Optional[List[Tuple[str, str]]]`
        the tokens to use. Should be a list of 2-tuples. Each tuple is a pair of (name, regex) where name is the
        string name of the token, and regex is a regular expression to find that token. These tuples should be
        ordered in the preferred order to search for tokens. If None, then this will default to self.DEFAULT_TOKENS
        (which should be set when defining the class)
    token_handlers: `Optional[Dict[str, Callable[[Dict[str, Any]], Union[None, str]]]]`
        optional dictionary mapping token type strings to functions to handle those token types when tokenizing.
        This is intended to be used when you wish to add entirely new token types not present in
        `bincfg.normalization.base_tokenizer.Tokens`. If you wish to change the behavior of handling an
        already-present token type, just override that token handler function. These will override the default
        token handlers.
    insert_special_tokens: `bool`
        by default, some special tokens will be inserted at the front of `tokens` (see the 'special tokens' listed
        above). If you wish to stop this from happening, you can set `insert_special_tokens` to False
    case_sensitive: `bool`
        If True, then regular expressions will be matched exactly as they appear. If False, then the re.IGNORECASE
        flag will be passed when compiling the regular expressions
    """

    DEFAULT_NEWLINE_TUPLE = (Tokens.NEWLINE, '\n')
    """The default (token_type, token) tuple to use for newlines"""

    ARCHITECTURE = None
    """The architecture this tokenizer works on"""

    def __init__(self, tokens=None, token_handlers=None, insert_special_tokens=True, case_sensitive=False):
        """Builds the token list and compiles the tokenizer regex (see the class docstring for the parameters)"""
        # Copy into a new list: this lets any sequence (EG: a tuple) be passed, and makes sure this instance never
        #   shares (and so can never accidentally change) the caller's list or the class-level DEFAULT_TOKENS
        self.tokens = list(tokens if tokens is not None else self.DEFAULT_TOKENS)
        self.case_sensitive = case_sensitive

        # Insert the special tokens, and make sure user values override regex's and positions of default tokens
        if insert_special_tokens:
            user_tokens = {t[0] for t in self.tokens}
            self.tokens = [t for t in SPECIAL_TOKENS_START if t[0] not in user_tokens] + self.tokens + \
                [t for t in SPECIAL_TOKENS_END if t[0] not in user_tokens]

        # Remove any tokens that the user has set to None. Done no matter what `insert_special_tokens` is, since a
        #   None regex must never reach the compiled pattern (it would be formatted in as the literal text 'None')
        self.tokens = [t for t in self.tokens if t[1] is not None]

        self.token_handlers = token_handlers if token_handlers is not None else {}

        self._init_tokenizer()

    def _init_tokenizer(self):
        """Initializes the tokenizer from self.tokens

        Compiles one regular expression made of a named group per token, joined as alternatives in the order the
        tokens appear in self.tokens, and stores it in self.tokenizer.
        """
        flags = re.M | re.UNICODE
        if not self.case_sensitive:
            flags |= re.IGNORECASE

        # tuple() so that (name, regex) pairs given as lists format the same way as tuples
        self.tokenizer = re.compile('|'.join("(?P<%s>%s)" % tuple(pair) for pair in self.tokens), flags=flags)
def tokenize(self, *strings, newline_tup=_USE_DEFAULT_NT, match_instruction_address=True, **kwargs):
    """Tokenizes the input

    Subclasses should override any self.token_* methods they wish to inject behavior into. Each one of those
    functions takes in a 'state' dictionary as input and should return either the string token to append to the
    current line, or None to not append anything for that match.

    See the docs for :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer` for more info on how tokenization
    works, how to create subclasses, etc.

    Args:
        strings (str): arbitrary number of strings to tokenize. If none are passed, an empty list is returned
            immediately (without calling `handle_sentence()`). If the first string (ignoring surrounding
            whitespace) starts with the instruction start token, all strings are joined with a single space and
            tokenized as one string.
        newline_tup (Optional[Tuple[str, str]]): the tuple appended to the sentence after the tokens of each passed
            string (only while the sentence is non-empty), or None to not insert anything. Defaults to
            `self.DEFAULT_NEWLINE_TUPLE`.
        match_instruction_address (bool, optional): if True, will match instruction addresses. If there is an
            immediate value at the start of a line (IE: start of a string in `strings`, or immediately after a
            Tokens.NEWLINE or Tokens.INSTRUCTION_START [ignoring any Tokens.SPACING]), then that token will be
            converted into a Tokens.INSTRUCTION_ADDRESS token. If there is a Tokens.COLON immediately after that
            token (again, ignoring any Tokens.SPACING), then that first Tokens.COLON match will be appended to that
            Tokens.INSTRUCTION_ADDRESS token, removing any Tokens.SPACING inbetween them.

            For example, using the x86 tokenization scheme:

                - "0x1234: add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234:'), ...]
                - " 0x1234 : add rax rax" -> [(Tokens.SPACING, ' '), (Tokens.INSTRUCTION_ADDRESS, '0x1234:'), ...]
                - "0x1234 add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234'), ...]

        kwargs (Any): extra kwargs to store in the tokenizer state, for use in child classes

    Returns:
        List[Tuple[str, str]]: list of (token_type, token) tuples, as returned by `handle_sentence()`
    """
    if len(strings) == 0:
        return []

    # Replace the newline_tup if using
    newline_tup = self.DEFAULT_NEWLINE_TUPLE if newline_tup is _USE_DEFAULT_NT else newline_tup

    # If the first element starts with an `INSTRUCTION_START` token, then concatenate all of the strings together
    if strings[0].strip().startswith(INSTRUCTION_START_TOKEN):
        strings = [' '.join(strings)]

    # Dictionary mapping token types to self functions to handle those tokens. Only have to add tokens to handle
    token_handler_dict = {
        Tokens.INSTRUCTION_ADDRESS: self.token_instruction_address,
        #Tokens.INSTRUCTION_START: None,
        #Tokens.SPLIT_IMMEDIATE: None,
        Tokens.DISASSEMBLER_INFO: self.token_disassembler_info,
        Tokens.NEWLINE: self.token_newline,
        Tokens.SPACING: self.token_spacing,
        Tokens.OPEN_BRACKET: self.token_all_symbols,
        Tokens.CLOSE_BRACKET: self.token_all_symbols,
        Tokens.PLUS_SIGN: self.token_all_symbols,
        Tokens.TIMES_SIGN: self.token_all_symbols,
        Tokens.COLON: self.token_all_symbols,
        Tokens.INSTRUCTION_PREFIX: self.token_instruction_prefix,
        Tokens.OPCODE: self.token_opcode,
        Tokens.REGISTER: self.token_register,
        Tokens.IMMEDIATE: self.token_immediate,
        Tokens.MEMORY_SIZE: self.token_memory_size,
        #Tokens.MEMORY_EXPRESSION: 'memory_expression'
        Tokens.BRANCH_PREDICTION: self.token_branch_prediction,
        Tokens.STRING_LITERAL: self.token_string_literal,
        Tokens.MISMATCH: self.token_mismatch,
    }
    # Handlers passed to the constructor take priority over the defaults above
    token_handler_dict.update(self.token_handlers)

    # State shared across the whole call. The same dictionary object is passed to every handler.
    token_state = {'kwargs': kwargs, 'all_strings': strings, 'token_handlers': token_handler_dict, 'sentence': [],
                   'newline_tup': newline_tup, 'match_instruction_address': match_instruction_address, 'tokenizer': self}

    for string in strings:
        # Reset the per-string state. NOTE(review): 'previous_newline' and 'matched_ia_colon' are not read anywhere
        #   in this method; presumably they are there for subclass handlers - confirm before removing
        token_state.update({'previous_newline': True, 'split_imm': False, 'line': [], 'string': string, 'matched_ia_colon': False})

        for mo in self.tokenizer.finditer(string):
            token_state['token_type'], token_state['token'], token_state['match'] = mo.lastgroup, mo.group(), mo

            # Handle all the various tokens. Keep all tokens. Token types with a handler go through that handler,
            #   split immediate/instruction start tokens are kept as-is, and anything else goes to token_unknown()
            new_token = token_state['token_handlers'][token_state['token_type']](token_state) if token_state['token_type'] in token_state['token_handlers'] \
                else token_state['token'] if token_state['token_type'] in [Tokens.SPLIT_IMMEDIATE, Tokens.INSTRUCTION_START] \
                else self.token_unknown(token_state)

            # 'token_type' is read from the state again here, after the handler has run, so whatever value the
            #   state holds at this point is the type that gets recorded
            if new_token is not None:
                token_state['line'].append((token_state['token_type'], new_token))

        # If there are split immediates, parse them out now in line
        token_state['line'] = self._merge_split_immediates(token_state['line'])

        # Check for instruction addresses
        if token_state['match_instruction_address']:
            token_state['line'] = self._check_instruction_address(token_state['line'])

        # Handle the new line and add on the newline_tup if using, and we don't have an empty sentence
        new_line = self.handle_line(token_state)
        token_state['sentence'] += new_line if new_line is not None else token_state['line']
        if token_state['newline_tup'] is not None and len(token_state['sentence']) > 0:
            token_state['sentence'].append(token_state['newline_tup'])

    return self.handle_sentence(token_state)
def _merge_split_immediates(self, line):
    """Merges any split immediates into one, ignoring any spacing inbetween them

    Args:
        line (List[Tuple[str, str]]): list of (token_type, token) tuples

    Returns:
        List[Tuple[str, str]]: a new list in which each split immediate token, together with the tokens scanned
            after it, is replaced by one token made of the concatenated immediate strings. All other tokens are
            copied over unchanged.
    """
    ret_line, idx = [], 0
    while idx < len(line):

        # If this isn't a split immediate, just go ahead to the next token
        if line[idx][0] not in [Tokens.SPLIT_IMMEDIATE]:
            ret_line.append(line[idx])
            idx += 1
            continue

        # This is a split immediate token, scan ahead for all immediate tokens, ignoring spacing, until reaching a
        #   non-immediate token. Concatenate these into a single immediate value
        # NOTE(review): this relies on scan_for_token returning a list of indices into `line` for these arguments
        #   (and the `on_no_match` value when nothing is found) - confirm against norm_utils
        imm_inds = scan_for_token(line, type=Tokens.IMMEDIATE, ignore_type=Tokens.SPACING, stop_unmatched=True,
                                  start=idx + 1, max_matches=None, wrap=False, on_no_match=[])

        # The merged token takes the type of the first matched token. If nothing matched, nothing is added, which
        #   drops the split immediate token itself
        if len(imm_inds) > 0:
            ret_line.append((line[imm_inds[0]][0], ''.join([line[i][1] for i in imm_inds])))

        # Continue after the last merged token, or after the split immediate token itself if nothing was merged
        idx = max(imm_inds + [idx]) + 1

    return ret_line

def _check_instruction_address(self, line):
    """Scans through the current line checking for possible instruction address locations

    Args:
        line (List[Tuple[str, str]]): list of (token_type, token) tuples

    Returns:
        List[Tuple[str, str]]: a new list in which an immediate found at the start of the line, or after a
            newline/instruction start token, is replaced (along with a following colon, if one is found) by a
            single Tokens.INSTRUCTION_ADDRESS token. The immediate is only converted if `imm_to_int()` of it is
            greater than 0.
    """
    _newlines = [Tokens.NEWLINE, Tokens.INSTRUCTION_START]
    ret_line, idx = [], 0
    while idx < len(line):
        token_tup, token_type = line[idx], line[idx][0]

        # If we are at the start of a line, check for instruction address
        if idx == 0 or token_type in _newlines:
            # Start after a newline-like token, but at the token itself when this is just the first token in line
            inst_addr_inds = scan_for_token(line, type=Tokens.IMMEDIATE, ignore_type=Tokens.SPACING, stop_unmatched=True,
                                            start=(idx+1) if token_type in _newlines else idx, wrap=False, on_no_match=[], ret_list=True)

            if len(inst_addr_inds) > 0 and imm_to_int(line[inst_addr_inds[0]][1]) > 0:
                # Check for possible colon token
                inst_addr_inds += scan_for_token(line, type=Tokens.COLON, ignore_type=Tokens.SPACING, stop_unmatched=True,
                                                 start=max(inst_addr_inds)+1, wrap=False, on_no_match=[], ret_list=True)

                # Build the instruction address token. Insert any tokens before the start if present. The current
                #   token is included in those only when it is a newline-like or spacing token, otherwise the
                #   current token is the immediate itself
                for i in range(idx if token_type in _newlines + [Tokens.SPACING] else (idx + 1), min(inst_addr_inds)):
                    ret_line.append(line[i])

                # Only the matched immediate/colon strings are joined, so spacing between them is not kept
                token_tup = (Tokens.INSTRUCTION_ADDRESS, ''.join([line[i][1] for i in inst_addr_inds]))
                idx = max(inst_addr_inds)

        # Add in this token and increment the index
        ret_line.append(token_tup)
        idx += 1

    return ret_line
def handle_line(self, state):
    """Handles a single line (one string passed to the tokenizer)

    A hook for subclasses, called once per passed string after its tokens have been collected. The line may still
    contain newline tokens that were matched in the string, but no newline_tup has been added to it. The base
    implementation returns the line unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        List[Tuple[str, str]]: list of (token_type, token) tuples for this line
    """
    current_line = state['line']
    return current_line
def handle_sentence(self, state):
    """Handles an entire sentence (aggregation of all strings passed to one call of this tokenizer)

    A hook for subclasses, called once at the very end of tokenization, after any newline_tup's have been added.
    The base implementation returns the sentence unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        List[Tuple[str, str]]: the final list of tokens
    """
    full_sentence = state['sentence']
    return full_sentence
def token_branch_prediction(self, state):
    """Handles any branch_prediction tokens

    A hook for subclasses. The base implementation passes the matched token through unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        Union[str, None]: either a string token for the next token to append to line, or None to not append
            anything
    """
    matched = state['token']
    return matched
def token_immediate(self, state):
    """Handles any immediate tokens

    A hook for subclasses. The base implementation passes the matched token through unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        Union[str, None]: either a string token for the next token to append to line, or None to not append
            anything
    """
    matched = state['token']
    return matched
def token_instruction_prefix(self, state):
    """Handles any instruction_prefix tokens

    A hook for subclasses. The base implementation passes the matched token through unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        Union[str, None]: either a string token for the next token to append to line, or None to not append
            anything
    """
    matched = state['token']
    return matched
def token_memory_size(self, state):
    """Handles any memory_size tokens

    A hook for subclasses. The base implementation passes the matched token through unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        Union[str, None]: either a string token for the next token to append to line, or None to not append
            anything
    """
    matched = state['token']
    return matched
def token_opcode(self, state):
    """Handles any opcode tokens

    A hook for subclasses. The base implementation passes the matched token through unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        Union[str, None]: either a string token for the next token to append to line, or None to not append
            anything
    """
    matched = state['token']
    return matched
def token_register(self, state):
    """Handles any register tokens

    A hook for subclasses. The base implementation passes the matched token through unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        Union[str, None]: either a string token for the next token to append to line, or None to not append
            anything
    """
    matched = state['token']
    return matched
def token_disassembler_info(self, state):
    """Handles any disassembler information tokens

    A hook for subclasses. The base implementation passes the matched token through unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        Union[str, None]: either a string token for the next token to append to line, or None to not append
            anything
    """
    matched = state['token']
    return matched
def token_string_literal(self, state):
    """Handles any string literals

    A hook for subclasses. The base implementation passes the matched token through unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        Union[str, None]: either a string token for the next token to append to line, or None to not append
            anything
    """
    matched = state['token']
    return matched
def token_spacing(self, state):
    """Handles any spacing tokens

    A hook for subclasses. The base implementation passes the matched token through unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        Union[str, None]: either a string token for the next token to append to line, or None to not append
            anything
    """
    matched = state['token']
    return matched
def token_newline(self, state):
    """Handles any newline tokens

    A hook for subclasses. The base implementation passes the matched token through unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        Union[str, None]: either a string token for the next token to append to line, or None to not append
            anything
    """
    matched = state['token']
    return matched
def token_instruction_address(self, state):
    """Handles any instruction address tokens

    A hook for subclasses. The base implementation passes the matched token through unchanged.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        Union[str, None]: either a string token for the next token to append to line, or None to not append
            anything
    """
    matched = state['token']
    return matched
def token_all_symbols(self, state):
    """Handles all symbol tokens ('+', '*', '[', ']', ':')

    A hook for subclasses. The base implementation passes the matched token through unchanged for every symbol,
    colons included.

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Returns:
        Union[str, None]: either a string token for the next token to append to line, or None to not append
            anything
    """
    symbol = state['token']
    return symbol
def token_mismatch(self, state):
    """What to do when there is a token mismatch in a string

    This can be overridden by subclasses for more functionality, but defaults to raising a ``TokenMismatchError``
    with info on the mismatch (the offending text, where it starts, and the string being tokenized)

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Raises:
        TokenMismatchError: by default
    """
    bad_token = state['token']
    position = state['match'].start()
    source = repr(state['string'])
    raise TokenMismatchError("Mismatched token '%s' at index %d in string: %s" % (bad_token, position, source))
def token_unknown(self, state):
    """What to do when there is a token type that we don't know how to handle

    This can be overridden by subclasses for more functionality, but defaults to raising an ``UnknownTokenError``
    with info on the unknown token (its type and the matched text)

    Args:
        state (Dict): dictionary of current state. See :func:`~bincfg.normalization.base_tokenizer.BaseTokenizer`
            for more info

    Raises:
        UnknownTokenError: by default
    """
    type_repr = repr(state['token_type'])
    token_repr = repr(state['token'])
    raise UnknownTokenError("Unknown token type %s, token: %s" % (type_repr, token_repr))
def __call__(self, *strings, newline_tup=_USE_DEFAULT_NT, match_instruction_address=True, **kwargs):
    """Tokenizes the input. Calling the tokenizer is the same as calling its `tokenize()` method

    All arguments are forwarded unchanged to `tokenize()`.

    Args:
        strings (str): arbitrary number of strings to tokenize.
        newline_tup (Optional[Tuple[str, str]]): the tuple added after the tokens of each passed string, or None
            to not insert anything. Defaults to `self.DEFAULT_NEWLINE_TUPLE`.
        match_instruction_address (bool, optional): if True, will match instruction addresses. If there is an
            immediate value at the start of a line (IE: start of a string in `strings`, or immediately after a
            Tokens.NEWLINE or Tokens.INSTRUCTION_START [ignoring any Tokens.SPACING]), then that token will be
            converted into a Tokens.INSTRUCTION_ADDRESS token. If there is a Tokens.COLON immediately after that
            token (again, ignoring any Tokens.SPACING), then that first Tokens.COLON match will be appended to that
            Tokens.INSTRUCTION_ADDRESS token, removing any Tokens.SPACING inbetween them.

            For example, using the x86 tokenization scheme:

                - "0x1234: add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234:'), ...]
                - " 0x1234 : add rax rax" -> [(Tokens.SPACING, ' '), (Tokens.INSTRUCTION_ADDRESS, '0x1234:'), ...]
                - "0x1234 add rax rax" -> [(Tokens.INSTRUCTION_ADDRESS, '0x1234'), ...]

        kwargs (Any): extra kwargs to store in the tokenizer state, for use in child classes

    Returns:
        List[Tuple[str, str]]: list of (token_type, token) tuples
    """
    return self.tokenize(
        *strings,
        newline_tup=newline_tup,
        match_instruction_address=match_instruction_address,
        **kwargs,
    )

def __repr__(self) -> str:
    """Class name, plus the token list when it differs from the default and its repr is at most 30 characters"""
    if eq_obj(self.tokens, self.DEFAULT_TOKENS):
        shown = ''
    else:
        # Long token lists are abbreviated to keep the repr on one readable line
        full = repr(self.tokens)
        shown = '...' if len(full) > 30 else 'tokens=' + full
    return "%s(%s)" % (self.__class__.__name__, shown)

def __str__(self) -> str:
    """Just the class name"""
    return type(self).__name__

def __eq__(self, other):
    """Equal when exactly the same class, with equal tokens, case sensitivity, and token handlers"""
    if type(self) != type(other):
        return False
    return eq_obj(self.tokens, other.tokens) and self.case_sensitive == other.case_sensitive \
        and eq_obj(self.token_handlers, other.token_handlers)

def __hash__(self):
    """Hash built from the same things that __eq__ compares (using the class name for the class)"""
    identity = [type(self).__name__, self.tokens, self.case_sensitive, self.token_handlers]
    return hash_obj(identity, return_int=True)