"""
Provides function(s) to perform normalization techniques on CFG's
"""
import copy
import numpy as np
from ..utils import progressbar, update_memcfg_tokens, AtomicTokenDict
from .norm_utils import get_normalizer
from ..utils.type_utils import *
import bincfg
[docs]
def normalize_cfg_data(cfg_data: 'Union[CFGInputDataType, bincfg.CFG, bincfg.MemCFG, bincfg.CFGDataset, bincfg.MemCFGDataset, Iterable]',
                       normalizer: 'Union[str, NormalizerType]', inplace: 'bool' = False, using_tokens: 'Optional[Union[dict[str, int], AtomicTokenDict]]' = None,
                       force_renormalize: 'bool' = False, convert_to_mem: 'bool' = False, conv_keep_mem_addrs: 'bool' = True,
                       unpack_cfgs: 'bool' = False, progress: 'bool' = False) -> 'Union[bincfg.CFG, bincfg.MemCFG, bincfg.CFGDataset, bincfg.MemCFGDataset, list, tuple]':
    """Normalizes some cfg data.

    Args:
        cfg_data (Union[CFGInputDataType, CFG, MemCFG, CFGDataset, MemCFGDataset, Iterable]): some cfg data. Can be
            either: str, CFG, MemCFG, CFGDataset, MemCFGDataset, or iterable of previously mentioned types. Will
            return the same type as that passed, unless that particular input was a string, in which case a CFG will
            be returned. Iterables are always returned as a new list (the passed iterable itself is not modified).
        normalizer (Union[str, Normalizer]): the normalizer to use. Can be either a ``Normalizer`` class with a
            `.normalize()` method, or a string to use a built-in normalizer. See
            :func:`bincfg.normalization.get_normalizer` for acceptable strings.
        inplace (bool): if True, will modify data in-place instead of creating new objects. Defaults to False.

            NOTE: if `inplace=False`, and the incoming data has already been normalized with the passed `normalizer`,
            then the original cfg will be returned, NOT a copy.
        using_tokens (Optional[Union[dict[str, int], AtomicTokenDict]]): only used for ``MemCFG``'s. If not None, then
            a dictionary mapping string tokens to integer token values that will be used as any ``MemCFG``'s tokens.
            Defaults to None.
        force_renormalize (bool): by default, this method will only normalize cfg's whose .normalizer != to the passed
            normalizer. However if `force_renormalize=True`, then all cfg's will be renormalized even if they have been
            previously normalized with the same normalizer (this includes the cfg's contained in datasets).
            Defaults to False.
        convert_to_mem (bool): if True, will convert all ``CFG``'s and ``CFGDatasets`` to their memory-efficient
            versions after normalizing. Defaults to False.
        conv_keep_mem_addrs (bool): if True, will pass `keep_memory_addresses=True` when converting CFG's into
            MemCFG's. Defaults to True.
        unpack_cfgs (bool): by default, this method will return the same types that were passed to be normalized.
            However if `unpack_cfgs=True`, then instead, a list of all cfgs unpacked (EG: unpacked from lists, and
            pulled out of datasets) will be returned. Defaults to False.

            NOTE: if only a single ``CFG``/``MemCFG`` was passed, a list will still be returned of only that single
            element.
        progress (bool): if True, will show a progressbar for normalizations of multiple cfg's. Defaults to False.

    Raises:
        TypeError: if a single (non-iterable-path) piece of data is not a CFG, MemCFG, CFGDataset, or MemCFGDataset

    Returns:
        Union[CFG, MemCFG, CFGDataset, MemCFGDataset, List, Tuple]: the normalized data
    """
    normalizer = get_normalizer(normalizer)

    # If input is not a str/CFG/MemCFG/dataset, then it should be an iterable: normalize every contained cfg one by
    #   one, then (optionally) pack them back into the shapes they arrived in
    if not isinstance(cfg_data, (str, bincfg.CFG, bincfg.MemCFG, bincfg.CFGDataset, bincfg.MemCFGDataset)):
        cfg_data = list(cfg_data)  # Copy the element references so we don't modify the original iterable, and convert to list
        cfgs, cfg_parts = _unpack_cfgs(cfg_data)

        # Normalize all the cfg's. Every user option that affects a single cfg must be forwarded here, including
        #   conv_keep_mem_addrs (otherwise the default would silently override the caller's choice)
        for i, cfg in enumerate(progressbar(cfgs, progress=progress)):
            cfgs[i] = normalize_cfg_data(cfg, normalizer=normalizer, inplace=inplace, using_tokens=using_tokens,
                                         force_renormalize=force_renormalize, convert_to_mem=convert_to_mem,
                                         conv_keep_mem_addrs=conv_keep_mem_addrs, progress=False)

        # The caller only wants the flat list of cfgs
        if unpack_cfgs:
            return cfgs

        # Now that all our cfg's have been normalized, sort them back to where they belong
        for i, (start, end) in enumerate(cfg_parts):
            # Single elements (strings become CFG's) are replaced by their one normalized cfg
            if isinstance(cfg_data[i], (str, bincfg.CFG, bincfg.MemCFG)):
                cfg_data[i] = cfgs[start]

            # Otherwise make it into a dataset
            else:
                if isinstance(cfg_data[i], bincfg.CFGDataset) and convert_to_mem:
                    cfg_data[i] = bincfg.MemCFGDataset()
                elif not inplace:  # Create a new object if not inplace
                    cfg_data[i] = cfg_data[i].__class__()

                cfg_data[i].cfgs = cfgs[start:end]
                cfg_data[i].normalizer = normalizer

        return cfg_data

    # Check for a string and convert to a CFG right away. We can turn on inplace if so since the CFG is brand new
    if isinstance(cfg_data, str):
        cfg_data = bincfg.CFG(cfg_data)
        inplace = True

    # Check if we even need to normalize
    if cfg_data.normalizer is not None and normalizer == cfg_data.normalizer and not force_renormalize:

        # Check if we need to convert to memcfg's
        if convert_to_mem:
            if isinstance(cfg_data, bincfg.CFG):
                cfg_data = bincfg.MemCFG(cfg_data, using_tokens=using_tokens, keep_memory_addresses=conv_keep_mem_addrs)
            elif isinstance(cfg_data, bincfg.CFGDataset):
                cfg_data = bincfg.MemCFGDataset(cfg_data, using_tokens=using_tokens, keep_memory_addresses=conv_keep_mem_addrs)

            # Otherwise it is already a MemCFG/MemCFGDataset, and we need to check if we should update the tokens
            elif using_tokens is not None:
                update_memcfg_tokens(cfg_data, using_tokens)

        # Otherwise, check if this is already a memcfg/memcfgdataset and we need to update the tokens
        elif using_tokens is not None:
            if isinstance(cfg_data, (bincfg.MemCFG, bincfg.MemCFGDataset)):
                update_memcfg_tokens(cfg_data, using_tokens)

        # Unpack cfg's if needed
        if unpack_cfgs:
            if isinstance(cfg_data, (bincfg.CFGDataset, bincfg.MemCFGDataset)):
                cfg_data = cfg_data.cfgs
            else:
                cfg_data = [cfg_data]

        return cfg_data

    # Create return object depending on input type and inplace: reuse the input when inplace, deep-copy single
    #   cfg's, and start from an empty dataset of the same class for datasets
    if inplace:
        ret = cfg_data
    elif isinstance(cfg_data, (bincfg.CFG, bincfg.MemCFG)):
        ret = copy.deepcopy(cfg_data)
    else:
        ret = cfg_data.__class__()
    ret.normalizer = normalizer

    # Single CFG's and MemCFG's will be normalized here
    if isinstance(cfg_data, bincfg.CFG):
        for block in ret.blocks:
            block.asm_lines = normalizer.normalize(*block.asm_lines, cfg=ret, block=block)

        # Convert to MemCFG if needed
        if convert_to_mem:
            ret = bincfg.MemCFG(ret, inplace=True, using_tokens=using_tokens, keep_memory_addresses=conv_keep_mem_addrs)

    elif isinstance(cfg_data, bincfg.MemCFG):
        # Need to recompute the asm_lines, block_asm_idx, and tokens
        # Need to start new_block_asm_idx with a 0
        norm_lines, new_block_asm_idx = [], [0]
        inv_tokens = {v: k for k, v in ret.tokens.items()}

        # Go through each block re-normalizing its lines, and keeping track of the block_asm_idx
        for block_idx in range(ret.num_blocks):
            norm_lines += normalizer.normalize(*[inv_tokens[t] for t in ret.get_block_asm_lines(block_idx)], cfg=ret, block=block_idx)
            new_block_asm_idx.append(len(norm_lines))

        # Update the new tokens if needed, allowing for use of atomic tokens
        if isinstance(using_tokens, AtomicTokenDict):
            using_tokens.addtokens(*norm_lines)
            new_tokens = using_tokens
        else:
            new_tokens = {} if using_tokens is None else using_tokens
            for t in norm_lines:
                new_tokens.setdefault(t, len(new_tokens))

        # Convert to integer tokens
        new_asm_lines = [new_tokens[t] for t in norm_lines]

        # Set the new data in ret
        ret.asm_lines, ret.block_asm_idx, ret.tokens = np.array(new_asm_lines), np.array(new_block_asm_idx), new_tokens

    # Datasets will have their cfg's normalized as a list, then will need their tokens recounted
    elif isinstance(cfg_data, (bincfg.CFGDataset, bincfg.MemCFGDataset)):
        # Tokens only apply to MemCFGDataset's; those always get a (possibly fresh) shared token dictionary
        using_tokens = None if isinstance(cfg_data, bincfg.CFGDataset) else using_tokens if using_tokens is not None else {}

        # force_renormalize must be forwarded, otherwise member cfg's that already carry this normalizer would be
        #   skipped even though the caller explicitly asked for a renormalization
        ret.cfgs = normalize_cfg_data(cfg_data.cfgs, normalizer=normalizer, inplace=inplace, using_tokens=using_tokens,
                                      force_renormalize=force_renormalize, progress=progress)

        if isinstance(cfg_data, bincfg.MemCFGDataset):
            ret.tokens = using_tokens

        # Convert to MemCFGDataset if needed
        if convert_to_mem and isinstance(cfg_data, bincfg.CFGDataset):
            ret = bincfg.MemCFGDataset(ret, normalizer=normalizer, tokens=ret.tokens, keep_memory_addresses=conv_keep_mem_addrs)

    else:
        raise TypeError("Got an unknown type: '%s'" % type(cfg_data).__name__)

    # Unpack cfg's if needed
    if unpack_cfgs:
        return [ret] if isinstance(ret, (bincfg.CFG, bincfg.MemCFG)) else ret.cfgs
    return ret
def _unpack_cfgs(cfg_data: 'Union[CFGInputDataType, bincfg.CFG, bincfg.MemCFG, bincfg.CFGDataset, bincfg.MemCFGDataset, Iterable]') \
        -> 'tuple[list[Union[bincfg.CFG, bincfg.MemCFG]], list[tuple[int, int]]]':
    """Flattens cfg data into one list of cfgs, remembering which slice of that list each input element produced

    Strings are converted into CFG's along the way.

    Args:
        cfg_data (Union[str, CFG, MemCFG, CFGDataset, MemCFGDataset, Iterable]): some cfg data. Can be either: str,
            CFG, MemCFG, CFGDataset, MemCFGDataset, or iterable of previously mentioned types.

    Returns:
        tuple[list[Union[bincfg.CFG, bincfg.MemCFG]], list[tuple[int, int]]]: a 2-tuple of `(cfgs, type_info)`, where
            `cfgs` is a list of CFG's/MemCFG's containing all of the cfgs that exist in cfg_data in order, and
            `type_info` is a list (one entry per element of cfg_data) of 2-tuples of (start_idx, end_idx) where:
            `start_idx` is the integer start index of this chunk of data in the `cfgs` list, and `end_idx` is the
            (exclusive) end index of this chunk of data
    """
    single_types = (bincfg.CFG, bincfg.MemCFG)
    dataset_types = (bincfg.CFGDataset, bincfg.MemCFGDataset)

    # A lone str/cfg/dataset is treated as an iterable of exactly one element
    if isinstance(cfg_data, (str,) + single_types + dataset_types):
        elements = [cfg_data]
    else:
        elements = cfg_data

    flat, spans = [], []
    for element in elements:
        # This element's chunk starts wherever the flat list currently ends
        begin = len(flat)

        if isinstance(element, str):
            flat.append(bincfg.CFG(element))
        elif isinstance(element, single_types):
            flat.append(element)
        elif isinstance(element, dataset_types):
            # Datasets contribute every cfg they contain
            flat.extend(element.cfgs)

        # Elements of any other type contribute nothing, and so get an empty (begin, begin) span
        spans.append((begin, len(flat)))

    return flat, spans