"""A Lark Transformer for transforming a Lark parse tree into a Python dict""" import re import sys from collections import namedtuple from typing import List, Dict, Any from lark.tree import Meta from lark.visitors import Transformer, Discard, _DiscardType, v_args HEREDOC_PATTERN = re.compile(r"<<([a-zA-Z][a-zA-Z0-9._-]+)\n([\s\S]*)\1", re.S) HEREDOC_TRIM_PATTERN = re.compile(r"<<-([a-zA-Z][a-zA-Z0-9._-]+)\n([\s\S]*)\1", re.S) START_LINE = "__start_line__" END_LINE = "__end_line__" Attribute = namedtuple("Attribute", ("key", "value")) # pylint: disable=missing-function-docstring,unused-argument class DictTransformer(Transformer): """Takes a syntax tree generated by the parser and transforms it to a dict. """ with_meta: bool def __init__(self, with_meta: bool = False): """ :param with_meta: If set to true then adds `__start_line__` and `__end_line__` parameters to the output dict. Default to false. """ self.with_meta = with_meta super().__init__() def float_lit(self, args: List) -> float: return float("".join([str(arg) for arg in args])) def int_lit(self, args: List) -> int: return int("".join([str(arg) for arg in args])) def expr_term(self, args: List) -> Any: args = self.strip_new_line_tokens(args) # if args[0] == "true": return True if args[0] == "false": return False if args[0] == "null": return None # if the expression starts with a paren then unwrap it if args[0] == "(": return args[1] # otherwise return the value itself return args[0] def index_expr_term(self, args: List) -> str: args = self.strip_new_line_tokens(args) return f"{args[0]}{args[1]}" def index(self, args: List) -> str: args = self.strip_new_line_tokens(args) return f"[{args[0]}]" def get_attr_expr_term(self, args: List) -> str: return f"{args[0]}{args[1]}" def get_attr(self, args: List) -> str: return f".{args[0]}" def attr_splat_expr_term(self, args: List) -> str: return f"{args[0]}{args[1]}" def attr_splat(self, args: List) -> str: args_str = "".join(str(arg) for arg in args) return f".*{args_str}" def full_splat_expr_term(self, args: List) -> str: return f"{args[0]}{args[1]}" def full_splat(self, args: List) -> str: args_str = "".join(str(arg) for arg in args) return f"[*]{args_str}" def tuple(self, args: List) -> List: return [self.to_string_dollar(arg) for arg in self.strip_new_line_tokens(args)] def object_elem(self, args: List) -> Dict: # This returns a dict with a single key/value pair to make it easier to merge these # into a bigger dict that is returned by the "object" function key = self.strip_quotes(args[0]) value = self.to_string_dollar(args[1]) return {key: value} def object(self, args: List) -> Dict: args = self.strip_new_line_tokens(args) result: Dict[str, Any] = {} for arg in args: result.update(arg) return result def function_call(self, args: List) -> str: args = self.strip_new_line_tokens(args) args_str = "" if len(args) > 1: args_str = ", ".join([str(arg) for arg in args[1] if arg is not Discard]) return f"{args[0]}({args_str})" def arguments(self, args: List) -> List: return args def new_line_and_or_comma(self, args: List) -> _DiscardType: return Discard @v_args(meta=True) def block(self, meta: Meta, args: List) -> Dict: *block_labels, block_body = args result: Dict[str, Any] = block_body if self.with_meta: result.update( { START_LINE: meta.line, END_LINE: meta.end_line, } ) # create nested dict. i.e. {label1: {label2: {labelN: result}}} for label in reversed(block_labels): label_str = self.strip_quotes(label) result = {label_str: result} return result def attribute(self, args: List) -> Attribute: key = str(args[0]) if key.startswith('"') and key.endswith('"'): key = key[1:-1] value = self.to_string_dollar(args[1]) return Attribute(key, value) def conditional(self, args: List) -> str: args = self.strip_new_line_tokens(args) return f"{args[0]} ? {args[1]} : {args[2]}" def binary_op(self, args: List) -> str: return " ".join([str(arg) for arg in args]) def unary_op(self, args: List) -> str: return "".join([str(arg) for arg in args]) def binary_term(self, args: List) -> str: args = self.strip_new_line_tokens(args) return " ".join([str(arg) for arg in args]) def body(self, args: List) -> Dict[str, List]: # See https://github.com/hashicorp/hcl/blob/main/hclsyntax/spec.md#bodies # --- # A body is a collection of associated attributes and blocks. # # An attribute definition assigns a value to a particular attribute # name within a body. Each distinct attribute name may be defined no # more than once within a single body. # # A block creates a child body that is annotated with a block type and # zero or more block labels. Blocks create a structural hierarchy which # can be interpreted by the calling application. # --- # # There can be more than one child body with the same block type and # labels. This means that all blocks (even when there is only one) # should be transformed into lists of blocks. args = self.strip_new_line_tokens(args) attributes = set() result: Dict[str, Any] = {} for arg in args: if isinstance(arg, Attribute): if arg.key in result: raise RuntimeError(f"{arg.key} already defined") result[arg.key] = arg.value attributes.add(arg.key) else: # This is a block. for key, value in arg.items(): key = str(key) if key in result: if key in attributes: raise RuntimeError(f"{key} already defined") result[key].append(value) else: result[key] = [value] return result def start(self, args: List) -> Dict: args = self.strip_new_line_tokens(args) return args[0] def binary_operator(self, args: List) -> str: return str(args[0]) def heredoc_template(self, args: List) -> str: match = HEREDOC_PATTERN.match(str(args[0])) if not match: raise RuntimeError(f"Invalid Heredoc token: {args[0]}") trim_chars = "\n\t " return f'"{match.group(2).rstrip(trim_chars)}"' def heredoc_template_trim(self, args: List) -> str: # See https://github.com/hashicorp/hcl2/blob/master/hcl/hclsyntax/spec.md#template-expressions # This is a special version of heredocs that are declared with "<<-" # This will calculate the minimum number of leading spaces in each line of a heredoc # and then remove that number of spaces from each line match = HEREDOC_TRIM_PATTERN.match(str(args[0])) if not match: raise RuntimeError(f"Invalid Heredoc token: {args[0]}") trim_chars = "\n\t " text = match.group(2).rstrip(trim_chars) lines = text.split("\n") # calculate the min number of leading spaces in each line min_spaces = sys.maxsize for line in lines: leading_spaces = len(line) - len(line.lstrip(" ")) min_spaces = min(min_spaces, leading_spaces) # trim off that number of leading spaces from each line lines = [line[min_spaces:] for line in lines] return '"%s"' % "\n".join(lines) def new_line_or_comment(self, args: List) -> _DiscardType: return Discard def for_tuple_expr(self, args: List) -> str: args = self.strip_new_line_tokens(args) for_expr = " ".join([str(arg) for arg in args[1:-1]]) return f"[{for_expr}]" def for_intro(self, args: List) -> str: args = self.strip_new_line_tokens(args) return " ".join([str(arg) for arg in args]) def for_cond(self, args: List) -> str: args = self.strip_new_line_tokens(args) return " ".join([str(arg) for arg in args]) def for_object_expr(self, args: List) -> str: args = self.strip_new_line_tokens(args) for_expr = " ".join([str(arg) for arg in args[1:-1]]) # doubled curly braces stands for inlining the braces # and the third pair of braces is for the interpolation # e.g. f"{2 + 2} {{2 + 2}}" == "4 {2 + 2}" return f"{{{for_expr}}}" def strip_new_line_tokens(self, args: List) -> List: """ Remove new line and Discard tokens. The parser will sometimes include these in the tree so we need to strip them out here """ return [arg for arg in args if arg != "\n" and arg is not Discard] def to_string_dollar(self, value: Any) -> Any: """Wrap a string in ${ and }""" if isinstance(value, str): if value.startswith('"') and value.endswith('"'): return str(value)[1:-1] return f"${{{value}}}" return value def strip_quotes(self, value: Any) -> Any: """Remove quote characters from the start and end of a string""" if isinstance(value, str): if value.startswith('"') and value.endswith('"'): return str(value)[1:-1] return value def identifier(self, value: Any) -> Any: # Making identifier a token by capitalizing it to IDENTIFIER # seems to return a token object instead of the str # So treat it like a regular rule # In this case we just convert the whole thing to a string return str(value[0])