Source code for enaml.core.lexer

#------------------------------------------------------------------------------
#  Copyright (c) 2011, Enthought, Inc.
#  All rights reserved.
#------------------------------------------------------------------------------
import os
import tokenize

import ply.lex as lex


# Get a save directory for the lex and parse tables
_lex_dir = os.path.join(os.path.dirname(__file__), 'parse_tab')
_lex_module = 'enaml.core.parse_tab.lextab'


#------------------------------------------------------------------------------
# Lexing Helpers
#------------------------------------------------------------------------------
class ParsingError(Exception):
    """ A helper class to bubble up exceptions out of the parsers
    control to be re-raised at the parsing entry point. It avoids
    problems with raise SyntaxErrors from within Ply parsing rules.

    """
    def __init__(self, exc_class, *args, **kwargs):
        self.exc_class = exc_class
        self.args = args
        self.kwargs = kwargs

    def __call__(self):
        return self.exc_class(*self.args, **self.kwargs)
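
# Illustrative note (not part of the original source): the intended use
# of ParsingError is to let it propagate out of the ply rules and then
# re-raise the real exception at the parsing entry point, roughly:
#
#     try:
#         ast = parser.parse(source, lexer=lexer)
#     except ParsingError as parse_error:
#         raise parse_error()  # calling the instance builds the exception
#
# The `parser`, `lexer`, and `source` names above are placeholders for
# whatever drives the ply machinery; only the try/except shape matters.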


def _parsing_error(klass, message, token):
    """ Create and raise a parsing error for the syntax_error and
    indentation_error functions below.

    Parameters
    ----------
    klass : Exception class
        The exception type to raise.
    
    message : string
        The message to pass to the exception.
    
    token : LexToken
        The current lextoken for the error.
    
    """
    filename = token.lexer.filename
    lexdata = token.lexer.lexdata
    lineno = token.lineno
    text = lexdata.splitlines()[lineno - 1] 
    info = (filename, lineno, 0, text)
    raise ParsingError(klass, message, info)


def syntax_error(message, token):
    """ Raise a ParsingError which will be converted into an proper
    SyntaxError during parsing.

    Parameters
    ----------
    message : string
        The message to pass to the IndentationError
    
    token : LexToken
        The current lextoken for the error.

    """
    _parsing_error(SyntaxError, message, token)


def indentation_error(message, token):
    """ Raise a ParsingError which will be converted into an proper
    IndentationError during parsing.

    Parameters
    ----------
    message : string
        The message to pass to the IndentationError
    
    token : LexToken
        The current lextoken for the error.

    """
    _parsing_error(IndentationError, message, token)
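
# Illustrative note (assumption, not in the original source): the `info`
# tuple built by _parsing_error matches the argument expected by the
# builtin SyntaxError/IndentationError constructors, so re-raising the
# built exception produces a normal compiler-style traceback, e.g.:
#
#     SyntaxError('invalid syntax', ('view.enaml', 3, 0, 'some bad line'))
#
# renders roughly as:
#
#     File "view.enaml", line 3
#       some bad line
#     SyntaxError: invalid syntax
#
# The filename and source line in this example are invented.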


#------------------------------------------------------------------------------
# Enaml Lexer
#------------------------------------------------------------------------------
class EnamlLexer(object):

    operators = (
        (r'@', 'AT'),
        (r'&', 'AMPER'),
        (r'&=', 'AMPEREQUAL'),
        (r'\^', 'CIRCUMFLEX'),
        (r'\^=', 'CIRCUMFLEXEQUAL'),
        (r':', 'COLON'),
        (r'\.', 'DOT'),
        (r'//', 'DOUBLESLASH'),
        (r'//=', 'DOUBLESLASHEQUAL'),
        (r'\*\*', 'DOUBLESTAR'),
        (r'\*\*=', 'DOUBLESTAREQUAL'),
        (r'==', 'EQEQUAL'),
        (r'=', 'EQUAL'),
        (r'>', 'GREATER'),
        (r'>=', 'GREATEREQUAL'),
        (r'<<', 'LEFTSHIFT'),
        (r'<<=', 'LEFTSHIFTEQUAL'),
        (r'<', 'LESS'),
        (r'<=', 'LESSEQUAL'),
        (r'-', 'MINUS'),
        (r'-=', 'MINUSEQUAL'),
        (r'!=', 'NOTEQUAL'),
        (r'%', 'PERCENT'),
        (r'%=', 'PERCENTEQUAL'),
        (r'\+', 'PLUS'),
        (r'\+=', 'PLUSEQUAL'),
        (r'>>', 'RIGHTSHIFT'),
        (r'>>=', 'RIGHTSHIFTEQUAL'),
        (r';', 'SEMI'),
        (r'/', 'SLASH'),
        (r'/=', 'SLASHEQUAL'),
        (r'\*', 'STAR'),
        (r'\*=', 'STAREQUAL'),
        (r'~', 'TILDE'),
        (r'\|', 'VBAR'),
        (r'\|=', 'VBAREQUAL'),
        (r'::', 'DOUBLECOLON'),
        (r'\.\.\.', 'ELLIPSIS'),
        (r':=', 'COLONEQUAL'),
    )

    tokens = (
        'COMMA',
        'DEDENT',
        'ENDMARKER',
        'INDENT',
        'LBRACE',
        'LPAR',
        'LSQB',
        'NAME',
        'NEWLINE',
        'NUMBER',
        'RBRACE',
        'RPAR',
        'RSQB',
        'STRING',
        'WS',

        # string token sentinels
        'STRING_START_SINGLE',
        'STRING_START_TRIPLE',
        'STRING_CONTINUE',
        'STRING_END',
    )

    reserved = {
        'and': 'AND',
        'as': 'AS',
        'assert': 'ASSERT',
        'break': 'BREAK',
        'class': 'CLASS',
        'continue': 'CONTINUE',
        'def': 'DEF',
        'del': 'DEL',
        'elif': 'ELIF',
        'else': 'ELSE',
        'enamldef': 'ENAMLDEF',
        'exec': 'EXEC',
        'except': 'EXCEPT',
        'finally': 'FINALLY',
        'from': 'FROM',
        'for': 'FOR',
        'global': 'GLOBAL',
        'if': 'IF',
        'import': 'IMPORT',
        'in': 'IN',
        'is': 'IS',
        'lambda': 'LAMBDA',
        'not': 'NOT',
        'or': 'OR',
        'pass': 'PASS',
        'print': 'PRINT',
        'raise': 'RAISE',
        'return': 'RETURN',
        'try': 'TRY',
        'while': 'WHILE',
        'with': 'WITH',
        'yield': 'YIELD',
    }

    tokens = (tokens +
              tuple(val[1] for val in operators) +
              tuple(reserved.values()))

    #--------------------------------------------------------------------------
    # Lexer States
    #--------------------------------------------------------------------------
    states = (
        ('SINGLEQ1', 'exclusive'),
        ('SINGLEQ2', 'exclusive'),
        ('TRIPLEQ1', 'exclusive'),
        ('TRIPLEQ2', 'exclusive'),
    )

    #--------------------------------------------------------------------------
    # Default Rules
    #--------------------------------------------------------------------------
    t_COMMA = r','
    t_NUMBER = tokenize.Number

    # Generate the token matching rules for the operators
    for tok_pattern, tok_name in operators:
        tok_name = 't_' + tok_name
        locals()[tok_name] = tok_pattern

    def t_comment(self, t):
        r'[ ]*\#[^\r\n]*'
        pass

    def t_WS(self, t):
        r' [ \t\f]+ '
        value = t.value

        # A formfeed character may be present at the start of the
        # line; it will be ignored for the indentation calculations
        # above. Formfeed characters occurring elsewhere in the
        # leading whitespace have an undefined effect (for instance,
        # they may reset the space count to zero).
        value = value.rsplit("\f", 1)[-1]

        # First, tabs are replaced (from left to right) by one to eight
        # spaces such that the total number of characters up to and
        # including the replacement is a multiple of eight (this is
        # intended to be the same rule as used by Unix). The total number
        # of spaces preceding the first non-blank character then
        # determines the line's indentation. Indentation cannot be split
        # over multiple physical lines using backslashes; the whitespace
        # up to the first backslash determines the indentation.
        pos = 0
        while True:
            pos = value.find("\t")
            if pos == -1:
                break
            n = 8 - (pos % 8)
            value = value[:pos] + " " * n + value[pos+1:]

        if self.at_line_start and self.paren_count == 0:
            return t

    # string continuation - ignored beyond the tokenizer level
    def t_escaped_newline(self, t):
        r"\\\n"
        t.type = "STRING_CONTINUE"
        # Raw strings don't escape the newline
        assert not self.is_raw, "only occurs outside of quoted strings"
        t.lexer.lineno += 1

    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = 'NEWLINE'
        if self.paren_count == 0:
            return t

    def t_LPAR(self, t):
        r'\('
        self.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        self.paren_count -= 1
        return t

    def t_LBRACE(self, t):
        r'\{'
        self.paren_count += 1
        return t

    def t_RBRACE(self, t):
        r'\}'
        self.paren_count -= 1
        return t

    def t_LSQB(self, t):
        r'\['
        self.paren_count += 1
        return t

    def t_RSQB(self, t):
        r'\]'
        self.paren_count -= 1
        return t

    #--------------------------------------------------------------------------
    # State string escapes
    #--------------------------------------------------------------------------
    def t_SINGLEQ1_SINGLEQ2_TRIPLEQ1_TRIPLEQ2_escaped(self, t):
        r"\\(.|\n)"
        t.type = "STRING_CONTINUE"
        t.lexer.lineno += t.value.count("\n")
        return t

    #--------------------------------------------------------------------------
    # TRIPLEQ1 strings
    #--------------------------------------------------------------------------
    def t_start_triple_quoted_q1_string(self, t):
        r"[uU]?[rR]?'''"
        t.lexer.push_state("TRIPLEQ1")
        t.type = "STRING_START_TRIPLE"
        if "r" in t.value or "R" in t.value:
            self.is_raw = True
        t.value = t.value.split("'", 1)[0]
        return t

    def t_TRIPLEQ1_simple(self, t):
        r"[^'\\]+"
        t.type = "STRING_CONTINUE"
        t.lexer.lineno += t.value.count("\n")
        return t

    def t_TRIPLEQ1_q1_but_not_triple(self, t):
        r"'(?!'')"
        t.type = "STRING_CONTINUE"
        return t

    def t_TRIPLEQ1_end(self, t):
        r"'''"
        t.type = "STRING_END"
        t.lexer.pop_state()
        self.is_raw = False
        return t

    # suppress PLY warning
    t_TRIPLEQ1_ignore = ''

    def t_TRIPLEQ1_error(self, t):
        syntax_error('invalid syntax', t)

    #--------------------------------------------------------------------------
    # TRIPLEQ2 strings
    #--------------------------------------------------------------------------
    def t_start_triple_quoted_q2_string(self, t):
        r'[uU]?[rR]?"""'
        t.lexer.push_state("TRIPLEQ2")
        t.type = "STRING_START_TRIPLE"
        if "r" in t.value or "R" in t.value:
            self.is_raw = True
        t.value = t.value.split('"', 1)[0]
        return t

    def t_TRIPLEQ2_simple(self, t):
        r'[^"\\]+'
        t.type = "STRING_CONTINUE"
        t.lexer.lineno += t.value.count("\n")
        return t

    def t_TRIPLEQ2_q2_but_not_triple(self, t):
        r'"(?!"")'
        t.type = "STRING_CONTINUE"
        return t

    def t_TRIPLEQ2_end(self, t):
        r'"""'
        t.type = "STRING_END"
        t.lexer.pop_state()
        self.is_raw = False
        return t

    # suppress PLY warning
    t_TRIPLEQ2_ignore = ''

    def t_TRIPLEQ2_error(self, t):
        syntax_error('invalid syntax', t)

    #--------------------------------------------------------------------------
    # SINGLEQ1 strings
    #--------------------------------------------------------------------------
    def t_start_single_quoted_q1_string(self, t):
        r"[uU]?[rR]?'"
        t.lexer.push_state("SINGLEQ1")
        t.type = "STRING_START_SINGLE"
        if "r" in t.value or "R" in t.value:
            self.is_raw = True
        t.value = t.value.split("'", 1)[0]
        return t

    def t_SINGLEQ1_simple(self, t):
        r"[^'\\\n]+"
        t.type = "STRING_CONTINUE"
        return t

    def t_SINGLEQ1_end(self, t):
        r"'"
        t.type = "STRING_END"
        t.lexer.pop_state()
        self.is_raw = False
        return t

    # suppress PLY warning
    t_SINGLEQ1_ignore = ''

    def t_SINGLEQ1_error(self, t):
        syntax_error('EOL while scanning single quoted string.', t)

    #--------------------------------------------------------------------------
    # SINGLEQ2 strings
    #--------------------------------------------------------------------------
    def t_start_single_quoted_q2_string(self, t):
        r'[uU]?[rR]?"'
        t.lexer.push_state("SINGLEQ2")
        t.type = "STRING_START_SINGLE"
        if "r" in t.value or "R" in t.value:
            self.is_raw = True
        t.value = t.value.split('"', 1)[0]
        return t

    def t_SINGLEQ2_simple(self, t):
        r'[^"\\\n]+'
        t.type = "STRING_CONTINUE"
        return t

    def t_SINGLEQ2_end(self, t):
        r'"'
        t.type = "STRING_END"
        t.lexer.pop_state()
        self.is_raw = False
        return t

    # suppress PLY warning
    t_SINGLEQ2_ignore = ''

    def t_SINGLEQ2_error(self, t):
        syntax_error('EOL while scanning single quoted string.', t)

    #--------------------------------------------------------------------------
    # Miscellaneous Token Rules
    #--------------------------------------------------------------------------
    # This is placed after the string rules so r"" is not matched as a name.
    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.reserved.get(t.value, "NAME")
        return t

    def t_error(self, t):
        syntax_error('invalid syntax', t)

    #--------------------------------------------------------------------------
    # Normal Class Items
    #--------------------------------------------------------------------------
    def __init__(self, filename='Enaml'):
        self.lexer = lex.lex(
            module=self,
            outputdir=_lex_dir,
            lextab=_lex_module,
            optimize=1,
        )
        self.token_stream = None
        self.filename = filename

        # Ply has a bit of an inconsistency when using a class as a
        # lexer instead of a module. The .lexer attribute of tokens
        # created by the ply lexer is set to that ply lexer instance,
        # but the .lexer attribute of tokens created by the ply parser
        # is set to the instance of this class. So, when the p_error
        # function is called in the parser, we can't be sure which lexer
        # will be set on the error token. Since we need the filename in
        # that function, we add it as an attribute on both lexers.
        self.lexer.filename = filename

    def input(self, txt):
        self.lexer.input(txt)
        self.next_token = self.make_token_stream().next

        # State initialization
        self.paren_count = 0
        self.is_raw = False
        self.at_line_start = False

    def token(self):
        try:
            tok = self.next_token()
            return tok
        except StopIteration:
            pass

    def dedent(self, lineno):
        # Synthesize a DEDENT Token
        tok = lex.LexToken()
        tok.type = 'DEDENT'
        tok.value = None
        tok.lineno = lineno
        tok.lexpos = -1
        tok.lexer = self.lexer
        return tok

    def indent(self, lineno):
        # Synthesize an INDENT Token.
        tok = lex.LexToken()
        tok.type = 'INDENT'
        tok.value = None
        tok.lineno = lineno
        tok.lexpos = -1
        tok.lexer = self.lexer
        return tok

    def newline(self, lineno):
        tok = lex.LexToken()
        tok.type = 'NEWLINE'
        tok.value = '\n'
        tok.lineno = lineno
        tok.lexpos = -1
        tok.lexer = self.lexer
        return tok

    def make_token_stream(self):
        token_stream = iter(self.lexer.token, None)
        token_stream = self.create_strings(token_stream)
        token_stream = self.annotate_indentation_state(token_stream)
        token_stream = self.synthesize_indentation_tokens(token_stream)
        token_stream = self.add_endmarker(token_stream)
        return token_stream

    def create_strings(self, token_stream):
        for tok in token_stream:
            if not tok.type.startswith("STRING_START_"):
                yield tok
                continue

            # This is a string start; process until string end
            start_tok = tok
            string_toks = []
            for tok in token_stream:
                if tok.type == "STRING_END":
                    break
                else:
                    assert tok.type == "STRING_CONTINUE", tok.type
                    string_toks.append(tok)
            else:
                # Reached end of input without string termination
                msg = 'EOF while scanning %s-quoted string.'
                if start_tok.type == 'STRING_START_TRIPLE':
                    msg = msg % 'triple'
                else:
                    msg = msg % 'single'
                syntax_error(msg, start_tok)

            # Parse the quoted string.
            #
            # The four combinations are:
            #  "ur" - raw_unicode_escape
            #  "u"  - unicode_escape
            #  "r"  - no need to do anything
            #  ""   - string_escape
            s = "".join(tok.value for tok in string_toks)
            quote_type = start_tok.value.lower()
            if quote_type == "":
                s = s.decode("string_escape")
            elif quote_type == "u":
                s = s.decode("unicode_escape")
            elif quote_type == "ur":
                s = s.decode("raw_unicode_escape")
            elif quote_type == "r":
                s = s
            else:
                msg = 'Unknown string quote type: %r' % quote_type
                raise AssertionError(msg)

            start_tok.type = "STRING"
            start_tok.value = s
            yield start_tok

    # Keep track of indentation state
    #
    # I implemented INDENT / DEDENT generation as a post-processing filter
    #
    # The original lex token stream contains WS and NEWLINE characters.
    # WS will only occur before any other tokens on a line.
    #
    # I have three filters. One tags tokens by adding two attributes.
    # "must_indent" is True if the token must be indented from the
    # previous code. The other is "at_line_start" which is True for WS
    # and the first non-WS/non-NEWLINE on a line. It flags the check to
    # see if the new line has changed indentation level.
    #
    # Python's syntax has three INDENT states
    #  0) no colon hence no need to indent
    #  1) "if 1: go()" - simple statements have a COLON but no need for an indent
    #  2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
    #
    # We only care about whitespace at the start of a line
    def annotate_indentation_state(self, token_stream):
        NO_INDENT = 0
        MAY_INDENT = 1
        MUST_INDENT = 2

        self.at_line_start = at_line_start = True
        indent = NO_INDENT

        for token in token_stream:
            token.at_line_start = at_line_start

            # The double colon serves a double purpose: it appears in
            # slice operations and also as the notification operator.
            # In the case of a slice operation, newline continuation is
            # already allowed by suppressing NEWLINE tokens in a
            # multiline expression. So, we can treat double colon the
            # same as colon here for handling the logic surrounding
            # indentation state.
if token.type in ("COLON", "DOUBLECOLON"): at_line_start = False indent = MAY_INDENT token.must_indent = False elif token.type == "NEWLINE": at_line_start = True if indent == MAY_INDENT: indent = MUST_INDENT token.must_indent = False elif token.type == "WS": assert token.at_line_start == True at_line_start = True token.must_indent = False else: # A real token; only indent after COLON NEWLINE if indent == MUST_INDENT: token.must_indent = True else: token.must_indent = False at_line_start = False indent = NO_INDENT yield token self.at_line_start = at_line_start def synthesize_indentation_tokens(self, token_stream): # A stack of indentation levels; will never pop item 0 levels = [0] depth = 0 prev_was_ws = False # In case the token stream is empty for a completely # empty file. token = None for token in token_stream: # WS only occurs at the start of the line # There may be WS followed by NEWLINE so # only track the depth here. Don't indent/dedent # until there's something real. if token.type == 'WS': assert depth == 0 depth = len(token.value) prev_was_ws = True # WS tokens are never passed to the parser continue if token.type == 'NEWLINE': depth = 0 if prev_was_ws or token.at_line_start: # ignore blank lines continue # pass the other cases on through yield token continue # then it must be a real token (not WS, not NEWLINE) # which can affect the indentation level prev_was_ws = False if token.must_indent: # The current depth must be larger than the previous level if not (depth > levels[-1]): indentation_error('expected an indented block', token) levels.append(depth) yield self.indent(token.lineno) elif token.at_line_start: # Must be on the same level or one of the previous levels if depth == levels[-1]: # At the same level pass elif depth > levels[-1]: # indentation increase but not in new block indentation_error('unexpected indent', token) else: # Back up; but only if it matches a previous level try: i = levels.index(depth) except ValueError: msg = ('unindent does not match any outer level ' 'of indentation.') indentation_error(msg, token) for _ in range(i + 1, len(levels)): yield self.dedent(token.lineno) levels.pop() yield token # If the current token is WS (which is only emitted at the start # of a line), then the token before that was a newline unless # we're on line number 1. If that's the case, then we don't # need another newline token. if token is None: yield self.newline(-1) elif token.type != 'NEWLINE': if token.type != 'WS' or token.lineno == 1: yield self.newline(-1) # Must dedent any remaining levels if len(levels) > 1: assert token is not None for _ in range(1, len(levels)): yield self.dedent(token.lineno) def add_endmarker(self, token_stream): for tok in token_stream: yield tok end_marker = lex.LexToken() end_marker.type = 'ENDMARKER' end_marker.value = None end_marker.lineno = -1 end_marker.lexpos = -1 end_marker.lexer = self.lexer yield end_marker