
#
# Jasy - Web Tooling Framework
# Copyright 2010-2012 Zynga Inc.
# Copyright 2013-2014 Sebastian Werner
#

#
# License: MPL 1.1/GPL 2.0/LGPL 2.1
# Authors:
#   - Brendan Eich <brendan@mozilla.org> (Original JavaScript) (2004-2010)
#   - Sebastian Werner <info@sebastian-werner.net> (Python Port) (2010)
#

import re
import copy

import jasy.script.tokenize.Lang as Lang
import jasy.script.api.Comment as Comment
import jasy.core.Console as Console


# Operator and punctuator mapping from token to tree node type name.
# NB: because the lexer doesn't backtrack, all token prefixes must themselves
# be valid tokens (e.g. !== is acceptable because its prefixes are the valid
# tokens != and !).
operatorNames = {
    '<'   : 'lt',
    '>'   : 'gt',
    '<='  : 'le',
    '>='  : 'ge',
    '!='  : 'ne',
    '!'   : 'not',
    '=='  : 'eq',
    '===' : 'strict_eq',
    '!==' : 'strict_ne',

    '>>'  : 'rsh',
    '<<'  : 'lsh',
    '>>>' : 'ursh',

    '+'   : 'plus',
    '*'   : 'mul',
    '-'   : 'minus',
    '/'   : 'div',
    '%'   : 'mod',

    ','   : 'comma',
    ';'   : 'semicolon',
    ':'   : 'colon',
    '='   : 'assign',
    '?'   : 'hook',

    '&&'  : 'and',
    '||'  : 'or',

    '++'  : 'increment',
    '--'  : 'decrement',

    ')'   : 'right_paren',
    '('   : 'left_paren',
    '['   : 'left_bracket',
    ']'   : 'right_bracket',
    '{'   : 'left_curly',
    '}'   : 'right_curly',

    '&'   : 'bitwise_and',
    '^'   : 'bitwise_xor',
    '|'   : 'bitwise_or',
    '~'   : 'bitwise_not'
}


# Assignment operators
assignOperators = ["|", "^", "&", "<<", ">>", ">>>", "+", "-", "*", "/", "%"]
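# Example: lexOp() below folds a trailing "=" after any of these into a single
# compound-assignment token, so "<<=" yields type "assign" with assignOp "lsh".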




#
# Classes
#

class Token:
    __slots__ = ["type", "start", "line", "assignOp", "end", "value"]


class ParseError(Exception):
    def __init__(self, message, fileId, line):
        Exception.__init__(self, "Syntax error: %s\n%s:%s" % (message, fileId, line))


class Tokenizer(object):
    def __init__(self, source, fileId="", line=1):
        # source: JavaScript source
        # fileId: Filename (for debugging purposes)
        # line: Line number (for debugging purposes)

        self.cursor = 0
        self.source = str(source)
        self.tokens = {}
        self.tokenIndex = 0
        self.lookahead = 0
        self.scanNewlines = False
        self.fileId = fileId
        self.line = line
        self.comments = []

    input_ = property(lambda self: self.source[self.cursor:])
    token = property(lambda self: self.tokens.get(self.tokenIndex))
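
    # Note: self.tokens acts as a four-slot ring buffer. get(), unget() and
    # peek() all mask token indices with "& 3", which is why unget() panics
    # once four tokens of lookahead have accumulated.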

    def done(self):
        # We need to set scanOperand to true here because the first thing
        # might be a regexp.
        return self.peek(True) == "end"

    def match(self, tokenType, scanOperand=False):
        return self.get(scanOperand) == tokenType or self.unget()

    def mustMatch(self, tokenType):
        if not self.match(tokenType):
            raise ParseError("Missing " + tokenType, self.fileId, self.line)

        return self.token

    def peek(self, scanOperand=False):
        if self.lookahead:
            next = self.tokens.get((self.tokenIndex + self.lookahead) & 3)
            if self.scanNewlines and (getattr(next, "line", None) != getattr(self, "line", None)):
                tokenType = "newline"
            else:
                tokenType = getattr(next, "type", None)
        else:
            tokenType = self.get(scanOperand)
            self.unget()

        return tokenType

    def peekOnSameLine(self, scanOperand=False):
        self.scanNewlines = True
        tokenType = self.peek(scanOperand)
        self.scanNewlines = False

        return tokenType

    def getComments(self):
        if self.comments:
            comments = self.comments
            self.comments = []
            return comments

        return None

    def skip(self):
        """Eats comments and whitespace."""

        input = self.source
        startLine = self.line

        # Whether this is the first call, as happens when parsing of a file
        # starts (eats leading comments/white space)
        startOfFile = self.cursor == 0

        indent = ""

        while True:
            if len(input) > self.cursor:
                ch = input[self.cursor]
            else:
                return

            self.cursor += 1

            if len(input) > self.cursor:
                next = input[self.cursor]
            else:
                next = None

            if ch == "\n" and not self.scanNewlines:
                self.line += 1
                indent = ""

            elif ch == "/" and next == "*":
                self.cursor += 1
                text = "/*"
                inline = startLine == self.line and startLine > 1
                commentStartLine = self.line

                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # comment for maybe multiple following lines of code, but
                    # not that important (no visual white space divider)
                    mode = "block"

                while True:
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated comment", self.fileId, self.line)

                    if ch == "*":
                        next = input[self.cursor]
                        if next == "/":
                            text += "*/"
                            self.cursor += 1
                            break

                    elif ch == "\n":
                        self.line += 1

                    text += ch

                # Filter escaping on slash-star combinations in comment text
                text = text.replace("*\\/", "*/")

                try:
                    self.comments.append(Comment.Comment(text, mode, commentStartLine, indent, self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            elif ch == "/" and next == "/":
                self.cursor += 1
                text = "//"

                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # comment for maybe multiple following lines of code, but
                    # not that important (no visual white space divider)
                    mode = "block"

                while True:
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        # end of file etc.
                        break

                    if ch == "\n":
                        self.line += 1
                        break

                    text += ch

                try:
                    self.comments.append(Comment.Comment(text, mode, self.line - 1, "", self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            # check for whitespace, also for special cases like 0xA0
            elif ch in "\xA0 \t":
                indent += ch

            else:
                self.cursor -= 1
                return
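
    # The three comment modes assigned in skip(), illustrated with a sketch
    # (the sample statements are made up):
    #
    #     foo();  // inline  - starts on the same line as preceding code
    #
    #     // section - separated from the previous code by a blank line
    #     bar();
    #     // block - directly above the following code, no blank line before
    #     baz();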

    # Lexes the exponential part of a number, if present. Returns True if an
    # exponential part was found.

    def lexExponent(self):
        input = self.source
        next = input[self.cursor]

        if next == "e" or next == "E":
            self.cursor += 1
            ch = input[self.cursor]
            self.cursor += 1

            if ch == "+" or ch == "-":
                ch = input[self.cursor]
                self.cursor += 1

            if ch < "0" or ch > "9":
                raise ParseError("Missing exponent", self.fileId, self.line)

            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            return True

        return False

    def lexZeroNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        ch = input[self.cursor]
        self.cursor += 1

        if ch == ".":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            self.lexExponent()
            token.value = input[token.start:self.cursor]

        elif ch == "x" or ch == "X":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not ((ch >= "0" and ch <= "9") or (ch >= "a" and ch <= "f") or (ch >= "A" and ch <= "F")):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        elif ch >= "0" and ch <= "7":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "7"):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        else:
            self.cursor -= 1
            self.lexExponent()  # 0E1, &c.
            token.value = 0

    def lexNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        floating = False
        while True:
            ch = input[self.cursor]
            self.cursor += 1

            if ch == "." and not floating:
                floating = True
                ch = input[self.cursor]
                self.cursor += 1

            if not (ch >= "0" and ch <= "9"):
                break

        self.cursor -= 1

        exponent = self.lexExponent()
        segment = input[token.start:self.cursor]

        # Protect float or exponent numbers
        if floating or exponent:
            token.value = segment
        else:
            token.value = int(segment)
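
    # Example (illustration only): for the input "42", lexNumber() stores the
    # int 42, while "1.5" or "1e3" stay as source strings - see the
    # float/exponent protection above.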

    def lexDot(self, ch):
        token = self.token
        input = self.source
        next = input[self.cursor]

        if next >= "0" and next <= "9":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            self.lexExponent()

            token.type = "number"
            token.value = input[token.start:self.cursor]

        else:
            token.type = "dot"

    def lexString(self, ch):
        token = self.token
        input = self.source
        token.type = "string"

        hasEscapes = False
        delim = ch
        ch = input[self.cursor]
        self.cursor += 1

        while ch != delim:
            if ch == "\\":
                hasEscapes = True
                self.cursor += 1

            ch = input[self.cursor]
            self.cursor += 1

        if hasEscapes:
            token.value = eval(input[token.start:self.cursor])
        else:
            token.value = input[token.start + 1:self.cursor - 1]
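
    # Note: when a string contains escapes, the raw source slice (including
    # its quotes) is passed through Python's eval() so that Python decodes
    # the escape sequences. This assumes the JavaScript escapes in use carry
    # the same meaning in Python string literals.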

    def lexRegExp(self, ch):
        token = self.token
        input = self.source
        token.type = "regexp"

        while True:
            try:
                ch = input[self.cursor]
                self.cursor += 1
            except IndexError:
                raise ParseError("Unterminated regex", self.fileId, self.line)

            if ch == "\\":
                self.cursor += 1

            elif ch == "[":
                while True:
                    if ch == "\\":
                        self.cursor += 1

                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated character class", self.fileId, self.line)

                    if ch == "]":
                        break

            if ch == "/":
                break

        while True:
            ch = input[self.cursor]
            self.cursor += 1
            if not (ch >= "a" and ch <= "z"):
                break

        self.cursor -= 1
        token.value = input[token.start:self.cursor]

    def lexOp(self, ch):
        token = self.token
        input = self.source

        op = ch
        while True:
            try:
                next = input[self.cursor]
            except IndexError:
                break

            if (op + next) in operatorNames:
                self.cursor += 1
                op += next
            else:
                break

        try:
            next = input[self.cursor]
        except IndexError:
            next = None

        if next == "=" and op in assignOperators:
            self.cursor += 1
            token.type = "assign"
            token.assignOp = operatorNames[op]
            op += "="
        else:
            token.type = operatorNames[op]
            token.assignOp = None

    # FIXME: Unicode escape sequences
    # FIXME: Unicode identifiers

    def lexIdent(self, ch):
        token = self.token
        input = self.source

        try:
            while True:
                ch = input[self.cursor]
                self.cursor += 1

                if not ((ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or (ch >= "0" and ch <= "9") or ch == "$" or ch == "_"):
                    break

        except IndexError:
            self.cursor += 1

        # Put the non-word character back.
        self.cursor -= 1

        identifier = input[token.start:self.cursor]

        if identifier in Lang.keywords:
            token.type = identifier
        else:
            token.type = "identifier"
            token.value = identifier

    def get(self, scanOperand=False):
        """
        Consumes input *only* if there is no lookahead.
        Dispatches to the appropriate lexing function depending on the input.
        """
        while self.lookahead:
            self.lookahead -= 1
            self.tokenIndex = (self.tokenIndex + 1) & 3
            token = self.tokens[self.tokenIndex]
            if token.type != "newline" or self.scanNewlines:
                return token.type

        self.skip()

        self.tokenIndex = (self.tokenIndex + 1) & 3
        self.tokens[self.tokenIndex] = token = Token()

        token.start = self.cursor
        token.line = self.line

        input = self.source
        if self.cursor == len(input):
            token.end = token.start
            token.type = "end"
            return token.type

        ch = input[self.cursor]
        self.cursor += 1

        if (ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or ch == "$" or ch == "_":
            self.lexIdent(ch)
        elif scanOperand and ch == "/":
            self.lexRegExp(ch)
        elif ch == ".":
            self.lexDot(ch)
        elif self.scanNewlines and ch == "\n":
            token.type = "newline"
            self.line += 1
        elif ch in operatorNames:
            self.lexOp(ch)
        elif ch >= "1" and ch <= "9":
            self.lexNumber(ch)
        elif ch == "0":
            self.lexZeroNumber(ch)
        elif ch == '"' or ch == "'":
            self.lexString(ch)
        else:
            raise ParseError("Illegal token: %s (Code: %s)" % (ch, ord(ch)), self.fileId, self.line)

        token.end = self.cursor
        return token.type

    def unget(self):
        """Match depends on unget returning undefined."""

        self.lookahead += 1

        if self.lookahead == 4:
            raise ParseError("PANIC: too much lookahead!", self.fileId, self.line)

        self.tokenIndex = (self.tokenIndex - 1) & 3
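

# A minimal usage sketch (the sample source string and fileId below are made
# up for illustration): drive the tokenizer with done()/get() and inspect
# each token via the "token" property.
if __name__ == "__main__":
    tokenizer = Tokenizer("var answer = 6 * 7; // the usual", fileId="demo.js")
    while not tokenizer.done():
        tokenizer.get()
        tok = tokenizer.token
        print(tok.type, getattr(tok, "value", None), "line", tok.line)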