# Module: jasy.style.tokenize.Tokenizer

#
# Jasy - Web Tooling Framework
# Copyright 2013-2014 Sebastian Werner
#

import re
import copy

import jasy.core.Console as Console
import jasy.script.api.Comment as Comment


# Operator and punctuator mapping from token to tree node type name.
# NB: because the lexer doesn't backtrack, all token prefixes must themselves
# be valid tokens (e.g. && is acceptable because its prefix & is itself a
# valid token).
operatorNames = {
    '<'   : 'lt',
    '>'   : 'gt',
    '<='  : 'le',
    '>='  : 'ge',
    '!='  : 'ne',
    '=='  : 'eq',

    '!'   : 'not',

    '+'   : 'plus',
    '*'   : 'mul',
    '-'   : 'minus',
    '/'   : 'div',
    '%'   : 'mod',
    '$'   : 'dollar',
    '^'   : 'carat',
    '|'   : 'pipe',

    ','   : 'comma',
    ';'   : 'semicolon',
    ':'   : 'colon',
    '='   : 'assign',
    '&'   : 'ampersand',
    '~'   : 'tilde',
    '@'   : 'at',
    '?'   : 'questionmark',

    '&&'  : 'and',
    '||'  : 'or',

    ')'   : 'right_paren',
    '('   : 'left_paren',
    '['   : 'left_bracket',
    ']'   : 'right_bracket',
    '{'   : 'left_curly',
    '}'   : 'right_curly'
}
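
# Because lexOp() only extends an operator while the longer string is still a
# key of this mapping, multi-character operators lex via maximal munch: "<="
# becomes a single "le" token rather than "lt" followed by "assign", and "&&"
# becomes "and" because its prefix "&" is a valid token of its own.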


# Assignment operators
assignOperators = ["+", "-", "*", "/", "%", "?"]
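
# For the operators above, lexOp() folds a trailing "=" into a compound
# assignment: "*=" yields an "assign" token with assignOp "mul", "?=" one with
# assignOp "questionmark". A plain "=" stays a simple "assign" token with
# assignOp None.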


#
# Classes
#

class Token:
    __slots__ = ["type", "start", "line", "assignOp", "end", "value", "unit", "quote"]


class TokenizerError(Exception):
    def __init__(self, message, fileId, line):
        self.message = "Tokenization Error: %s" % message
        self.fileId = fileId
        self.line = line

        Exception.__init__(self, self.message)

    def __str__(self):
        return "%s in %s at %s" % (self.message, self.fileId, self.line)


class Tokenizer(object):
    def __init__(self, source, fileId="", line=1):
        # source: Style sheet source code
        # fileId: File name (for debugging purposes)
        # line: Line number (for debugging purposes)

        self.cursor = 0
        self.source = str(source)
        self.tokens = {}
        self.tokenIndex = 0
        self.lookahead = 0
        self.scanNewlines = False
        self.fileId = fileId
        self.line = line
        self.comments = []

    input_ = property(lambda self: self.source[self.cursor:])
    token = property(lambda self: self.tokens.get(self.tokenIndex))

    def done(self):
        # We need to set scanOperand to true here because the first thing
        # might be a regexp.
        return self.peek(True) == "end"

    def match(self, tokenType, scanOperand=False):
        return self.get(scanOperand) == tokenType or self.unget()

    def mustMatch(self, tokenType):
        if not self.match(tokenType):
            raise TokenizerError("Missing " + tokenType, self.fileId, self.line)

        return self.token

    def find(self, anyOf):
        """Scans ahead for the next token matching one of the given types and returns it; the position is restored afterwards."""
        point = self.save()

        while True:
            tokenType = self.get()
            if tokenType in anyOf:
                self.rewind(point)
                return tokenType

            # Stop scanning at the end of the input to avoid looping forever
            if tokenType == "end":
                break

        self.rewind(point)
        return None

    def peek(self, scanOperand=False):
        if self.lookahead:
            next = self.tokens.get((self.tokenIndex + self.lookahead) & 3)
            if self.scanNewlines and (getattr(next, "line", None) != getattr(self, "line", None)):
                tokenType = "newline"
            else:
                tokenType = getattr(next, "type", None)
        else:
            tokenType = self.get(scanOperand)
            self.unget()

        return tokenType

    def peekOnSameLine(self, scanOperand=False):
        self.scanNewlines = True
        tokenType = self.peek(scanOperand)
        self.scanNewlines = False

        return tokenType

    def getComments(self):
        if self.comments:
            comments = self.comments
            self.comments = []
            return comments

        return None

    def skip(self):
        """Eats comments and whitespace."""
        input = self.source
        startLine = self.line

        # Whether this is the first call, as happens at the start of parsing a
        # file (eats leading comments/whitespace)
        startOfFile = self.cursor == 0

        indent = ""

        self.skippedSpaces = False
        self.skippedComments = False
        self.skippedLineBreaks = False

        while True:
            if len(input) > self.cursor:
                ch = input[self.cursor]
            else:
                break

            self.cursor += 1

            if len(input) > self.cursor:
                next = input[self.cursor]
            else:
                next = None

            if ch == "\n" and not self.scanNewlines:
                self.line += 1
                indent = ""
                self.skippedLineBreaks = True

            elif ch == "/" and next == "*":
                self.cursor += 1
                self.skippedComments = True
                text = "/*"
                inline = startLine == self.line and startLine > 1
                commentStartLine = self.line

                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # comment for maybe multiple following lines of code, but
                    # not that important (no visual white space divider)
                    mode = "block"

                while True:
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise TokenizerError("Unterminated comment", self.fileId, self.line)

                    if ch == "*":
                        next = input[self.cursor]
                        if next == "/":
                            text += "*/"
                            self.cursor += 1
                            break

                    elif ch == "\n":
                        self.line += 1

                    text += ch

                # Filter escaping on slash-star combinations in comment text
                text = text.replace("*\\/", "*/")

                try:
                    self.comments.append(Comment.Comment(text, mode, commentStartLine, indent, self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            elif ch == "/" and next == "/":
                self.cursor += 1
                self.skippedComments = True
                text = "//"

                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # comment for maybe multiple following lines of code, but
                    # not that important (no visual white space divider)
                    mode = "block"

                while True:
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        # end of file etc.
                        break

                    if ch == "\n":
                        self.line += 1
                        break

                    text += ch

                try:
                    self.comments.append(Comment.Comment(text, mode, self.line - 1, "", self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            # check for whitespace, also for special cases like 0xA0
            elif ch in "\xA0 \t":
                self.skippedSpaces = True
                indent += ch

            else:
                self.cursor -= 1
                break
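
    # The comment modes recorded by skip(): "inline" for a comment starting on
    # the same line as preceding code, "section" for a comment preceded by at
    # least one blank line (labelling a whole section), and "block" for a
    # comment sitting directly above the following code.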

    def lexZeroNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        ch = input[self.cursor]
        self.cursor += 1

        if ch == ".":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            token.value = float(input[token.start:self.cursor])

        elif ch == "x" or ch == "X":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not ((ch >= "0" and ch <= "9") or (ch >= "a" and ch <= "f") or (ch >= "A" and ch <= "F")):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        else:
            self.cursor -= 1
            token.value = 0

        unit = self.lexUnit()
        if unit:
            token.unit = unit

    def lexNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        floating = False
        while True:
            ch = input[self.cursor]
            self.cursor += 1

            if ch == "." and not floating:
                floating = True
                ch = input[self.cursor]
                self.cursor += 1

            if not (ch >= "0" and ch <= "9"):
                break

        self.cursor -= 1

        segment = input[token.start:self.cursor]

        # Protect float or exponent numbers
        if floating:
            token.value = float(segment)
        else:
            token.value = int(segment)

        unit = self.lexUnit()
        if unit:
            token.unit = unit

    def lexUnit(self):
        """Parses units like %, cm, inch, px, etc."""
        start = self.cursor
        input = self.source

        while True:
            ch = input[self.cursor]
            self.cursor += 1
            if not ((ch >= "a" and ch <= "z") or ch == "%"):
                break

        self.cursor -= 1

        segment = input[start:self.cursor]
        return segment
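
    # For instance, lexNumber() on "12.5em" stores value 12.5 and then this
    # method consumes "em" into token.unit; "50%" gets the unit "%".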

    def lexDot(self, ch):
        token = self.token
        input = self.source
        next = input[self.cursor]

        if next >= "0" and next <= "9":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            token.type = "number"
            token.value = float(input[token.start:self.cursor])

            unit = self.lexUnit()
            if unit:
                token.unit = unit

        else:
            token.type = "dot"

    def lexString(self, ch):
        token = self.token
        input = self.source
        token.type = "string"

        hasEscapes = False
        delim = ch
        ch = input[self.cursor]
        length = len(input)
        self.cursor += 1

        while ch != delim:
            if ch == "\\":
                hasEscapes = True
                self.cursor += 1

            if self.cursor >= length:
                raise TokenizerError("Missing end quote for string!", self.fileId, self.line)

            ch = input[self.cursor]
            self.cursor += 1

        token.value = str(input[token.start + 1:self.cursor - 1])
        token.quote = input[token.start]
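
    # Note that lexString() stores the raw text between the delimiters: escape
    # sequences are kept verbatim in token.value and the delimiter itself is
    # remembered in token.quote.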

    def lexOp(self, ch):
        token = self.token
        input = self.source

        op = ch
        while True:
            try:
                next = input[self.cursor]
            except IndexError:
                break

            if (op + next) in operatorNames:
                self.cursor += 1
                op += next
            else:
                break

        try:
            next = input[self.cursor]
        except IndexError:
            next = None

        if next == "=" and op in assignOperators:
            self.cursor += 1
            token.type = "assign"
            token.assignOp = operatorNames[op]
            op += "="
        elif op in operatorNames:
            token.type = operatorNames[op]
            token.assignOp = None
        else:
            raise TokenizerError("Unknown operator: %s!" % op, self.fileId, self.line)
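
    # For instance: "||" becomes a single "or" token, "+=" an "assign" token
    # with assignOp "plus", and a bare "+" a "plus" token with assignOp None.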

    def lexIdent(self, ch):
        token = self.token
        input = self.source

        # Variables/Commands should support packaged/namespaced names e.g. "foo.bar"
        isVariable = input[token.start] == "$"
        isCommand = input[token.start] == "@"
        isHex = input[token.start] == "#"

        # Support variable blocks e.g. ${foo}
        inVariableBlock = False
        if isVariable and input[self.cursor] == "{":
            inVariableBlock = True
            self.cursor += 1

        try:
            while True:
                ch = input[self.cursor]
                self.cursor += 1

                if not ((ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or (ch >= "0" and ch <= "9") or ch == "_" or ch == "-" or ch == "."):
                    break
        except IndexError:
            self.cursor += 1
            pass

        # Put the non-word character back.
        self.cursor -= 1

        # Compute start offset
        startOffset = 0
        if isCommand or isVariable:
            if inVariableBlock:
                startOffset = 2
            else:
                startOffset = 1

        # Extract identifier part
        identifier = input[token.start + startOffset:self.cursor]

        # Support for variable blocks e.g. ${foo}
        if inVariableBlock:
            # Check whether next character would be the required curly brace
            if input[self.cursor] != "}":
                raise TokenizerError("Invalid variable block identifier: %s" % identifier, self.fileId, self.line)

            # Jump over closing curly brace
            self.cursor += 1

        if len(identifier) == 0 and (isCommand or isVariable or isHex):
            raise TokenizerError("Invalid identifier: %s" % identifier, self.fileId, self.line)

        if isCommand:
            token.type = "command"
            token.value = identifier
        elif isVariable:
            token.type = "variable"
            token.value = identifier
        elif identifier == "true" or identifier == "false" or identifier == "null" or identifier == "and" or identifier == "or" or identifier == "not":
            token.type = identifier
        else:
            token.type = "identifier"
            token.value = identifier
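
    # How lexIdent() classifies its input, following the branches above:
    #   "$foo.bar" -> type "variable",   value "foo.bar"
    #   "${name}"  -> type "variable",   value "name"
    #   "@import"  -> type "command",    value "import"
    #   "#ff0000"  -> type "identifier", value "#ff0000" (hex keeps its "#")
    #   "true", "false", "null", "and", "or", "not" become their own types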

    def get(self, scanOperand=False):
        """
        Consumes input *only* if there is no lookahead.
        Dispatches to the appropriate lexing function depending on the input.
        """
        while self.lookahead:
            self.lookahead -= 1
            self.tokenIndex = (self.tokenIndex + 1) & 3
            token = self.tokens[self.tokenIndex]
            if token.type != "newline" or self.scanNewlines:
                return token.type

        self.skip()

        self.tokenIndex = (self.tokenIndex + 1) & 3
        self.tokens[self.tokenIndex] = token = Token()

        token.start = self.cursor
        token.line = self.line

        input = self.source
        if self.cursor == len(input):
            token.end = token.start
            token.type = "end"
            return token.type

        ch = input[self.cursor]
        self.cursor += 1

        # Peek to next character
        if (ch == "-" or ch == "#" or ch == "$" or ch == "@") and len(input) > self.cursor:
            nextCh = input[self.cursor]
        else:
            nextCh = None

        # Identifiers (or single operators)
        if (ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or ch == "$" or ch == "@" or ch == "_" or ch == "#" or ch == "-":

            # Lex as identifier if not started with a special symbol
            if nextCh is None:
                self.lexIdent(ch)

            # Lex as identifier when next character is an actual character
            elif (nextCh >= "a" and nextCh <= "z") or (nextCh >= "A" and nextCh <= "Z"):
                self.lexIdent(ch)

            # For hex values still lex as identifier when next character is a number
            elif ch == "#" and (nextCh >= "0" and nextCh <= "9"):
                self.lexIdent(ch)

            # Variable in boundary
            elif ch == "$" and nextCh == "{":
                self.lexIdent(ch)

            # Engine prefixed system command
            elif ch == "@" and nextCh == "-":
                self.lexIdent(ch)

            # Otherwise lex as a trivial operator
            else:
                self.lexOp(ch)

        elif ch == ".":
            self.lexDot(ch)

        elif self.scanNewlines and ch == "\n":
            token.type = "newline"
            self.line += 1

        elif ch in operatorNames:
            self.lexOp(ch)

        elif ch >= "1" and ch <= "9":
            self.lexNumber(ch)

        elif ch == "0":
            self.lexZeroNumber(ch)

        elif ch == '"' or ch == "'":
            self.lexString(ch)

        else:
            raise TokenizerError("Illegal token: %s (Code: %s) - Next: %s (Code: %s)" % (ch, ord(ch), nextCh, nextCh and ord(nextCh)), self.fileId, self.line)

        token.end = self.cursor
        return token.type

    def unget(self):
        """match() depends on unget() returning a falsy value (None)."""
        self.lookahead += 1

        if self.lookahead == 4:
            raise TokenizerError("PANIC: too much lookahead!", self.fileId, self.line)

        self.tokenIndex = (self.tokenIndex - 1) & 3
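
    # get() and unget() treat self.tokens as a four-slot ring buffer indexed
    # by (tokenIndex & 3), which is why lookahead is capped at three tokens
    # and the fourth consecutive unget() raises a TokenizerError.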

    def save(self):
        return {
            "cursor": self.cursor,
            "tokenIndex": self.tokenIndex,
            "tokens": copy.copy(self.tokens),
            "lookahead": self.lookahead,
            "scanNewlines": self.scanNewlines,
            "line": self.line,
            "skippedSpaces": self.skippedSpaces,
            "skippedComments": self.skippedComments,
            "skippedLineBreaks": self.skippedLineBreaks
        }

    def rewind(self, point):
        self.cursor = point["cursor"]
        self.tokenIndex = point["tokenIndex"]
        self.tokens = copy.copy(point["tokens"])
        self.lookahead = point["lookahead"]
        self.scanNewlines = point["scanNewlines"]
        self.line = point["line"]
        self.skippedSpaces = point["skippedSpaces"]
        self.skippedComments = point["skippedComments"]
        self.skippedLineBreaks = point["skippedLineBreaks"]
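

# A minimal sketch of driving the tokenizer by hand; the sample source and
# fileId below are illustrative only, as the class is normally driven by a
# parser through get()/peek()/match()/mustMatch().
if __name__ == "__main__":
    tokenizer = Tokenizer("$width = 100px;", fileId="demo.style")

    while not tokenizer.done():
        tokenizer.get()
        token = tokenizer.token
        print(token.type, getattr(token, "value", ""), getattr(token, "unit", ""))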