#
# Jasy - Web Tooling Framework
# Copyright 2013-2014 Sebastian Werner
#
import re
import copy
import jasy.core.Console as Console
import jasy.script.api.Comment as Comment
# Operator and punctuator mapping from token to tree node type name.
# NB: because the lexer doesn't backtrack, all token prefixes must themselves
# be valid tokens (e.g. !== is acceptable because its prefixes are the valid
# tokens != and !).
operatorNames = {
    '<' : 'lt',
    '>' : 'gt',
    '<=' : 'le',
    '>=' : 'ge',
    '!=' : 'ne',
    '==' : 'eq',
    '!' : 'not',
    '+' : 'plus',
    '*' : 'mul',
    '-' : 'minus',
    '/' : 'div',
    '%' : 'mod',
    '$' : 'dollar',
    '^' : 'carat',
    '|' : 'pipe',
    ',' : 'comma',
    ';' : 'semicolon',
    ':' : 'colon',
    '=' : 'assign',
    '&' : 'ampersand',
    '~' : 'tilde',
    '@' : 'at',
    '?' : 'questionmark',
    '&&' : 'and',
    '||' : 'or',
    ')' : 'right_paren',
    '(' : 'left_paren',
    '[' : 'left_bracket',
    ']' : 'right_bracket',
    '{' : 'left_curly',
    '}' : 'right_curly'
}
# Assignment operators
assignOperators = ["+", "-", "*", "/", "%", "?"]
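# For example, lexOp() combines characters greedily using the table above
# ("&" followed by "&" becomes the single token "and"), and a trailing "="
# after one of the assignment operators produces an "assign" token
# (e.g. "+=" yields type "assign" with assignOp "plus").
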
#
# Classes
#
class Token:
    __slots__ = ["type", "start", "line", "assignOp", "end", "value", "unit", "quote"]

class TokenizerError(Exception):
    def __init__(self, message, fileId, line):
        self.message = "Tokenization Error: %s" % message
        self.fileId = fileId
        self.line = line

        Exception.__init__(self, self.message)

    def __str__(self):
        return "%s in %s at %s" % (self.message, self.fileId, self.line)

class Tokenizer(object):
    def __init__(self, source, fileId="", line=1):
        # source: JavaScript source
        # fileId: Filename (for debugging purposes)
        # line: Line number (for debugging purposes)

        self.cursor = 0
        self.source = str(source)
        self.tokens = {}
        self.tokenIndex = 0
        self.lookahead = 0
        self.scanNewlines = False
        self.fileId = fileId
        self.line = line
        self.comments = []

    input_ = property(lambda self: self.source[self.cursor:])
    token = property(lambda self: self.tokens.get(self.tokenIndex))

    def done(self):
        # We need to set scanOperand to true here because the first thing
        # might be a regexp.
        return self.peek(True) == "end"

    def match(self, tokenType, scanOperand=False):
        return self.get(scanOperand) == tokenType or self.unget()

    def mustMatch(self, tokenType):
        if not self.match(tokenType):
            raise TokenizerError("Missing " + tokenType, self.fileId, self.line)

        return self.token

    def find(self, anyOf):
        point = self.save()

        while True:
            tokenType = self.get()
            if tokenType in anyOf:
                self.rewind(point)
                return tokenType

            # Stop searching once the end of the input is reached,
            # otherwise get() would return "end" forever.
            if tokenType == "end":
                break

        self.rewind(point)
        return None

    def peek(self, scanOperand=False):
        if self.lookahead:
            next = self.tokens.get((self.tokenIndex + self.lookahead) & 3)
            if self.scanNewlines and (getattr(next, "line", None) != getattr(self, "line", None)):
                tokenType = "newline"
            else:
                tokenType = getattr(next, "type", None)
        else:
            tokenType = self.get(scanOperand)
            self.unget()

        return tokenType

    def peekOnSameLine(self, scanOperand=False):
        self.scanNewlines = True
        tokenType = self.peek(scanOperand)
        self.scanNewlines = False

        return tokenType

    def skip(self):
        """Eats comments and whitespace."""
        input = self.source
        startLine = self.line

        # Whether this is the first call, as happens when parsing of a file
        # starts (eat leading comments/white space)
        startOfFile = self.cursor == 0

        indent = ""

        self.skippedSpaces = False
        self.skippedComments = False
        self.skippedLineBreaks = False

        while (True):
            if len(input) > self.cursor:
                ch = input[self.cursor]
            else:
                break

            self.cursor += 1

            if len(input) > self.cursor:
                next = input[self.cursor]
            else:
                next = None

            if ch == "\n" and not self.scanNewlines:
                self.line += 1
                indent = ""
                self.skippedLineBreaks = True

            elif ch == "/" and next == "*":
                self.cursor += 1
                self.skippedComments = True

                text = "/*"
                inline = startLine == self.line and startLine > 1
                commentStartLine = self.line

                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # comment for maybe multiple following lines of code, but not
                    # that important (no visual white space divider)
                    mode = "block"

                while (True):
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise TokenizerError("Unterminated comment", self.fileId, self.line)

                    if ch == "*":
                        next = input[self.cursor]
                        if next == "/":
                            text += "*/"
                            self.cursor += 1
                            break

                    elif ch == "\n":
                        self.line += 1

                    text += ch

                # Filter escaping on slash-star combinations in comment text
                text = text.replace("*\\/", "*/")

                try:
                    self.comments.append(Comment.Comment(text, mode, commentStartLine, indent, self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            elif ch == "/" and next == "/":
                self.cursor += 1
                self.skippedComments = True

                text = "//"

                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # comment for maybe multiple following lines of code, but not
                    # that important (no visual white space divider)
                    mode = "block"

                while (True):
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        # end of file etc.
                        break

                    if ch == "\n":
                        self.line += 1
                        break

                    text += ch

                try:
                    self.comments.append(Comment.Comment(text, mode, self.line - 1, "", self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            # check for whitespace, also for special cases like 0xA0
            elif ch in "\xA0 \t":
                self.skippedSpaces = True
                indent += ch

            else:
                self.cursor -= 1
                break

    def lexZeroNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        ch = input[self.cursor]
        self.cursor += 1

        if ch == ".":
            while(True):
                ch = input[self.cursor]
                self.cursor += 1

                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            token.value = float(input[token.start:self.cursor])

        elif ch == "x" or ch == "X":
            while(True):
                ch = input[self.cursor]
                self.cursor += 1

                if not ((ch >= "0" and ch <= "9") or (ch >= "a" and ch <= "f") or (ch >= "A" and ch <= "F")):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        else:
            self.cursor -= 1
            token.value = 0

        unit = self.lexUnit()
        if unit:
            token.unit = unit

    def lexNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        floating = False
        while(True):
            ch = input[self.cursor]
            self.cursor += 1

            if ch == "." and not floating:
                floating = True
                ch = input[self.cursor]
                self.cursor += 1

            if not (ch >= "0" and ch <= "9"):
                break

        self.cursor -= 1

        segment = input[token.start:self.cursor]

        # Protect float or exponent numbers
        if floating:
            token.value = float(segment)
        else:
            token.value = int(segment)

        unit = self.lexUnit()
        if unit:
            token.unit = unit

    def lexUnit(self):
        """Parses units like %, cm, inch, px, etc."""
        start = self.cursor
        input = self.source

        while(True):
            ch = input[self.cursor]
            self.cursor += 1

            if not ((ch >= "a" and ch <= "z") or ch == "%"):
                break

        self.cursor -= 1

        segment = input[start:self.cursor]
        return segment

    def lexDot(self, ch):
        token = self.token
        input = self.source
        next = input[self.cursor]

        if next >= "0" and next <= "9":
            while (True):
                ch = input[self.cursor]
                self.cursor += 1

                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1

            token.type = "number"
            token.value = float(input[token.start:self.cursor])

            unit = self.lexUnit()
            if unit:
                token.unit = unit

        else:
            token.type = "dot"

    def lexString(self, ch):
        token = self.token
        input = self.source
        token.type = "string"

        hasEscapes = False
        delim = ch
        ch = input[self.cursor]
        length = len(input)
        self.cursor += 1

        while ch != delim:
            if ch == "\\":
                hasEscapes = True
                self.cursor += 1

            if self.cursor >= length:
                raise TokenizerError("Missing end quote for string!", self.fileId, self.line)

            ch = input[self.cursor]
            self.cursor += 1

        token.value = str(input[token.start + 1:self.cursor - 1])
        token.quote = input[token.start]

    def lexOp(self, ch):
        token = self.token
        input = self.source

        op = ch
        while(True):
            try:
                next = input[self.cursor]
            except IndexError:
                break

            if (op + next) in operatorNames:
                self.cursor += 1
                op += next
            else:
                break

        try:
            next = input[self.cursor]
        except IndexError:
            next = None

        if next == "=" and op in assignOperators:
            self.cursor += 1
            token.type = "assign"
            token.assignOp = operatorNames[op]
            op += "="
        elif op in operatorNames:
            token.type = operatorNames[op]
            token.assignOp = None
        else:
            raise TokenizerError("Unknown operator: %s!" % op, self.fileId, self.line)

    def lexIdent(self, ch):
        token = self.token
        input = self.source

        # Variables/Commands should support packaged/namespaced names e.g. "foo.bar"
        isVariable = input[token.start] == "$"
        isCommand = input[token.start] == "@"
        isHex = input[token.start] == "#"

        # Support variable blocks e.g. ${foo}
        inVariableBlock = False
        if isVariable and input[self.cursor] == "{":
            inVariableBlock = True
            self.cursor += 1

        try:
            while True:
                ch = input[self.cursor]
                self.cursor += 1

                if not ((ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or (ch >= "0" and ch <= "9") or ch == "_" or ch == "-" or ch == "."):
                    break

        except IndexError:
            self.cursor += 1
            pass

        # Put the non-word character back.
        self.cursor -= 1

        # Compute start offset
        startOffset = 0
        if isCommand or isVariable:
            if inVariableBlock:
                startOffset = 2
            else:
                startOffset = 1

        # Extract identifier part
        identifier = input[token.start + startOffset:self.cursor]

        # Support for variable blocks e.g. ${foo}
        if inVariableBlock:
            # Check whether next character would be the required curly brace
            if input[self.cursor] != "}":
                raise TokenizerError("Invalid variable block identifier: %s" % identifier, self.fileId, self.line)

            # Jump over closing curly brace
            self.cursor += 1

        if len(identifier) == 0 and (isCommand or isVariable or isHex):
            raise TokenizerError("Invalid identifier: %s" % identifier, self.fileId, self.line)

        if isCommand:
            token.type = "command"
            token.value = identifier

        elif isVariable:
            token.type = "variable"
            token.value = identifier

        elif identifier == "true" or identifier == "false" or identifier == "null" or identifier == "and" or identifier == "or" or identifier == "not":
            token.type = identifier

        else:
            token.type = "identifier"
            token.value = identifier

    def get(self, scanOperand=False):
        """
        Consumes input *only* if there is no lookahead.

        Dispatches to the appropriate lexing function depending on the input.
        """
        while self.lookahead:
            self.lookahead -= 1
            self.tokenIndex = (self.tokenIndex + 1) & 3
            token = self.tokens[self.tokenIndex]
            if token.type != "newline" or self.scanNewlines:
                return token.type

        self.skip()

        self.tokenIndex = (self.tokenIndex + 1) & 3
        self.tokens[self.tokenIndex] = token = Token()

        token.start = self.cursor
        token.line = self.line

        input = self.source
        if self.cursor == len(input):
            token.end = token.start
            token.type = "end"
            return token.type

        ch = input[self.cursor]
        self.cursor += 1

        # Peek to next character
        if (ch == "-" or ch == "#" or ch == "$" or ch == "@") and len(input) > self.cursor:
            nextCh = input[self.cursor]
        else:
            nextCh = None

        # Identifiers (or single operators)
        if (ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or ch == "$" or ch == "@" or ch == "_" or ch == "#" or ch == "-":
            # Lex as identifier if not started with a special symbol
            if nextCh is None:
                self.lexIdent(ch)

            # Lex as identifier when next character is an actual character
            elif (nextCh >= "a" and nextCh <= "z") or (nextCh >= "A" and nextCh <= "Z"):
                self.lexIdent(ch)

            # For hex value still lex as identifier when next character is a number
            elif ch == "#" and (nextCh >= "0" and nextCh <= "9"):
                self.lexIdent(ch)

            # Variable in boundary
            elif ch == "$" and nextCh == "{":
                self.lexIdent(ch)

            # Engine prefixed system command
            elif ch == "@" and nextCh == "-":
                self.lexIdent(ch)

            # Otherwise lex as a trivial operator
            else:
                self.lexOp(ch)

        elif ch == ".":
            self.lexDot(ch)

        elif self.scanNewlines and ch == "\n":
            token.type = "newline"
            self.line += 1

        elif ch in operatorNames:
            self.lexOp(ch)

        elif ch >= "1" and ch <= "9":
            self.lexNumber(ch)

        elif ch == "0":
            self.lexZeroNumber(ch)

        elif ch == '"' or ch == "'":
            self.lexString(ch)

        else:
            raise TokenizerError("Illegal token: %s (Code: %s) - Next: %s (Code: %s)" % (ch, ord(ch), nextCh, nextCh and ord(nextCh)), self.fileId, self.line)

        token.end = self.cursor
        return token.type

    def unget(self):
        """match() depends on unget() returning None (a falsy value)."""
        self.lookahead += 1

        if self.lookahead == 4:
            raise TokenizerError("PANIC: too much lookahead!", self.fileId, self.line)

        self.tokenIndex = (self.tokenIndex - 1) & 3

    def save(self):
        return {
            "cursor": self.cursor,
            "tokenIndex": self.tokenIndex,
            "tokens": copy.copy(self.tokens),
            "lookahead": self.lookahead,
            "scanNewlines": self.scanNewlines,
            "line": self.line,
            "skippedSpaces": self.skippedSpaces,
            "skippedComments": self.skippedComments,
            "skippedLineBreaks": self.skippedLineBreaks
        }

    def rewind(self, point):
        self.cursor = point["cursor"]
        self.tokenIndex = point["tokenIndex"]
        self.tokens = copy.copy(point["tokens"])
        self.lookahead = point["lookahead"]
        self.scanNewlines = point["scanNewlines"]
        self.line = point["line"]
        self.skippedSpaces = point["skippedSpaces"]
        self.skippedComments = point["skippedComments"]
        self.skippedLineBreaks = point["skippedLineBreaks"]
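

if __name__ == "__main__":
    # Minimal usage sketch (assumes the jasy package with its Console and
    # Comment modules is importable): drive the tokenizer over a tiny inline
    # source and print each token's type, value and unit.
    tokenizer = Tokenizer("$width = 100px;", fileId="demo")
    while not tokenizer.done():
        tokenizer.get()
        tok = tokenizer.token
        print(tok.type, getattr(tok, "value", None), getattr(tok, "unit", None))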