Source code for babel.messages.jslexer

# -*- coding: utf-8 -*-
"""
    babel.messages.jslexer
    ~~~~~~~~~~~~~~~~~~~~~~

    A simple JavaScript 1.5 lexer which is used for the JavaScript
    extractor.

    :copyright: (c) 2013 by the Babel Team.
    :license: BSD, see LICENSE for more details.
"""

from operator import itemgetter
import re
from babel._compat import unichr

operators = [
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
]
operators.sort(key=lambda a: -len(a))

escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

rules = [
    (None, re.compile(r'\s+(?u)')),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/(?us)')),
    ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)(?u)')),
    ('number', re.compile(r'''(?x)(
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)? |
        (0x[a-fA-F0-9]+)
    )''')),
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('string', re.compile(r'''(?xs)(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)'  |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )'''))
]

division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*(?s)')
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')


[docs]class Token(tuple): """Represents a token as returned by `tokenize`.""" __slots__ = () def __new__(cls, type, value, lineno): return tuple.__new__(cls, (type, value, lineno)) type = property(itemgetter(0)) value = property(itemgetter(1)) lineno = property(itemgetter(2))
[docs]def indicates_division(token): """A helper function that helps the tokenizer to decide if the current token may be followed by a division operator. """ if token.type == 'operator': return token.value in (')', ']', '}', '++', '--') return token.type in ('name', 'number', 'string', 'regexp')
[docs]def unquote_string(string): """Unquote a string with JavaScript rules. The string has to start with string delimiters (``'`` or ``"``.) """ assert string and string[0] == string[-1] and string[0] in '"\'', \ 'string provided is not properly delimited' string = line_join_re.sub('\\1', string[1:-1]) result = [] add = result.append pos = 0 while 1: # scan for the next escape escape_pos = string.find('\\', pos) if escape_pos < 0: break add(string[pos:escape_pos]) # check which character is escaped next_char = string[escape_pos + 1] if next_char in escapes: add(escapes[next_char]) # unicode escapes. trie to consume up to four characters of # hexadecimal characters and try to interpret them as unicode # character point. If there is no such character point, put # all the consumed characters into the string. elif next_char in 'uU': escaped = uni_escape_re.match(string, escape_pos + 2) if escaped is not None: escaped_value = escaped.group() if len(escaped_value) == 4: try: add(unichr(int(escaped_value, 16))) except ValueError: pass else: pos = escape_pos + 6 continue add(next_char + escaped_value) pos = escaped.end() continue else: add(next_char) # bogus escape. Just remove the backslash. else: add(next_char) pos = escape_pos + 2 if pos < len(string): add(string[pos:]) return u''.join(result)
[docs]def tokenize(source): """Tokenize a JavaScript source. Returns a generator of tokens. """ may_divide = False pos = 0 lineno = 1 end = len(source) while pos < end: # handle regular rules first for token_type, rule in rules: match = rule.match(source, pos) if match is not None: break # if we don't have a match we don't give up yet, but check for # division operators or regular expression literals, based on # the status of `may_divide` which is determined by the last # processed non-whitespace token using `indicates_division`. else: if may_divide: match = division_re.match(source, pos) token_type = 'operator' else: match = regex_re.match(source, pos) token_type = 'regexp' if match is None: # woops. invalid syntax. jump one char ahead and try again. pos += 1 continue token_value = match.group() if token_type is not None: token = Token(token_type, token_value, lineno) may_divide = indicates_division(token) yield token lineno += len(line_re.findall(token_value)) pos = match.end()