"""Implementation of JSONDecoder
"""
from __future__ import absolute_import
import re
import sys
import struct
from .compat import fromhex, u, text_type, binary_type, PY2, unichr, ascii
from dirtyjson.attributed_containers import AttributedDict, AttributedList
from .error import Error


def _floatconstants():
    """Return ``(nan, inf, -inf)`` unpacked from hard-coded 8-byte patterns."""
    _BYTES = fromhex('7FF80000000000007FF0000000000000')
    # The struct module in Python 2.4 would get frexp() out of range here
    # when an endian is specified in the format string. Fixed in Python 2.5+
    if sys.byteorder != 'big':
        # The literal above is written big-endian; reverse each 8-byte half
        # so the native-order unpack below reads it correctly.
        _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
    nan, inf = struct.unpack('dd', _BYTES)
    return nan, inf, -inf


NaN, PosInf, NegInf = _floatconstants()

# Bare words that scan() turns into Python values (via parse_constant).
_CONSTANTS = {
    'null': None,
    'true': True,
    'false': False,
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}

CONSTANT_RE = re.compile('(%s)' % '|'.join(_CONSTANTS))
# Optional '-', then a hex (0x...) or decimal integer, an optional fraction
# and an optional exponent.
NUMBER_RE = re.compile(r'(-?(?:0x[\da-fA-F]+|\d+))(\.\d+)?([eE][-+]?\d+)?')
# Characters accepted as part of an arithmetic expression handed to eval().
EQUATION_RE = re.compile(r'[0-9.+\-]*[()[0-9+\-*/eEx&|]+')
# Lazily capture text up to the next quote, backslash or control character.
STRINGCHUNK_DOUBLEQUOTE = re.compile(r'(.*?)(["\\\x00-\x1f])')
STRINGCHUNK_SINGLEQUOTE = re.compile(r"(.*?)(['\\\x00-\x1f])")
UNQUOTED_KEYNAME = re.compile(r"([\w_$]+[\w\d_$]*)")
WHITESPACE_STR = ' \t\n\r'
WHITESPACE = re.compile('[%s]*' % WHITESPACE_STR, re.VERBOSE | re.MULTILINE | re.DOTALL)

# Single-character escapes (the character following a backslash) and the
# text each one decodes to.
BACKSLASH = {
    '"': u('"'), '\'': u('\''), '\\': u('\u005c'), '/': u('/'),
    'b': u('\b'), 'f': u('\f'), 'n': u('\n'), 'r': u('\r'), 't': u('\t'),
}

DEFAULT_ENCODING = "utf-8"


class Position(object):
    """A line/column location inside the parsed document."""

    def __init__(self, line, column):
        self.line = line
        self.column = column

    def __lt__(self, other):
        """Order positions by line first, then by column."""
        return (self.line, self.column) < (other.line, other.column)


class KeyValuePosition(object):
    """Pair of positions recorded for an object member: its key and its value."""

    def __init__(self, key_position, value_position):
        self.key = key_position
        self.value = value_position


class DirtyJSONLoader(object):
    """JSON decoder that can handle muck in the file

    Performs the following translations in decoding by default:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | AttributedDict    |
    +---------------+-------------------+
    | array         | AttributedList    |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
    their corresponding ``float`` values, which is outside the JSON spec.

    Beyond strict JSON the loader accepts single-quoted strings, unquoted
    property names, ``//`` and ``/* */`` comments, hexadecimal and
    leading-zero (octal) integers, and simple arithmetic expressions.
    """

    def __init__(self, content, encoding=None, parse_float=None, parse_int=None,
                 parse_constant=None):
        """Prepare a loader over ``content``.

        :param content: the document text; on Python 3 a bytes object is
            decoded with the default codec first.
        :param encoding: when given and different from ``utf-8``, every
            character of the content must be representable in it.
        :param parse_float: called with the text of each real number
            (default ``float``).
        :param parse_int: called with each integer after it has been
            converted by ``int(text, 0)`` (default ``int``).
        :param parse_constant: called with the name of each constant found
            (default: lookup in the built-in constant table).
        :raises Error: if the content holds a character the requested
            encoding cannot represent.
        """
        self.encoding = encoding or DEFAULT_ENCODING
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        self.memo = {}
        if not PY2 and isinstance(content, binary_type):
            self.content = content.decode()
        else:
            self.content = content
        if self.encoding != DEFAULT_ENCODING:
            # Round-trip through the requested encoding, dropping whatever it
            # cannot represent; any difference marks an unsupported character.
            # Decode with the same codec used to encode, otherwise valid
            # non-ASCII text is mangled or rejected.
            fixed = self.content.encode(self.encoding, 'ignore').decode(self.encoding)
            if self.content != fixed:
                for index, character in enumerate(self.content):
                    # ``fixed`` is shorter than the content, so it may run out
                    # before the offending character is reached.
                    if index >= len(fixed) or character != fixed[index]:
                        raise Error('Non-{} character {}'.format(self.encoding, ascii(character)),
                                    self.content, index)
        self.end = len(self.content)
        self.lineno = 1
        self.current_line_pos = 0
        self.pos = 0
        self.expecting = 'Expecting value'

    def _next_character(self):
        """Consume and return the next character.

        :raises Error: (with the current ``expecting`` message) at end of input.
        """
        try:
            nextchar = self.content[self.pos]
            self.pos += 1
            return nextchar
        except IndexError:
            raise Error(self.expecting, self.content, self.pos)

    def _next_character_after_whitespace(self):
        """Skip whitespace/comments, then consume and return one character.

        Returns ``''`` (without advancing) when the input is exhausted.
        """
        try:
            nextchar = self.content[self.pos]
            # '/' may open a comment that is not preceded by whitespace; the
            # skipper leaves the position untouched when it does not.
            if nextchar in WHITESPACE_STR or nextchar == '/':
                self._skip_whitespace()
                nextchar = self.content[self.pos]
            self.pos += 1
            return nextchar
        except IndexError:
            return ''

    def _skip_whitespace(self):
        """Advance past any run of whitespace, ``//`` and ``/* */`` comments.

        An unterminated comment extends to the end of the input.
        """
        while True:
            self._skip_forward_to(WHITESPACE.match(self.content, self.pos).end())
            if self.pos > self.end - 2:
                # Fewer than two characters left: no room for a comment opener.
                break
            two_chars = self.content[self.pos:self.pos + 2]
            if two_chars == '//' or two_chars == '/*':
                terminator = '\n' if two_chars == '//' else '*/'
                # find() (not index()) so a missing terminator yields -1
                # instead of raising; start after the opener so that '/*/'
                # is not mistaken for a complete comment.
                comment_end = self.content.find(terminator, self.pos + 2)
                if comment_end >= 0:
                    self._skip_forward_to(comment_end + len(terminator))
                else:
                    self._skip_forward_to(self.end)
                    break
            else:
                break

    def _skip_forward_to(self, end):
        """Move the cursor to ``end``, keeping line bookkeeping up to date."""
        if end != self.pos:
            linefeeds = self.content.count('\n', self.pos, end)
            if linefeeds:
                self.lineno += linefeeds
                # The current line starts right after the last newline skipped.
                rpos = self.content.rfind('\n', self.pos, end)
                self.current_line_pos = rpos + 1
            self.pos = end

    def _current_position(self, offset=0):
        """Return the cursor as a 1-based ``Position``, shifted by ``offset`` columns."""
        return Position(self.lineno, self.pos - self.current_line_pos + 1 + offset)

    def scan(self):
        """Parse and return the value that starts at the cursor.

        Tries, in order: string, object, array, constant, number and
        arithmetic expression.

        :raises Error: if nothing matches or an expression cannot be evaluated.
        """
        self.expecting = 'Expecting value'
        nextchar = self._next_character()

        if nextchar == '"' or nextchar == "'":
            return self.parse_string(nextchar)
        if nextchar == '{':
            return self.parse_object()
        if nextchar == '[':
            return self.parse_array()

        # Not a delimiter: put the character back for the regex matchers.
        self.pos -= 1
        m = CONSTANT_RE.match(self.content, self.pos)
        if m:
            self.pos = m.end()
            return self.parse_constant(m.group(1))

        m = NUMBER_RE.match(self.content, self.pos)
        # A number directly followed by an operator is part of an expression
        # and is left for EQUATION_RE below.
        if m and (m.end() == len(self.content) or self.content[m.end()] not in '+-/*()'):
            integer, frac, exp = m.groups()
            if frac or exp:
                res = self.parse_float(integer + (frac or '') + (exp or ''))
            else:
                try:
                    res = self.parse_int(int(integer, 0))
                except ValueError:
                    # int(..., 0) rejects leading zeros; retry as octal.
                    if integer[0] == '0':
                        integer = '0o' + integer[1:]
                        res = self.parse_int(int(integer, 0))
                    else:
                        raise
            self.pos = m.end()
            return res

        m = EQUATION_RE.match(self.content, self.pos)
        if m:
            # NOTE(security): eval() runs on text taken from the document.
            # EQUATION_RE restricts it to digits, '.', a few operators,
            # parentheses, '[' and the letters e/E/x, but an expression such
            # as 9**9**9**9 can still exhaust CPU or memory, and errors other
            # than SyntaxError/NameError (e.g. ZeroDivisionError) propagate
            # unchanged. Do not feed untrusted input without extra limits.
            try:
                res = eval(m.group(0))
            except (SyntaxError, NameError):
                raise Error('Cannot evaluate expression', self.content, self.pos)
            self.pos = m.end()
            return res
        raise Error(self.expecting, self.content, self.pos)

    def parse_string(self, terminating_character,
                     _b=BACKSLASH, _join=u('').join,
                     _py2=PY2, _maxunicode=sys.maxunicode):
        """Parse the string whose opening quote has just been consumed.

        ``terminating_character`` is the quote (``"`` or ``'``) that opened
        the string and must close it. Backslash escapes are decoded; literal
        control characters are kept as they are. On return the cursor sits
        just past the closing quote.

        :returns: the decoded string.
        :raises Error: on an unterminated string or an invalid escape sequence.
        """
        _m = STRINGCHUNK_DOUBLEQUOTE.match if terminating_character == '"' else STRINGCHUNK_SINGLEQUOTE.match
        chunks = []
        _append = chunks.append
        begin = self.pos - 1
        while 1:
            chunk = _m(self.content, self.pos)
            if chunk is None:
                raise Error(
                    "Unterminated string starting at", self.content, begin)
            self.pos = chunk.end()
            content, terminator = chunk.groups()
            # Content contains zero or more unescaped string characters
            if content:
                if _py2 and not isinstance(content, text_type):
                    content = text_type(content, self.encoding)
                _append(content)
            # Terminator is the end of string, a literal control character,
            # or a backslash denoting that an escape sequence follows
            if terminator == terminating_character:
                break
            elif terminator != '\\':
                _append(terminator)
                continue
            try:
                esc = self.content[self.pos]
            except IndexError:
                raise Error(
                    "Unterminated string starting at", self.content, begin)
            # If not a unicode escape sequence, must be in the lookup table
            if esc != 'u':
                try:
                    char = _b[esc]
                except KeyError:
                    msg = "Invalid \\X escape sequence %r"
                    raise Error(msg, self.content, self.pos)
                self.pos += 1
            else:
                # Unicode escape sequence
                msg = "Invalid \\uXXXX escape sequence"
                esc = self.content[self.pos + 1:self.pos + 5]
                # int(..., 16) would accept a '0x' prefix; reject it up front.
                esc_x = esc[1:2]
                if len(esc) != 4 or esc_x == 'x' or esc_x == 'X':
                    raise Error(msg, self.content, self.pos - 1)
                try:
                    uni = int(esc, 16)
                except ValueError:
                    raise Error(msg, self.content, self.pos - 1)
                self.pos += 5
                # Check for surrogate pair on UCS-4 systems
                # Note that this will join high/low surrogate pairs
                # but will also pass unpaired surrogates through
                if (_maxunicode > 65535 and uni & 0xfc00 == 0xd800 and
                        self.content[self.pos:self.pos + 2] == '\\u'):
                    esc2 = self.content[self.pos + 2:self.pos + 6]
                    esc_x = esc2[1:2]
                    if len(esc2) == 4 and not (esc_x == 'x' or esc_x == 'X'):
                        try:
                            uni2 = int(esc2, 16)
                        except ValueError:
                            raise Error(msg, self.content, self.pos)
                        if uni2 & 0xfc00 == 0xdc00:
                            uni = 0x10000 + (((uni - 0xd800) << 10) |
                                             (uni2 - 0xdc00))
                            self.pos += 6
                char = unichr(uni)
            # Append the unescaped character
            _append(char)
        return _join(chunks)

    def parse_object(self):
        """Parse an object whose opening ``{`` has just been consumed.

        Keys may be quoted with either quote character or left unquoted.
        A trailing comma before ``}`` is accepted. Each member is stored
        together with a ``KeyValuePosition`` for its key and value.

        :returns: the populated ``AttributedDict``.
        :raises Error: on a missing property name, ``:`` or ``,``/``}``.
        """
        # setdefault(key, key) returns the first equal key seen, so repeated
        # keys share a single string object.
        memo_get = self.memo.setdefault
        obj = AttributedDict()
        nextchar = self._next_character_after_whitespace()
        while True:
            if nextchar == '}':
                break
            if nextchar == '':
                # End of input: there is no character to start a key with.
                raise Error(
                    "Expecting property name",
                    self.content, self.pos)
            # nextchar is already consumed, so step back to where it started.
            key_pos = self._current_position(-len(nextchar))
            if nextchar == '"' or nextchar == "'":
                key = self.parse_string(nextchar)
            else:
                chunk = UNQUOTED_KEYNAME.match(self.content, self.pos - 1)
                if chunk is None:
                    raise Error(
                        "Expecting property name",
                        self.content, self.pos)
                self.pos = chunk.end()
                key = chunk.group(1)
            key = memo_get(key, key)

            if self._next_character_after_whitespace() != ':':
                raise Error("Expecting ':' delimiter", self.content, self.pos)
            self._skip_whitespace()

            key_value_pos = KeyValuePosition(key_pos, self._current_position())
            value = self.scan()
            obj.add_with_attributes(key, value, key_value_pos)

            nextchar = self._next_character_after_whitespace()
            if nextchar == '}':
                break
            elif nextchar != ',':
                raise Error("Expecting ',' delimiter or '}'", self.content, self.pos - len(nextchar))
            nextchar = self._next_character_after_whitespace()

        return obj

    def parse_array(self):
        """Parse an array whose opening ``[`` has just been consumed.

        A trailing comma before ``]`` is accepted. Each element is appended
        together with the ``Position`` at which it starts.

        :returns: the populated ``AttributedList``.
        :raises Error: on a missing value or ``,``/``]``.
        """
        values = AttributedList()
        nextchar = self._next_character_after_whitespace()
        # Look-ahead for trivial empty array
        if nextchar == ']':
            return values
        elif nextchar == '':
            raise Error("Expecting value or ']'", self.content, self.pos)
        while True:
            if nextchar == ']':
                break
            # Give the look-ahead character back so scan() sees the whole value.
            self.pos -= len(nextchar)
            value_pos = self._current_position()
            value = self.scan()
            values.append(value, value_pos)
            nextchar = self._next_character_after_whitespace()
            if nextchar == ']':
                break
            elif nextchar != ',':
                raise Error("Expecting ',' delimiter or ']'", self.content, self.pos - len(nextchar))
            nextchar = self._next_character_after_whitespace()

        return values

    def decode(self, search_for_first_object=False, start_index=0):
        """Return the Python representation of the loaded content.

        :param search_for_first_object: when true, skip ahead to the first
            ``[`` or ``{`` found at or after the starting position.
        :param start_index: offset in the content at which to begin.
        :raises Error: if no valid value can be parsed.
        """
        if start_index:
            self._skip_forward_to(start_index)

        if search_for_first_object:
            i = self.content.find('[', self.pos)
            o = self.content.find('{', self.pos)
            # Use whichever bracket comes first, ignoring one that is absent.
            if i > o >= self.pos or i < 0:
                i = o
            if i >= self.pos:
                self._skip_forward_to(i)

        self._skip_whitespace()
        obj = self.scan()
        return obj