# File-viewer metadata (not part of the module):
#   path: brief-extractor/backend/venv/lib/python3.10/site-packages/dirtyjson/loader.py
#   date: 2026-03-06 18:42:46 +00:00
#   size: 373 lines, 14 KiB
#   type: Python, executable file

"""Implementation of JSONDecoder
"""
from __future__ import absolute_import
import re
import sys
import struct
from .compat import fromhex, u, text_type, binary_type, PY2, unichr, ascii
from dirtyjson.attributed_containers import AttributedDict, AttributedList
from .error import Error
def _floatconstants():
    """Build the special float values used by the decoder.

    Returns:
        A ``(nan, inf, -inf)`` tuple of floats.
    """
    # Big-endian IEEE-754 bit patterns: 0x7FF8... is a NaN and 0x7FF0... is
    # +infinity.  (fromhex comes from .compat; presumably it returns the 16
    # raw bytes for this hex string.)
    packed = fromhex('7FF80000000000007FF0000000000000')
    nan_bytes, inf_bytes = packed[:8], packed[8:]
    # 'dd' unpacks in native byte order, so each 8-byte double is flipped on
    # non-big-endian hosts.  An explicit endian marker is avoided because the
    # struct module in Python 2.4 would get "frexp() out of range" here
    # (fixed in Python 2.5+).
    if sys.byteorder != 'big':
        nan_bytes, inf_bytes = nan_bytes[::-1], inf_bytes[::-1]
    nan, inf = struct.unpack('dd', nan_bytes + inf_bytes)
    return nan, inf, -inf
# Module-level float constants: NaN, +infinity and -infinity.
NaN, PosInf, NegInf = _floatconstants()
# Literal tokens recognised by the scanner, mapped to their Python values.
# '-Infinity' is listed before 'Infinity'; CONSTANT_RE below is built from
# these keys in iteration order.
_CONSTANTS = {
'null': None,
'true': True,
'false': False,
'-Infinity': NegInf,
'Infinity': PosInf,
'NaN': NaN,
}
# Alternation of the _CONSTANTS keys, captured as group 1.
CONSTANT_RE = re.compile('(%s)' % '|'.join(_CONSTANTS))
# Three groups: integer part (optional '-', then 0x-prefixed hex digits or
# decimal digits), optional '.fraction', optional exponent.
NUMBER_RE = re.compile(r'(-?(?:0x[\da-fA-F]+|\d+))(\.\d+)?([eE][-+]?\d+)?')
# A run made only of digits, '.', arithmetic/bitwise operators, parentheses,
# '[' and the letters e/E/x; DirtyJSONLoader.scan() passes such a run to
# eval() when the text is not a plain number.
EQUATION_RE = re.compile(r'[0-9.+\-]*[()[0-9+\-*/eEx&|]+')
# One chunk of a string body: lazily matched text (group 1) followed by a
# terminator (group 2) that is the closing quote, a backslash, or a control
# character in the range \x00-\x1f.
STRINGCHUNK_DOUBLEQUOTE = re.compile(r'(.*?)(["\\\x00-\x1f])')
STRINGCHUNK_SINGLEQUOTE = re.compile(r"(.*?)(['\\\x00-\x1f])")
# Unquoted object key: one or more word characters, '_' or '$'.
UNQUOTED_KEYNAME = re.compile(r"([\w_$]+[\w\d_$]*)")
# Characters treated as insignificant whitespace between tokens.
WHITESPACE_STR = ' \t\n\r'
# Zero or more whitespace characters; the pattern can match the empty
# string, so .match() always succeeds.
WHITESPACE = re.compile('[%s]*' % WHITESPACE_STR, re.VERBOSE | re.MULTILINE | re.DOTALL)
# Single-character escape sequences: the character following a backslash
# mapped to the text it stands for.
BACKSLASH = {
'"': u('"'), '\'': u('\''), '\\': u('\u005c'), '/': u('/'),
'b': u('\b'), 'f': u('\f'), 'n': u('\n'), 'r': u('\r'), 't': u('\t'),
}
# Encoding assumed when the caller does not pass one to DirtyJSONLoader.
DEFAULT_ENCODING = "utf-8"
class Position(object):
    """A 1-based (line, column) location inside the parsed document.

    Positions order first by line and then by column.
    """

    def __init__(self, line, column):
        self.line = line
        self.column = column

    def __repr__(self):
        return '%s(line=%r, column=%r)' % (
            type(self).__name__, self.line, self.column)

    def __lt__(self, other):
        """Return True when this position comes strictly before *other*.

        Any object exposing ``line`` and ``column`` attributes is accepted.
        For anything else ``NotImplemented`` is returned so that Python can
        try the reflected comparison or raise ``TypeError``, rather than
        leaking an ``AttributeError`` out of the comparison.
        """
        try:
            other_key = (other.line, other.column)
        except AttributeError:
            return NotImplemented
        # Tuple comparison is lexicographic: line first, column breaks ties.
        return (self.line, self.column) < other_key
class KeyValuePosition(object):
    """Pair of locations recorded for one object member.

    Attributes are ``key`` (where the member's key was found) and ``value``
    (where its value was found); both are stored exactly as passed in.
    """

    def __init__(self, key_position, value_position):
        self.key, self.value = key_position, value_position
class DirtyJSONLoader(object):
    """JSON decoder that can handle muck in the file

    Performs the following translations in decoding by default:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | AttributedDict    |
    +---------------+-------------------+
    | array         | AttributedList    |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
    their corresponding ``float`` values, which is outside the JSON spec.

    Beyond strict JSON the scanner accepts single-quoted strings, unquoted
    object keys, trailing commas, ``//`` and ``/* */`` comments, hex and
    legacy-octal integers, and simple arithmetic expressions.
    """

    def __init__(self, content, encoding=None, parse_float=None, parse_int=None,
                 parse_constant=None):
        """Prepare a loader for *content*.

        :param content: the document text; on Python 3 a bytes object is
            decoded with the default codec first.
        :param encoding: when given and different from ``DEFAULT_ENCODING``,
            every character of the document must be representable in it.
        :param parse_float: callable applied to the text of each real number
            (default ``float``).
        :param parse_int: callable applied to each integer value
            (default ``int``).
        :param parse_constant: callable applied to each constant token
            (default: lookup in ``_CONSTANTS``).
        :raises Error: if the document contains a character that cannot be
            represented in *encoding*.
        """
        self.encoding = encoding or DEFAULT_ENCODING
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        self.memo = {}
        if not PY2 and isinstance(content, binary_type):
            self.content = content.decode()
        else:
            self.content = content
        if self.encoding != DEFAULT_ENCODING:
            # Round-trip through the requested encoding, silently dropping
            # whatever it cannot represent; any difference pinpoints the
            # first offending character.  The round trip must be decoded with
            # the same codec it was encoded with.
            fixed = self.content.encode(self.encoding, 'ignore').decode(self.encoding)
            if self.content != fixed:
                for index, character in enumerate(self.content):
                    # ``fixed`` is shorter than the content when characters
                    # were dropped, so guard the index before comparing.
                    if index >= len(fixed) or character != fixed[index]:
                        raise Error('Non-{} character {}'.format(self.encoding, ascii(character)), self.content, index)
        self.end = len(self.content)
        self.lineno = 1
        self.current_line_pos = 0
        self.pos = 0
        self.expecting = 'Expecting value'

    def _next_character(self):
        """Consume and return the character at the cursor.

        :raises Error: (with the current ``expecting`` message) at end of
            input.
        """
        try:
            nextchar = self.content[self.pos]
            self.pos += 1
            return nextchar
        except IndexError:
            raise Error(self.expecting, self.content, self.pos)

    def _next_character_after_whitespace(self):
        """Skip whitespace and comments, then consume one character.

        Returns the consumed character, or ``''`` (consuming nothing) when
        the end of input is reached.
        """
        try:
            nextchar = self.content[self.pos]
            # A '/' may open a comment that directly follows a delimiter
            # (e.g. ``[1,//note``); _skip_whitespace() leaves the cursor
            # untouched when it turns out not to be one.
            if nextchar in WHITESPACE_STR or nextchar == '/':
                self._skip_whitespace()
                nextchar = self.content[self.pos]
            self.pos += 1
            return nextchar
        except IndexError:
            return ''

    def _skip_whitespace(self):
        """Advance the cursor past whitespace, ``//`` and ``/* */`` comments.

        An unterminated comment extends to the end of the input.
        """
        while True:
            self._skip_forward_to(WHITESPACE.match(self.content, self.pos).end())
            # Fewer than two characters left: no room for a comment opener.
            if self.pos > self.end - 2:
                break
            two_chars = self.content[self.pos:self.pos + 2]
            if two_chars == '//':
                terminator = '\n'
            elif two_chars == '/*':
                terminator = '*/'
            else:
                break
            # str.find() returns -1 when the terminator is missing, whereas
            # str.index() would raise ValueError.  The search starts after
            # the opener so that '/*/' is not taken for a complete comment.
            found = self.content.find(terminator, self.pos + 2)
            if found < 0:
                self._skip_forward_to(self.end)
                break
            self._skip_forward_to(found + len(terminator))

    def _skip_forward_to(self, end):
        """Move the cursor to *end*, updating the line bookkeeping."""
        if end != self.pos:
            linefeeds = self.content.count('\n', self.pos, end)
            if linefeeds:
                self.lineno += linefeeds
                # Remember where the current line starts so that columns can
                # be computed in _current_position().
                rpos = self.content.rfind('\n', self.pos, end)
                self.current_line_pos = rpos + 1
            self.pos = end

    def _current_position(self, offset=0):
        """Return the cursor as a 1-based ``Position``, shifted by *offset*
        columns."""
        return Position(self.lineno, self.pos - self.current_line_pos + 1 + offset)

    def scan(self):
        """Parse and return the single value starting at the cursor.

        Tries, in order: string, object, array, constant token, number,
        arithmetic expression.

        :raises Error: if no value can be parsed at the cursor.
        """
        self.expecting = 'Expecting value'
        nextchar = self._next_character()
        if nextchar == '"' or nextchar == "'":
            return self.parse_string(nextchar)
        if nextchar == '{':
            return self.parse_object()
        if nextchar == '[':
            return self.parse_array()
        # Not a bracketed value: put the character back and try the regexes.
        self.pos -= 1
        m = CONSTANT_RE.match(self.content, self.pos)
        if m:
            self.pos = m.end()
            return self.parse_constant(m.groups()[0])
        m = NUMBER_RE.match(self.content, self.pos)
        # A number immediately followed by an operator or parenthesis is the
        # start of an expression and is left for EQUATION_RE below.
        if m and (m.end() == len(self.content) or self.content[m.end()] not in '+-/*()'):
            integer, frac, exp = m.groups()
            if frac or exp:
                res = self.parse_float(integer + (frac or '') + (exp or ''))
            else:
                try:
                    res = self.parse_int(int(integer, 0))
                except ValueError:
                    # int(..., 0) rejects legacy octal such as '012'; retry
                    # with an explicit '0o' prefix, keeping any leading sign.
                    sign = '-' if integer.startswith('-') else ''
                    digits = integer[len(sign):]
                    if not digits.startswith('0'):
                        raise
                    res = self.parse_int(int(sign + '0o' + digits[1:], 0))
            self.pos = m.end()
            return res
        m = EQUATION_RE.match(self.content, self.pos)
        if m:
            expression = m.group(0)
            # SECURITY NOTE(review): eval() runs on document text.
            # EQUATION_RE restricts it to digits, operators, parentheses,
            # '[' and the letters e/E/x, and the empty namespaces below keep
            # it away from module state, but an expression such as
            # ``9**9**9**9`` can still exhaust CPU or memory.  Do not feed
            # untrusted documents to this loader without an external limit.
            try:
                res = eval(expression, {'__builtins__': {}}, {})
            except (SyntaxError, NameError, TypeError, ValueError,
                    ZeroDivisionError, OverflowError):
                raise Error('Cannot evaluate expression', self.content, self.pos)
            self.pos = m.end()
            return res
        raise Error(self.expecting, self.content, self.pos)

    def parse_string(self, terminating_character,
                     _b=BACKSLASH, _join=u('').join,
                     _py2=PY2, _maxunicode=sys.maxunicode):
        """Parse the string body that starts at the cursor.

        The cursor must be on the first character after the opening quote;
        *terminating_character* is that quote (``"`` or ``'``).  Escape
        sequences are unescaped and literal control characters are kept.

        Returns the decoded string and leaves the cursor just after the
        closing quote.

        :raises Error: on an unterminated string or an invalid escape.
        """
        _m = STRINGCHUNK_DOUBLEQUOTE.match if terminating_character == '"' else STRINGCHUNK_SINGLEQUOTE.match
        chunks = []
        _append = chunks.append
        begin = self.pos - 1
        while 1:
            chunk = _m(self.content, self.pos)
            if chunk is None:
                raise Error(
                    "Unterminated string starting at", self.content, begin)
            self.pos = chunk.end()
            content, terminator = chunk.groups()
            # Content contains zero or more unescaped string characters
            if content:
                if _py2 and not isinstance(content, text_type):
                    content = text_type(content, self.encoding)
                _append(content)
            # Terminator is the end of string, a literal control character,
            # or a backslash denoting that an escape sequence follows
            if terminator == terminating_character:
                break
            elif terminator != '\\':
                _append(terminator)
                continue
            try:
                esc = self.content[self.pos]
            except IndexError:
                raise Error(
                    "Unterminated string starting at", self.content, begin)
            # If not a unicode escape sequence, must be in the lookup table
            if esc != 'u':
                try:
                    char = _b[esc]
                except KeyError:
                    # NOTE(review): the '%r' placeholder is passed through
                    # unformatted; presumably Error substitutes it -- confirm
                    # against dirtyjson.error.
                    msg = "Invalid \\X escape sequence %r"
                    raise Error(msg, self.content, self.pos)
                self.pos += 1
            else:
                # Unicode escape sequence
                msg = "Invalid \\uXXXX escape sequence"
                esc = self.content[self.pos + 1:self.pos + 5]
                esc_x = esc[1:2]
                # int(..., 16) would accept an '0x' prefix, so reject it here.
                if len(esc) != 4 or esc_x == 'x' or esc_x == 'X':
                    raise Error(msg, self.content, self.pos - 1)
                try:
                    uni = int(esc, 16)
                except ValueError:
                    raise Error(msg, self.content, self.pos - 1)
                self.pos += 5
                # Check for surrogate pair on UCS-4 systems
                # Note that this will join high/low surrogate pairs
                # but will also pass unpaired surrogates through
                if _maxunicode > 65535 and uni & 0xfc00 == 0xd800 and self.content[self.pos:self.pos + 2] == '\\u':
                    esc2 = self.content[self.pos + 2:self.pos + 6]
                    esc_x = esc2[1:2]
                    if len(esc2) == 4 and not (esc_x == 'x' or esc_x == 'X'):
                        try:
                            uni2 = int(esc2, 16)
                        except ValueError:
                            raise Error(msg, self.content, self.pos)
                        if uni2 & 0xfc00 == 0xdc00:
                            uni = 0x10000 + (((uni - 0xd800) << 10) |
                                             (uni2 - 0xdc00))
                            self.pos += 6
                char = unichr(uni)
            # Append the unescaped character
            _append(char)
        return _join(chunks)

    def parse_object(self):
        """Parse an object whose opening ``{`` has already been consumed.

        Keys may be quoted with either quote style or left unquoted, and a
        trailing comma before ``}`` is accepted.  Each member is stored with
        a ``KeyValuePosition`` describing where its key and value start.

        :raises Error: on a missing key, ``:``, ``,`` or ``}``.
        """
        # Reuse one string object for keys that repeat within the document.
        memo_get = self.memo.setdefault
        obj = AttributedDict()
        nextchar = self._next_character_after_whitespace()
        while True:
            # Reached for an empty object and after a trailing comma.
            if nextchar == '}':
                break
            key_pos = self._current_position(-len(nextchar))
            if nextchar == '"' or nextchar == "'":
                key = self.parse_string(nextchar)
            elif nextchar == '':
                # End of input: nothing was consumed, so there is no
                # character at pos - 1 that could start a key.
                raise Error(
                    "Expecting property name",
                    self.content, self.pos)
            else:
                # Unquoted key: re-match from the character just consumed.
                chunk = UNQUOTED_KEYNAME.match(self.content, self.pos - 1)
                if chunk is None:
                    raise Error(
                        "Expecting property name",
                        self.content, self.pos)
                self.pos = chunk.end()
                key = chunk.groups()[0]
            key = memo_get(key, key)
            if self._next_character_after_whitespace() != ':':
                raise Error("Expecting ':' delimiter", self.content, self.pos)
            self._skip_whitespace()
            key_value_pos = KeyValuePosition(key_pos, self._current_position())
            value = self.scan()
            obj.add_with_attributes(key, value, key_value_pos)
            nextchar = self._next_character_after_whitespace()
            if nextchar == '}':
                break
            elif nextchar != ',':
                raise Error("Expecting ',' delimiter or '}'", self.content, self.pos - len(nextchar))
            nextchar = self._next_character_after_whitespace()
        return obj

    def parse_array(self):
        """Parse an array whose opening ``[`` has already been consumed.

        A trailing comma before ``]`` is accepted.  Each element is appended
        together with the ``Position`` at which it starts.

        :raises Error: on a missing value, ``,`` or ``]``.
        """
        values = AttributedList()
        nextchar = self._next_character_after_whitespace()
        # Look-ahead for trivial empty array
        if nextchar == ']':
            return values
        elif nextchar == '':
            raise Error("Expecting value or ']'", self.content, self.pos)
        while True:
            # Reached after a trailing comma.
            if nextchar == ']':
                break
            # Put the look-ahead character back so scan() sees the value
            # from its first character.
            self.pos -= len(nextchar)
            value_pos = self._current_position()
            value = self.scan()
            values.append(value, value_pos)
            nextchar = self._next_character_after_whitespace()
            if nextchar == ']':
                break
            elif nextchar != ',':
                raise Error("Expecting ',' delimiter or ']'", self.content, self.pos - len(nextchar))
            nextchar = self._next_character_after_whitespace()
        return values

    def decode(self, search_for_first_object=False, start_index=0):
        """Return the Python representation of the loaded document.

        :param search_for_first_object: when true, skip ahead to the first
            ``[`` or ``{`` at or after the starting point before parsing.
        :param start_index: offset in the document at which to start.
        :raises Error: if no value can be parsed.
        """
        if start_index:
            self._skip_forward_to(start_index)
        if search_for_first_object:
            i = self.content.find('[', self.pos)
            o = self.content.find('{', self.pos)
            # Pick whichever opener comes first; find() gives -1 for "none".
            if i > o >= self.pos or i < 0:
                i = o
            if i >= self.pos:
                self._skip_forward_to(i)
        self._skip_whitespace()
        obj = self.scan()
        return obj