From 4acff2d749b6957fd183ab1ca7a7798fc18f59a0 Mon Sep 17 00:00:00 2001 From: Rafael Zubairov Date: Tue, 20 Oct 2015 00:35:27 +0300 Subject: [PATCH] Extra parser class to allow instance reuse, memory buffers reuse. --- src/main/java/org/noggit/JSONParserExt.java | 1228 +++++++++++++++++++ 1 file changed, 1228 insertions(+) create mode 100644 src/main/java/org/noggit/JSONParserExt.java diff --git a/src/main/java/org/noggit/JSONParserExt.java b/src/main/java/org/noggit/JSONParserExt.java new file mode 100644 index 0000000..e0925b1 --- /dev/null +++ b/src/main/java/org/noggit/JSONParserExt.java @@ -0,0 +1,1228 @@ +package org.noggit; + + +import java.io.IOException; + +/** + * Class complimentary to the JSONParser, but allowing one to reuse memory buffers, reinit paser, etc. + */ +public class JSONParserExt { + + /** + * Event indicating a JSON string value, including member names of objects + */ + public static final int STRING = 1; + /** + * Event indicating a JSON number value which fits into a signed 64 bit integer + */ + public static final int LONG = 2; + /** + * Event indicating a JSON number value which has a fractional part or an exponent + * and with string length <= 23 chars not including sign. This covers + * all representations of normal values for Double.toString(). + */ + public static final int NUMBER = 3; + /** + * Event indicating a JSON number value that was not produced by toString of any + * Java primitive numerics such as Double or Long. It is either + * an integer outside the range of a 64 bit signed integer, or a floating + * point value with a string representation of more than 23 chars. + */ + public static final int BIGNUMBER = 4; + /** + * Event indicating a JSON boolean + */ + public static final int BOOLEAN = 5; + /** + * Event indicating a JSON null + */ + public static final int NULL = 6; + /** + * Event indicating the start of a JSON object + */ + public static final int OBJECT_START = 7; + /** + * Event indicating the end of a JSON object + */ + public static final int OBJECT_END = 8; + /** + * Event indicating the start of a JSON array + */ + public static final int ARRAY_START = 9; + /** + * Event indicating the end of a JSON array + */ + public static final int ARRAY_END = 10; + /** + * Event indicating the end of input has been reached + */ + public static final int EOF = 11; + /** + * Flags to control parsing behavior + */ + public static final int ALLOW_COMMENTS = 1 << 0; + public static final int ALLOW_SINGLE_QUOTES = 1 << 1; + public static final int ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER = 1 << 2; + public static final int ALLOW_UNQUOTED_KEYS = 1 << 3; + public static final int ALLOW_UNQUOTED_STRING_VALUES = 1 << 4; + /** + * ALLOW_EXTRA_COMMAS causes any nunber of extra commas in arrays and objects to be ignored + * Note that a trailing comma in [] would be [,] (hence calling the feature "trailing" commas + * isn't really correct. Since trailing commas is fundamentally incompatible with any future + * "fill-in-missing-values-with-null", it was decided to extend this feature to handle any + * number of extra commas. + */ + public static final int ALLOW_EXTRA_COMMAS = 1 << 5; + public static final int FLAGS_STRICT = 0; + public static final int FLAGS_DEFAULT = ALLOW_COMMENTS | ALLOW_SINGLE_QUOTES | ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER | ALLOW_UNQUOTED_KEYS | ALLOW_UNQUOTED_STRING_VALUES | ALLOW_EXTRA_COMMAS; + // parser states stored in the stack + private static final byte DID_OBJSTART = 1; // '{' just read + private static final byte DID_ARRSTART = 2; // '[' just read + private static final byte DID_ARRELEM = 3; // array element just read + private static final byte DID_MEMNAME = 4; // object member name (map key) just read + private static final byte DID_MEMVAL = 5; // object member value (map val) just read + private static final CharArr devNull = new NullCharArr(); + private static final long WS_MASK = (1L << ' ') | (1L << '\t') | (1L << '\r') | (1L << '\n') | (1L << '#') | (1L << '/') | (0x01); // set 1 bit so 0xA0 will be flagged as whitespace + private static final int HAS_FRACTION = 0x01; // nstate flag, '.' already read + private static final int HAS_EXPONENT = 0x02; // nstate flag, '[eE][+-]?[0-9]' already read + + char[] buf; // input buffer with JSON text in it + // temporary output buffer + private final CharArr out; + // a dummy buffer we can use to point at other buffers + private final CharArr tmp; + int flags = FLAGS_DEFAULT; + int start; // current position in the buffer + int end; // end position in the buffer (one past last valid index) + boolean eof = false; // true if the end of the stream was reached. + long gpos; // global position = gpos + start + int event; // last event read + int stringTerm; // The terminator for the last string we read: single quote, double quote, or 0 for unterminated. + // We need to keep some state in order to (at a minimum) know if + // we should skip ',' or ':'. + private byte[] stack; + private int ptr = 0; // pointer into the stack of parser states + + // idea - if someone passes us a CharArrayReader, we could + // directly use that buffer as it's protected. + private byte state = 0; // current parser state + // info about value that was just read (or is in the middle of being read) + private int valstate; + private boolean bool; // bo-olean value read + private long lval; // long value read + private int nstate; // current state while reading a number + + public JSONParserExt(char[] data, int start, int end, byte[] stack, CharArr localBuffer, CharArr tmp) { + this.stack = stack; + this.out = localBuffer; + this.tmp = tmp; + + initBase(data, start, end); + } + + public void init(char[] data, int start, int end) { + initBase(data, start, end); + } + + private void initBase(char[] data, int start, int end) { + this.buf = data; + this.flags = FLAGS_DEFAULT; + this.start = start; + this.end = end; + this.eof = false; + this.gpos = 0; + this.event = 0; + this.stringTerm = 0; + this.ptr = 0; // pointer into the stack of parser states + + // idea - if someone passes us a CharArrayReader, we could + // directly use that buffer as it's protected. + this.state = 0; // current parser state + // info about value that was just read (or is in the middle of being read) + this.valstate = 0; + this.bool = false; + this.lval = 0; + this.nstate = 0; + } + + public static String getEventString(int e) { + switch (e) { + case STRING: + return "STRING"; + case LONG: + return "LONG"; + case NUMBER: + return "NUMBER"; + case BIGNUMBER: + return "BIGNUMBER"; + case BOOLEAN: + return "BOOLEAN"; + case NULL: + return "NULL"; + case OBJECT_START: + return "OBJECT_START"; + case OBJECT_END: + return "OBJECT_END"; + case ARRAY_START: + return "ARRAY_START"; + case ARRAY_END: + return "ARRAY_END"; + case EOF: + return "EOF"; + } + return "Unknown: " + e; + } + + /** + * Returns true if the given character is considered to be whitespace. + * One difference between Java's Character.isWhitespace() is that this method + * considers a hard space (non-breaking space, or nbsp) to be whitespace. + */ + protected static final boolean isWhitespace(int ch) { + return (Character.isWhitespace(ch) || ch == 0x00a0); + } + + private static boolean isUnquotedStringStart(int ch) { + return Character.isJavaIdentifierStart(ch); + } + + // What characters are allowed to continue an unquoted string + // once we know we are in one. + private static boolean isUnquotedStringChar(int ch) { + return Character.isJavaIdentifierPart(ch) + || ch == '.' + || ch == '-' + || ch == '/'; + + // would checking for a-z first speed up the common case? + + // possibly much more liberal unquoted string handling... + /*** + switch (ch) { + case -1: + case ' ': + case '\t': + case '\r': + case '\n': + case '}': + case ']': + case ',': + case ':': + case '=': // reserved for future use + case '\\': // check for backslash should come after this function call + return false; + } + return true; + ***/ + } + + public int getFlags() { + return flags; + } + + public int setFlags(int flags) { + int oldFlags = flags; + this.flags = flags; + return oldFlags; + } + + // push current parser state (use at start of new container) + private final void push() { + if (ptr >= stack.length) { + // doubling here is probably overkill, but anything that needs to double more than + // once (32 levels deep) is very atypical anyway. + byte[] newstack = new byte[stack.length << 1]; + System.arraycopy(stack, 0, newstack, 0, stack.length); + stack = newstack; + } + stack[ptr++] = state; + } + + // pop parser state (use at end of container) + private final void pop() { + if (--ptr < 0) { + throw err("Unbalanced container"); + } else { + state = stack[ptr]; + } + } + + protected void fill() throws IOException { + if (start >= end) eof = true; + } + + private void getMore() throws IOException { + fill(); + if (start >= end) { + throw err(null); + } + } + + protected int getChar() throws IOException { + if (start >= end) { + fill(); + if (start >= end) return -1; + } + return buf[start++]; + } + + protected int getCharNWS() throws IOException { + for (; ; ) { + int ch = getChar(); + // getCharNWS is normally called in the context of expecting certain JSON special characters + // such as ":}"]," + // all of these characters are below 64 (including comment chars '/' and '#', so we can make this the fast path + // even w/o checking the range first. We'll only get some false-positives while using bare strings (chars "IJMc") + if (((WS_MASK >> ch) & 0x01) == 0) { + return ch; + } else if (ch <= ' ') { // this will only be true if one of the whitespace bits was set + continue; + } else if (ch == '/') { + getSlashComment(); + } else if (ch == '#') { + getNewlineComment(); + } else if (!isWhitespace(ch)) { // we'll only reach here with certain bare strings, errors, or strange whitespace like 0xa0 + return ch; + } + + /*** + // getCharNWS is normally called in the context of expecting certain JSON special characters + // such as ":}"]," + // all of these characters are below 64 (including comment chars '/' and '#', so we can make this the fast path + if (ch < 64) { + if (((WS_MASK >> ch) & 0x01) == 0) return ch; + if (ch <= ' ') continue; // whitespace below a normal space + if (ch=='/') { + getSlashComment(); + } else if (ch=='#') { + getNewlineComment(); + } + } else if (!isWhitespace(ch)) { // check for higher whitespace like 0xA0 + return ch; + } + ***/ + + /** older code + switch (ch) { + case ' ' : + case '\t' : + case '\r' : + case '\n' : + continue outer; + case '#' : + getNewlineComment(); + continue outer; + case '/' : + getSlashComment(); + continue outer; + default: + return ch; + } + **/ + } + } + + protected int getCharNWS(int ch) throws IOException { + for (; ; ) { + // getCharNWS is normally called in the context of expecting certain JSON special characters + // such as ":}"]," + // all of these characters are below 64 (including comment chars '/' and '#', so we can make this the fast path + // even w/o checking the range first. We'll only get some false-positives while using bare strings (chars "IJMc") + if (((WS_MASK >> ch) & 0x01) == 0) { + return ch; + } else if (ch <= ' ') { // this will only be true if one of the whitespace bits was set + // whitespace... get new char at bottom of loop + } else if (ch == '/') { + getSlashComment(); + } else if (ch == '#') { + getNewlineComment(); + } else if (!isWhitespace(ch)) { // we'll only reach here with certain bare strings, errors, or strange whitespace like 0xa0 + return ch; + } + ch = getChar(); + } + } + + private int getCharExpected(int expected) throws IOException { + for (; ; ) { + int ch = getChar(); + if (ch == expected) return expected; + if (ch == ' ') continue; + return getCharNWS(ch); + } + } + + protected void getNewlineComment() throws IOException { + // read a # or a //, so go until newline + for (; ; ) { + int ch = getChar(); + // don't worry about DOS /r/n... we'll stop on the \r and let the rest of the whitespace + // eater consume the \n + if (ch == '\n' || ch == '\r' || ch == -1) { + return; + } + } + } + + protected void getSlashComment() throws IOException { + int ch = getChar(); + if (ch == '/') { + getNewlineComment(); + return; + } + + if (ch != '*') { + throw err("Invalid comment: expected //, /*, or #"); + } + + ch = getChar(); + for (; ; ) { + if (ch == '*') { + ch = getChar(); + if (ch == '/') { + return; + } else if (ch == '*') { + // handle cases of *******/ + continue; + } + } + if (ch == -1) { + return; + } + ch = getChar(); + } + } + + private boolean matchBareWord(char[] arr) throws IOException { + for (int i = 1; i < arr.length; i++) { + int ch = getChar(); + if (ch != arr[i]) { + if ((flags & ALLOW_UNQUOTED_STRING_VALUES) == 0) { + throw err("Expected " + new String(arr)); + } else { + stringTerm = 0; + out.reset(); + out.write(arr, 0, i); + if (!eof) { + start--; + } + return false; + } + } + } + + // if we don't allow bare strings, we don't need to check that the string actually terminates... just + // let things fail as the parser tries to continue + if ((flags & ALLOW_UNQUOTED_STRING_VALUES) == 0) { + return true; + } + + // check that the string actually terminates... for example trueX should return false + int ch = getChar(); + if (eof) { + return true; + } else if (!isUnquotedStringChar(ch)) { + start--; + return true; + } + + // we encountered something like "trueX" when matching "true" + stringTerm = 0; + out.reset(); + out.unsafeWrite(arr, 0, arr.length); + out.unsafeWrite(ch); + return false; + } + + protected ParseException err(String msg) { + // We can't tell if EOF was hit by comparing start<=end + // because the illegal char could have been the last in the buffer + // or in the stream. To deal with this, the "eof" var was introduced + if (!eof && start > 0) start--; // backup one char + String chs = "char=" + ((start >= end) ? "(EOF)" : "" + buf[start]); + String pos = "position=" + (gpos + start); + String tot = chs + ',' + pos + getContext(); + if (msg == null) { + if (start >= end) msg = "Unexpected EOF"; + else msg = "JSON Parse Error"; + } + return new ParseException(msg + ": " + tot); + } + + private String getContext() { + String context = ""; + if (start >= 0) { + context += " BEFORE='" + errEscape(Math.max(start - 60, 0), start + 1) + "'"; + } + if (start < end) { + context += " AFTER='" + errEscape(start + 1, start + 40) + "'"; + } + return context; + } + + private String errEscape(int a, int b) { + b = Math.min(b, end); + if (a >= b) return ""; + return new String(buf, a, b - a).replaceAll("\\s+", " "); + } + + /** + * Returns the long read... only significant if valstate==LONG after + * this call. firstChar should be the first numeric digit read. + */ + private long readNumber(int firstChar, boolean isNeg) throws IOException { + out.unsafeWrite(firstChar); // unsafe OK since we know output is big enough + // We build up the number in the negative plane since it's larger (by one) than + // the positive plane. + long v = '0' - firstChar; + // can't overflow a long in 18 decimal digits (i.e. 17 additional after the first). + // we also need 22 additional to handle double so we'll handle in 2 separate loops. + int i; + for (i = 0; i < 17; i++) { + int ch = getChar(); + // TODO: is this switch faster as an if-then-else? + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + v = v * 10 - (ch - '0'); + out.unsafeWrite(ch); + continue; + case '.': + out.unsafeWrite('.'); + valstate = readFrac(out, 22 - i); + return 0; + case 'e': + case 'E': + out.unsafeWrite(ch); + nstate = 0; + valstate = readExp(out, 22 - i); + return 0; + default: + // return the number, relying on nextEvent() to return an error + // for invalid chars following the number. + if (ch != -1) --start; // push back last char if not EOF + + valstate = LONG; + return isNeg ? v : -v; + } + } + + // after this, we could overflow a long and need to do extra checking + boolean overflow = false; + long maxval = isNeg ? Long.MIN_VALUE : -Long.MAX_VALUE; + + for (; i < 22; i++) { + int ch = getChar(); + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (v < (0x8000000000000000L / 10)) overflow = true; // can't multiply by 10 w/o overflowing + v *= 10; + int digit = ch - '0'; + if (v < maxval + digit) overflow = true; // can't add digit w/o overflowing + v -= digit; + out.unsafeWrite(ch); + continue; + case '.': + out.unsafeWrite('.'); + valstate = readFrac(out, 22 - i); + return 0; + case 'e': + case 'E': + out.unsafeWrite(ch); + nstate = 0; + valstate = readExp(out, 22 - i); + return 0; + default: + // return the number, relying on nextEvent() to return an error + // for invalid chars following the number. + if (ch != -1) --start; // push back last char if not EOF + + valstate = overflow ? BIGNUMBER : LONG; + return isNeg ? v : -v; + } + } + + + nstate = 0; + valstate = BIGNUMBER; + return 0; + } + + // read digits right of decimal point + private int readFrac(CharArr arr, int lim) throws IOException { + nstate = HAS_FRACTION; // deliberate set instead of '|' + while (--lim >= 0) { + int ch = getChar(); + if (ch >= '0' && ch <= '9') { + arr.write(ch); + } else if (ch == 'e' || ch == 'E') { + arr.write(ch); + return readExp(arr, lim); + } else { + if (ch != -1) start--; // back up + return NUMBER; + } + } + return BIGNUMBER; + } + + // call after 'e' or 'E' has been seen to read the rest of the exponent + private int readExp(CharArr arr, int lim) throws IOException { + nstate |= HAS_EXPONENT; + int ch = getChar(); + lim--; + + if (ch == '+' || ch == '-') { + arr.write(ch); + ch = getChar(); + lim--; + } + + // make sure at least one digit is read. + if (ch < '0' || ch > '9') { + throw err("missing exponent number"); + } + arr.write(ch); + + return readExpDigits(arr, lim); + } + + // continuation of readExpStart + private int readExpDigits(CharArr arr, int lim) throws IOException { + while (--lim >= 0) { + int ch = getChar(); + if (ch >= '0' && ch <= '9') { + arr.write(ch); + } else { + if (ch != -1) start--; // back up + return NUMBER; + } + } + return BIGNUMBER; + } + + private void continueNumber(CharArr arr) throws IOException { + if (arr != out) arr.write(out); + + if ((nstate & HAS_EXPONENT) != 0) { + readExpDigits(arr, Integer.MAX_VALUE); + return; + } + if (nstate != 0) { + readFrac(arr, Integer.MAX_VALUE); + return; + } + + for (; ; ) { + int ch = getChar(); + if (ch >= '0' && ch <= '9') { + arr.write(ch); + } else if (ch == '.') { + arr.write(ch); + readFrac(arr, Integer.MAX_VALUE); + return; + } else if (ch == 'e' || ch == 'E') { + arr.write(ch); + readExp(arr, Integer.MAX_VALUE); + return; + } else { + if (ch != -1) start--; + return; + } + } + } + + private int hexval(int hexdig) { + if (hexdig >= '0' && hexdig <= '9') { + return hexdig - '0'; + } else if (hexdig >= 'A' && hexdig <= 'F') { + return hexdig + (10 - 'A'); + } else if (hexdig >= 'a' && hexdig <= 'f') { + return hexdig + (10 - 'a'); + } + throw err("invalid hex digit"); + } + + // backslash has already been read when this is called + private char readEscapedChar() throws IOException { + int ch = getChar(); + switch (ch) { + case '"': + return '"'; + case '\'': + return '\''; + case '\\': + return '\\'; + case '/': + return '/'; + case 'n': + return '\n'; + case 'r': + return '\r'; + case 't': + return '\t'; + case 'f': + return '\f'; + case 'b': + return '\b'; + case 'u': + return (char) ( + (hexval(getChar()) << 12) + | (hexval(getChar()) << 8) + | (hexval(getChar()) << 4) + | (hexval(getChar()))); + } + if ((flags & ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER) != 0 && ch != EOF) { + return (char) ch; + } + throw err("Invalid character escape"); + } + + private CharArr readStringChars() throws IOException { + if (stringTerm == 0) { + // "out" will already contain the first part of the bare string, so don't reset it + readStringBare(out); + return out; + } + + char terminator = (char) stringTerm; + int i; + for (i = start; i < end; i++) { + char c = buf[i]; + if (c == terminator) { + tmp.set(buf, start, i); // directly use input buffer + start = i + 1; // advance past last '"' + return tmp; + } else if (c == '\\') { + break; + } + } + out.reset(); + readStringChars2(out, i); + return out; + } + + // middle is the pointer to the middle of a buffer to start scanning for a non-string + // character ('"' or "/"). start<=middle= end) { + arr.write(buf, start, middle - start); + start = middle; + getMore(); + middle = start; + } + int ch = buf[middle++]; + if (ch == terminator) { + int len = middle - start - 1; + if (len > 0) arr.write(buf, start, len); + start = middle; + return; + } else if (ch == '\\') { + int len = middle - start - 1; + if (len > 0) arr.write(buf, start, len); + start = middle; + arr.write(readEscapedChar()); + middle = start; + } + } + } + + private void readStringBare(CharArr arr) throws IOException { + if (arr != out) { + arr.append(out); + } + + for (; ; ) { + int ch = getChar(); + if (!isUnquotedStringChar(ch)) { + if (ch == -1) break; + if (ch == '\\') { + arr.write(readEscapedChar()); + continue; + } + start--; + break; + } + + if (ch == '\\') { + arr.write(readEscapedChar()); + continue; + } + + arr.write(ch); + } + } + + // isName==true if this is a field name (as opposed to a value) + private void handleNonDoubleQuoteString(int ch, boolean isName) throws IOException { + if (ch == '\'') { + stringTerm = ch; + if ((flags & ALLOW_SINGLE_QUOTES) == 0) { + throw err("Single quoted strings not allowed"); + } + } else { + if (isName && (flags & ALLOW_UNQUOTED_KEYS) == 0 + || !isName && (flags & ALLOW_UNQUOTED_STRING_VALUES) == 0 + || eof) { + if (isName) { + throw err("Expected quoted string"); + } else { + throw err(null); + } + } + + if (!isUnquotedStringStart(ch)) { + throw err(null); + } + + stringTerm = 0; // signal for unquoted string + out.reset(); + out.unsafeWrite(ch); + } + } + + /** + * alternate implementation + * // middle is the pointer to the middle of a buffer to start scanning for a non-string + * // character ('"' or "/"). start<=middle + * arr.write(buf,start,middle-start); + * if (middle>=end) { + * getMore(); + * middle=start; + * } else { + * start = middle+1; // set buffer pointer to correct spot + * if (ch=='"') { + * valstate=0; + * return; + * } else if (ch=='\\') { + * arr.write(readEscapedChar()); + * if (start>=end) getMore(); + * middle=start; + * } + * } + * } + * } + * * + */ + + + // return the next event when parser is in a neutral state (no + // map separators or array element separators to read + private int next(int ch) throws IOException { + // TODO: try my own form of indirect jump... look up char class and index directly into handling implementation? + for (; ; ) { + switch (ch) { + case ' ': // this is not the exclusive list of whitespace chars... the rest are handled in default: + case '\t': + case '\r': + case '\n': + ch = getCharNWS(); // calling getCharNWS here seems faster than letting the switch handle it + break; + case '"': + stringTerm = '"'; + valstate = STRING; + return STRING; + case '\'': + if ((flags & ALLOW_SINGLE_QUOTES) == 0) { + throw err("Single quoted strings not allowed"); + } + stringTerm = '\''; + valstate = STRING; + return STRING; + case '{': + push(); + state = DID_OBJSTART; + return OBJECT_START; + case '[': + push(); + state = DID_ARRSTART; + return ARRAY_START; + case '0': + out.reset(); + //special case '0'? If next char isn't '.' val=0 + ch = getChar(); + if (ch == '.') { + start--; + ch = '0'; + readNumber('0', false); + return valstate; + } else if (ch > '9' || ch < '0') { + out.unsafeWrite('0'); + if (ch != -1) start--; + lval = 0; + valstate = LONG; + return LONG; + } else { + throw err("Leading zeros not allowed"); + } + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + out.reset(); + lval = readNumber(ch, false); + return valstate; + case '-': + out.reset(); + out.unsafeWrite('-'); + ch = getChar(); + if (ch < '0' || ch > '9') throw err("expected digit after '-'"); + lval = readNumber(ch, true); + return valstate; + case 't': + // TODO: test performance of this non-branching inline version. + // if ((('r'-getChar())|('u'-getChar())|('e'-getChar())) != 0) throw err(""); + if (matchBareWord(JSONUtil.TRUE_CHARS)) { + bool = true; + valstate = BOOLEAN; + return valstate; + } else { + valstate = STRING; + return STRING; + } + case 'f': + if (matchBareWord(JSONUtil.FALSE_CHARS)) { + bool = false; + valstate = BOOLEAN; + return valstate; + } else { + valstate = STRING; + return STRING; + } + case 'n': + if (matchBareWord(JSONUtil.NULL_CHARS)) { + valstate = NULL; + return valstate; + } else { + valstate = STRING; + return STRING; + } + case '/': + getSlashComment(); + ch = getChar(); + break; + case '#': + getNewlineComment(); + ch = getChar(); + break; + case ']': // This only happens with a trailing comma (or an error) + if (state != DID_ARRELEM || (flags & ALLOW_EXTRA_COMMAS) == 0) { + throw err("Unexpected array closer ]"); + } + pop(); + return event = ARRAY_END; + case '}': // This only happens with a trailing comma (or an error) + if (state != DID_MEMVAL || (flags & ALLOW_EXTRA_COMMAS) == 0) { + throw err("Unexpected object closer }"); + } + pop(); + return event = ARRAY_END; + case ',': // This only happens with input like [1,] + if ((state != DID_ARRELEM && state != DID_MEMVAL) || (flags & ALLOW_EXTRA_COMMAS) == 0) { + throw err("Unexpected comma"); + } + ch = getChar(); + break; + case -1: + if (getLevel() > 0) throw err("Premature EOF"); + return EOF; + default: + // Handle unusual unicode whitespace like no-break space (0xA0) + if (isWhitespace(ch)) { + ch = getChar(); // getCharNWS() would also work + break; + } + handleNonDoubleQuoteString(ch, false); + valstate = STRING; + return STRING; + // throw err(null); + } + + } + } + + @Override + public String toString() { + return "start=" + start + ",end=" + end + ",state=" + state + "valstate=" + valstate; + } + + /** + * Returns the next event encountered in the JSON stream, one of + *
    + *
  • {@link #STRING}
  • + *
  • {@link #LONG}
  • + *
  • {@link #NUMBER}
  • + *
  • {@link #BIGNUMBER}
  • + *
  • {@link #BOOLEAN}
  • + *
  • {@link #NULL}
  • + *
  • {@link #OBJECT_START}
  • + *
  • {@link #OBJECT_END}
  • + *
  • {@link #OBJECT_END}
  • + *
  • {@link #ARRAY_START}
  • + *
  • {@link #ARRAY_END}
  • + *
  • {@link #EOF}
  • + *
+ */ + public int nextEvent() throws IOException { + if (valstate != 0) { + if (valstate == STRING) { + readStringChars2(devNull, start); + } else if (valstate == BIGNUMBER) { + continueNumber(devNull); + } + valstate = 0; + } + + int ch; + outer: + for (; ; ) { + switch (state) { + case 0: + return event = next(getChar()); + case DID_OBJSTART: + ch = getCharExpected('"'); + if (ch == '}') { + pop(); + return event = OBJECT_END; + } + if (ch == '"') { + stringTerm = ch; + } else if (ch == ',' && (flags & ALLOW_EXTRA_COMMAS) != 0) { + continue outer; + } else { + handleNonDoubleQuoteString(ch, true); + } + state = DID_MEMNAME; + valstate = STRING; + return event = STRING; + case DID_MEMNAME: + ch = getCharExpected(':'); + if (ch != ':') { + throw err("Expected key,value separator ':'"); + } + state = DID_MEMVAL; // set state first because it might be pushed... + return event = next(getChar()); + case DID_MEMVAL: + ch = getCharExpected(','); + if (ch == '}') { + pop(); + return event = OBJECT_END; + } else if (ch != ',') { + throw err("Expected ',' or '}'"); + } + ch = getCharExpected('"'); + if (ch == '"') { + stringTerm = ch; + } else if ((ch == ',' || ch == '}') && (flags & ALLOW_EXTRA_COMMAS) != 0) { + if (ch == ',') continue outer; + pop(); + return event = OBJECT_END; + } else { + handleNonDoubleQuoteString(ch, true); + } + state = DID_MEMNAME; + valstate = STRING; + return event = STRING; + case DID_ARRSTART: + ch = getCharNWS(); + if (ch == ']') { + pop(); + return event = ARRAY_END; + } + state = DID_ARRELEM; // set state first, might be pushed... + return event = next(ch); + case DID_ARRELEM: + ch = getCharExpected(','); + if (ch == ',') { + // state = DID_ARRELEM; // redundant + return event = next(getChar()); + } else if (ch == ']') { + pop(); + return event = ARRAY_END; + } else { + throw err("Expected ',' or ']'"); + } + } + } // end for(;;) + } + + public int lastEvent() { + return event; + } + + public boolean wasKey() { + return state == DID_MEMNAME; + } + + private void goTo(int what) throws IOException { + if (valstate == what) { + valstate = 0; + return; + } + if (valstate == 0) { + /*int ev = */ + nextEvent(); // TODO + if (valstate != what) { + throw err("type mismatch"); + } + valstate = 0; + } else { + throw err("type mismatch"); + } + } + + /** + * Returns the JSON string value, decoding any escaped characters. + */ + public String getString() throws IOException { + return getStringChars().toString(); + } + + /** + * Returns the characters of a JSON string value, decoding any escaped characters. + *

The underlying buffer of the returned CharArr should *not* be + * modified as it may be shared with the input buffer. + *

The returned CharArr will only be valid up until + * the next JSONParser method is called. Any required data should be + * read before that point. + */ + public CharArr getStringChars() throws IOException { + goTo(STRING); + return readStringChars(); + } + + /** + * Reads a JSON string into the output, decoding any escaped characters. + */ + public void getString(CharArr output) throws IOException { + goTo(STRING); + readStringChars2(output, start); + } + + /** + * Reads a number from the input stream and parses it as a long, only if + * the value will in fact fit into a signed 64 bit integer. + */ + public long getLong() throws IOException { + goTo(LONG); + return lval; + } + + /** + * Reads a number from the input stream and parses it as a double + */ + public double getDouble() throws IOException { + return Double.parseDouble(getNumberChars().toString()); + } + + /** + * Returns the characters of a JSON numeric value. + *

The underlying buffer of the returned CharArr should *not* be + * modified as it may be shared with the input buffer. + *

The returned CharArr will only be valid up until + * the next JSONParser method is called. Any required data should be + * read before that point. + */ + public CharArr getNumberChars() throws IOException { + int ev = 0; + if (valstate == 0) ev = nextEvent(); + + if (valstate == LONG || valstate == NUMBER) { + valstate = 0; + return out; + } else if (valstate == BIGNUMBER) { + continueNumber(out); + valstate = 0; + return out; + } else { + throw err("Unexpected " + ev); + } + } + + /** + * Reads a JSON numeric value into the output. + */ + public void getNumberChars(CharArr output) throws IOException { + int ev = 0; + if (valstate == 0) ev = nextEvent(); + if (valstate == LONG || valstate == NUMBER) output.write(this.out); + else if (valstate == BIGNUMBER) { + continueNumber(output); + } else { + throw err("Unexpected " + ev); + } + valstate = 0; + } + + /** + * Reads a boolean value + */ + public boolean getBoolean() throws IOException { + goTo(BOOLEAN); + return bool; + } + + /** + * Reads a null value + */ + public void getNull() throws IOException { + goTo(NULL); + } + + /** + * @return the current nesting level, the number of parent objects or arrays. + */ + public int getLevel() { + return ptr; + } + + public long getPosition() { + return gpos + start; + } + + public static class ParseException extends RuntimeException { + public ParseException(String msg) { + super(msg); + } + } +}