-
Notifications
You must be signed in to change notification settings - Fork 82
fix/json unicode offsets #2659
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
jurgenvinju
wants to merge
27
commits into
main
Choose a base branch
from
fix/json-unicode-offsets
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+227
−77
Open
fix/json unicode offsets #2659
Changes from all commits
Commits
Show all changes
27 commits
Select commit
Hold shift + click to select a range
3aa527d
buffer offset is now compensated for surrogate pairs
jurgenvinju 505e233
Merge branch 'main' into fix/json-unicode-offsets
jurgenvinju 977759a
added test with unicode surrogate pairs
jurgenvinju 5432aea
initial throw at unicode resilient positions during JSON parsing
jurgenvinju fd57b97
minor improvements
jurgenvinju 6c64095
minor comment
jurgenvinju f1b25c8
working to get unicode columns right
jurgenvinju 39a0ef1
minor fix
jurgenvinju d21bc29
fixed line markup in unicode example for testing
jurgenvinju e1be9a1
gettin the off-by-ones under control
jurgenvinju d842105
cleanup, refactoring and documentation, plus corrections
jurgenvinju 31f1a8a
cleanup
jurgenvinju 09bbbb3
working on another bug
jurgenvinju eb402e3
fixed boundary condition for getOffset
jurgenvinju df0a5c4
fixed all tests
jurgenvinju 38e625a
added new failing tests for boundary conditions with unicode origins
jurgenvinju b0ce362
fixed specific unicode offset issues
jurgenvinju 0c29e5a
cleanup debug code
jurgenvinju 9d146dd
cleanup unused handler code
jurgenvinju 0f7162e
added rationale in comment to explain use of offset buffers
jurgenvinju c88eb12
better field names, removed need for comments
jurgenvinju 7c2170c
comments
jurgenvinju cebf2f9
comments
jurgenvinju 8b145cd
Merge branch 'main' into fix/json-unicode-offsets
jurgenvinju ef4e609
Merge branch 'main' into fix/json-unicode-offsets
jurgenvinju 8c6e4c5
forgot to commit
jurgenvinju 43aeda6
Merge branch 'main' into fix/json-unicode-offsets
jurgenvinju File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -83,8 +83,6 @@ public class JsonValueReader { | |
| private final IRascalMonitor monitor; | ||
| private final ISourceLocation src; | ||
| private VarHandle posHandler; | ||
| private VarHandle lineHandler; | ||
| private VarHandle lineStartHandler; | ||
|
|
||
| /* options */ | ||
| private ThreadLocal<SimpleDateFormat> format; | ||
|
|
@@ -96,7 +94,6 @@ public class JsonValueReader { | |
| private IFunction parsers; | ||
| private Map<Type, IValue> nulls = Collections.emptyMap(); | ||
|
|
||
|
|
||
| private final class ExpectedTypeDispatcher implements ITypeVisitor<IValue, IOException> { | ||
| private final JsonReader in; | ||
| private final OriginTrackingReader tracker; | ||
|
|
@@ -485,7 +482,7 @@ private int getOffset() { | |
| try { | ||
| assert posHandler != null; | ||
| var internalPos = (int) posHandler.get(in); | ||
| return tracker.getOffsetAtBufferStart() + internalPos; | ||
| return tracker.getOffsetAtBufferPos(internalPos); | ||
| } | ||
| catch (IllegalArgumentException | SecurityException e) { | ||
| // we stop trying to track positions if it fails so hard, | ||
|
|
@@ -497,16 +494,17 @@ private int getOffset() { | |
|
|
||
| private int getLine() { | ||
| if (stopTracking) { | ||
| return 0; | ||
| return 1; | ||
| } | ||
|
|
||
| try { | ||
| return (int) lineHandler.get(in) + 1; | ||
| var internalPos = (int) posHandler.get(in); | ||
| return tracker.getLineAtBufferPos(internalPos); | ||
| } | ||
| catch (IllegalArgumentException | SecurityException e) { | ||
| // stop trying to recover the positions | ||
| stopTracking = true; | ||
| return 0; | ||
| return 1; | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -523,7 +521,10 @@ private int getCol() { | |
| } | ||
|
|
||
| try { | ||
| return ((int) posHandler.get(in)) - ((int) lineStartHandler.get(in)); | ||
| assert posHandler != null; | ||
| var internalPos = (int) posHandler.get(in); | ||
|
|
||
| return tracker.getColumnAtBufferPos(internalPos); | ||
| } | ||
| catch (IllegalArgumentException | SecurityException e) { | ||
| // stop trying to recover the positions | ||
|
|
@@ -614,12 +615,13 @@ private IValue visitStringAsAbstractData(Type type) throws IOException { | |
| private IValue visitObjectAsAbstractData(Type type) throws IOException { | ||
| Set<Type> alternatives = null; | ||
|
|
||
| int startPos = getOffset() - 1; | ||
| int startPos = Math.max(getOffset() - 1 /* pos cursor is at { */, 0); | ||
| int startLine = getLine(); | ||
| int startCol = getCol() - 1; | ||
|
|
||
| in.beginObject(); | ||
|
|
||
|
|
||
|
|
||
| // use explicit information in the JSON to select and filter constructors from the TypeStore | ||
| // we expect always to have the field _constructor before _type. | ||
| if (explicitConstructorNames || explicitDataTypes) { | ||
|
|
@@ -734,14 +736,14 @@ else if (!explicitDataTypes && "_type".equals(label)) { | |
| } | ||
| } | ||
|
|
||
| int endPos = getOffset() - 1; | ||
| assert endPos > startPos : "offset tracking messed up while stopTracking is " + stopTracking + " and trackOrigins is " + trackOrigins; | ||
|
|
||
| int endPos = Math.max(getOffset() - 1, 0); | ||
| assert endPos > startPos : "offset tracking messed up while stopTracking is " + stopTracking + " and trackOrigins is " + trackOrigins; | ||
| int endLine = getLine(); | ||
| int endCol = getCol() - 1; | ||
|
|
||
| in.endObject(); | ||
|
|
||
| for (int i = 0; i < args.length; i++) { | ||
| if (args[i] == null) { | ||
| throw parseErrorHere( | ||
|
|
@@ -802,12 +804,12 @@ public IValue visitNode(Type type) throws IOException { | |
| return inferNullValue(nulls, type); | ||
| } | ||
|
|
||
| int startPos = getOffset() - 1; | ||
| int startPos = Math.max(getOffset() - 1, 0); | ||
| int startLine = getLine(); | ||
| int startCol = getCol() - 1; | ||
|
|
||
| in.beginObject(); | ||
|
|
||
| Map<String, IValue> kws = new HashMap<>(); | ||
| Map<String, IValue> args = new HashMap<>(); | ||
|
|
||
|
|
@@ -839,7 +841,7 @@ public IValue visitNode(Type type) throws IOException { | |
| } | ||
| } | ||
|
|
||
| int endPos = getOffset() - 1; | ||
| int endPos = Math.max(getOffset() - 1, 0); | ||
| int endLine = getLine(); | ||
| int endCol = getCol() - 1; | ||
|
|
||
|
|
@@ -909,7 +911,7 @@ public IValue visitList(Type type) throws IOException { | |
| } | ||
|
|
||
| IListWriter w = vf.listWriter(); | ||
| getOffset(); | ||
|
|
||
| in.beginArray(); | ||
| while (in.hasNext()) { | ||
| // here we pass label from the higher context | ||
|
|
@@ -922,7 +924,7 @@ public IValue visitList(Type type) throws IOException { | |
| } | ||
|
|
||
| in.endArray(); | ||
| getOffset(); | ||
|
|
||
| return w.done(); | ||
| } | ||
|
|
||
|
|
@@ -980,9 +982,7 @@ public JsonValueReader(IValueFactory vf, TypeStore store, IRascalMonitor monitor | |
| var lookup = MethodHandles.lookup(); | ||
| var privateLookup = MethodHandles.privateLookupIn(JsonReader.class, lookup); | ||
| this.posHandler = privateLookup.findVarHandle(JsonReader.class, "pos", int.class); | ||
| this.lineHandler = privateLookup.findVarHandle(JsonReader.class, "lineNumber", int.class); | ||
| this.lineStartHandler = privateLookup.findVarHandle(JsonReader.class, "lineStart", int.class); | ||
|
|
||
|
|
||
| if (posHandler == null) { | ||
| stopTracking = true; | ||
| } | ||
|
|
@@ -1029,8 +1029,6 @@ public JsonValueReader(IRascalValueFactory vf, TypeStore store, IRascalMonitor m | |
| var lookup = MethodHandles.lookup(); | ||
| var privateLookup = MethodHandles.privateLookupIn(JsonReader.class, lookup); | ||
| this.posHandler = privateLookup.findVarHandle(JsonReader.class, "pos", int.class); | ||
| this.lineHandler = privateLookup.findVarHandle(JsonReader.class, "lineNumber", int.class); | ||
| this.lineStartHandler = privateLookup.findVarHandle(JsonReader.class, "lineStart", int.class); | ||
| } | ||
| catch (NoSuchFieldException | SecurityException | IllegalAccessException e) { | ||
| // we disable the origin tracking if we can not get to the fields | ||
|
|
@@ -1151,72 +1149,161 @@ public IValue read(Reader in, Type expected) throws IOException { | |
| * just enough information, together with internal private fields of JsonReader, to compute Rascal-required | ||
| * offsets. We get only the character offset in the file, at the start of each streamed buffer contents. | ||
| * That should be just enough information to recompute the actual offset of every Json element, using the | ||
| * current position in the buffer (the private field `pos` of JsonReader). | ||
| * current position in the buffer as stored in {@link JsonReader#pos} (private). | ||
| * | ||
| * See the body of {@link JsonReader#fillBuffer(int minimum)} for the contract that we must satisfy and | ||
| * the preconditions we are given at every call to {@link #read(char[], int, int)}. | ||
| */ | ||
| public static class OriginTrackingReader extends FilterReader { | ||
| // offset is always pointing at the point in the file where JsonReader.pos == 0 | ||
| private int offset = 0; | ||
| // limit is always pointing to the amount of no-junk characters in the underlying buffer below buffer.length | ||
| private int limit = 0; | ||
|
|
||
| private int codepointOffset = 0; | ||
| private int codepointColumn = 0; | ||
| private int codepointLine = 1; | ||
|
|
||
| private int surrogatePairs = 0; | ||
| private int surrogatePairsThisLine = 0; | ||
|
|
||
| private int prevBufferLimit = 0; | ||
|
|
||
| private int[] charPosToCodepoints = null; | ||
| private int[] charPosToCodepointColumns = null; | ||
| private int[] charPosToLines = null; | ||
|
|
||
| protected OriginTrackingReader(Reader in) { | ||
| super(in); | ||
| } | ||
|
|
||
| /* This private method from JsonReader must be mirrored by `read` | ||
| private boolean fillBuffer(int minimum) throws IOException { | ||
| char[] buffer = this.buffer; | ||
| lineStart -= pos; | ||
| if (limit != pos) { | ||
| limit -= pos; | ||
| System.arraycopy(buffer, pos, buffer, 0, limit); | ||
| } else { | ||
| limit = 0; | ||
| } | ||
|
|
||
| pos = 0; | ||
| int total; | ||
| while ((total = in.read(buffer, limit, buffer.length - limit)) != -1) { | ||
| limit += total; | ||
|
|
||
| // if this is the first read, consume an optional byte order mark (BOM) if it exists | ||
| if (lineNumber == 0 && lineStart == 0 && limit > 0 && buffer[0] == '\ufeff') { | ||
| pos++; | ||
| lineStart++; | ||
| minimum++; | ||
| } | ||
|
|
||
| if (limit >= minimum) { | ||
| return true; | ||
| } | ||
| } | ||
| return false; | ||
| } */ | ||
| @Override | ||
| public int read(char[] cbuf, int off, int len) throws IOException { | ||
| // Note that `fillBuffer.limit != fillBuffer.pos <==> reader.off != 0`. | ||
| // Moreover, `fillBuffer.limit == reader.off` at the start of this method. | ||
| initializeBuffers(cbuf); | ||
|
|
||
| // `codepoints[prevBufferLimit - 1] - 1` is the offset of the last character read with the previous call to read. | ||
| // So the new codepointOffset starts there. We look back `off` chars because of possible left-overs before the limit. | ||
| codepointOffset += (prevBufferLimit == 0 | ||
| ? 0 | ||
| : charPosToCodepoints[Math.max(0, prevBufferLimit - off - 1)] + 1); | ||
|
|
||
| // we know take the previous limit and add it to the | ||
| // offset, to arrive at the new `pos=0` of `buffer[0]`, | ||
| // rewinding `off` characters which were reused from the previous buffer | ||
| // with System.arraycopy. | ||
| offset += limit - off; | ||
| // The accumlated surrogatePairs is included in the codepointOffset and codepointColumn counters, | ||
| // so we start from scratch again. | ||
| surrogatePairs = 0; | ||
| surrogatePairsThisLine = 0; | ||
|
|
||
| // make sure we are only a facade for the real reader. | ||
| // make sure we are only a transparant facade for the real reader. | ||
| // parameters are mapped one-to-one without mutations. | ||
| var charsRead = in.read(cbuf, off, len); | ||
|
|
||
| // the next buffer[0] offset will be after this increment. | ||
| // Note that `fillBuffer.limit == read.limit` | ||
| limit = off + charsRead; | ||
| // Now we simulate exactly what JsonReader does to `cbuf` on our administration of surrogate pairs. | ||
| // It DOES happen that {@see GsonValueReader} asks for `charPosToCodepoints[0]`. | ||
| shiftRemaindersLeft(off); | ||
|
|
||
| // The next buffer[0] offset will be right after this increment. | ||
| // Note that `GsonReader::fillBuffer.limit == this.prevBufferLimit` | ||
| prevBufferLimit = off + charsRead; | ||
|
|
||
| // and then we can fill our administration of surrogate pairs quickly | ||
| precomputeSurrogatePairCounts(cbuf, off, prevBufferLimit); | ||
|
|
||
| // and return only the number of characters read. | ||
| return charsRead; | ||
| } | ||
|
|
||
| public int getOffsetAtBufferStart() { | ||
| return offset; | ||
| /** | ||
| * The cbuf char buffer may contain "surrogate pairs", where two int16 chars represent | ||
| * one unicode codepoint. We want to count in codepoints so we store here for every | ||
| * character in cbuf what it's codepoint offset is, what its codepoint column is | ||
| * and what its codepoint line is. | ||
| * | ||
| * Later when the JSONValueReader needs to know "current positions", this OriginTrackerReader | ||
| * will have the answers stored in its buffers. | ||
| * | ||
| * @param cbuf buffer to detect surrogate pairs in | ||
| * @param off where we left off the last time | ||
| * @param limit until which index the buffer is filled | ||
| */ | ||
| private void precomputeSurrogatePairCounts(char[] cbuf, int off, int limit) { | ||
| // NB we assume here that the remainder of the content pos..limit has already been shifted to cbuf[0]; | ||
| // So codepoints[0..off], columns[0..off] and lines[0..off] have been filled already. | ||
| for (int i = off; i < limit; i++) { | ||
| charPosToCodepoints[i] = i - surrogatePairs; | ||
| charPosToCodepointColumns[i] = codepointColumn - surrogatePairsThisLine; | ||
| charPosToLines[i] = codepointLine; | ||
|
|
||
| if (Character.isHighSurrogate(cbuf[i])) { | ||
| // For every high surrogate we assume a low surrogate will follow, | ||
| // and we count only one of them for the character offset by increasing `shift` | ||
|
Comment on lines
+1231
to
+1232
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this would mean that a broken pair causes offsets? is that fine? |
||
| surrogatePairs++; | ||
| surrogatePairsThisLine++; | ||
| // Do not assume the low surrogate is in the current buffer yet (boundary condition) | ||
| } | ||
| else if (cbuf[i] == '\n') { | ||
| codepointLine++; | ||
| codepointColumn = 0; | ||
| surrogatePairsThisLine = 0; | ||
| } | ||
| else { | ||
| codepointColumn++; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| private void shiftRemaindersLeft(int off) { | ||
| if (off > 0) { | ||
| System.arraycopy(charPosToCodepoints, charPosToCodepoints.length - off, charPosToCodepoints, 0, off); | ||
| System.arraycopy(charPosToCodepointColumns, charPosToCodepointColumns.length - off, charPosToCodepointColumns, 0, off); | ||
| System.arraycopy(charPosToLines, charPosToLines.length - off, charPosToLines, 0, off); | ||
| } | ||
| } | ||
|
|
||
| /* | ||
| * We keep our own buffers of int offsets instead of a reference or a copy of the cbuf: | ||
| * * a quick and dirty reference to cbuf won't do because the {@see GsonReader} client uses System.arrayCopy to | ||
| * overwrite the buffer before we can get to it. | ||
| * * a copy/clone of the buffer could work to have access to the previous version. However, | ||
| * we would still need to loop over it and compute the offsets. So that adds copying the entire buffer | ||
| * at every `read` to the load. | ||
| * * Also the GsonReader does not guarantee that `pos` grows (strictly) monotically, which means we | ||
| * sometimes might have to look back and only having the last offset at the parsing frontier is not sufficient. | ||
| * * the current solution requires more memory, but it is faster while streaming. | ||
| * * the line buffer is strictly not required, but beats the reflective access we need to | ||
| * _two_ private variables in GsonReader. | ||
| */ | ||
| private void initializeBuffers(char[] cbuf) { | ||
| if (charPosToCodepoints == null) { | ||
| assert charPosToCodepointColumns == null; | ||
| assert charPosToLines == null; | ||
|
|
||
| charPosToCodepoints = new int[cbuf.length]; | ||
| charPosToCodepointColumns = new int[cbuf.length]; | ||
| charPosToLines = new int[cbuf.length]; | ||
| } | ||
|
|
||
| // nothing else changed in the mean time, especially not the length of cbuf. | ||
| assert charPosToCodepoints.length == cbuf.length; | ||
| assert charPosToCodepointColumns.length == cbuf.length; | ||
| assert charPosToLines.length == cbuf.length; | ||
| } | ||
|
|
||
| /** | ||
| * @return the codepoint offset (from the start of the streaming content) | ||
| * for the character at char position `pos` in the last buffered content. | ||
| */ | ||
| public int getOffsetAtBufferPos(int pos) { | ||
| return (pos >= prevBufferLimit) ? (codepointOffset + charPosToCodepoints[pos - 1] + 1) : (codepointOffset + charPosToCodepoints[pos]); | ||
| } | ||
|
|
||
| /** | ||
| * @return the codepoint column (from the start of the current line) | ||
| * for the character at char position `pos` in the last buffered content. | ||
| */ | ||
| public int getColumnAtBufferPos(int pos) { | ||
| return (pos >= prevBufferLimit) ? codepointColumn : charPosToCodepointColumns[pos]; | ||
| } | ||
|
|
||
| /** | ||
| * @return the codepoint line (from the start of the entire content) | ||
| * for the character at char position `pos` in the last buffered content. | ||
| */ | ||
| public int getLineAtBufferPos(int pos) { | ||
| return (pos >= prevBufferLimit) ? codepointLine : charPosToLines[pos]; | ||
| } | ||
| } | ||
| } | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
a lot of this code reminds me of the
ColumnMapsandLineColumnOffsetMap(and its implementationArrayLineOffsetMap). They've been migrated to rascal a while back, so maybe we can reuse that code here?Perhaps the
LineColumnOffsetMapneeds to learn a few new features, but it has been used for quite a few years in rascal-lsp for a very similar job.