Skip to content

Commit 0875280

Browse files
committed
add support for ANSI files containing non-ASCII
Notepad++ internally represents files with most encodings as UTF8, but ANSI files are internally represented as ANSI. Previously JsonTools assumed that any method that got the bytes of the document from Notepad++ could treat those bytes as UTF8, and that assumption is wrong for ANSI. Now JsonTools uses ScintillaGateway.GetCodePage() to determine the encoding of the document and requires an encoding as a parameter for many document-text-extracting functions. However, the JsonParser class is still hardcoded to count the bytes in the UTF8 representation of each character in a string, and it would be both annoying and potentially performance-reducing to stop making this assumption. In a future commit I may address this issue, but for now some features are broken for non-ASCII ANSI files, including the following:
- automagic navigation to the position in the document of a tree node
- selecting a tree node or its children
- getting the path to a position in the document
- selecting all valid JSON in a selection
1 parent 3c7eb99 commit 0875280

File tree

11 files changed

+165
-92
lines changed

11 files changed

+165
-92
lines changed

CHANGELOG.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
5454

5555
### Fixed
5656

57-
1. Bug where JsonTools could not set the text of a file to end with the SOH character (ASCII code `\x01`)
58-
2. Bug where some JsonGrepper tests would always fail because they queried an external API that had previously returned JSON but now returns HTML.
57+
1. __Correctly parse all ANSI files.__ Previously JsonTools assumed that Notepad++ used UTF8 to internally represent every document, and while that appears to have worked fine for most encodings, it would specifically fail on ANSI-encoded documents containing non-ASCII characters.
58+
- __The following features are currently *still broken* on ANSI files with non-ASCII characters__ (because they rely on knowing the position of a JSON node):
59+
* [path to current position](/docs/README.md#path-to-current-position)
60+
* [clicking on a tree node to navigate to its position in the document](/docs/README.md#the-basics)
61+
* [selecting a tree node or its children](/docs/README.md#select-tree-nodes-json-or-its-children-added-in-v57)
62+
* [selecting all valid JSON in selection](/docs/README.md#selecting-all-valid-json)
63+
2. Minor bug where JsonTools could not set the text of a file to end with the SOH character (ASCII code `\x01`)
64+
3. Minor bug where some JsonGrepper tests would always fail because they queried an external API that had previously returned JSON but now returns HTML.
5965

6066
## [8.4.0] - 2025-05-04
6167

JsonToolsNppPlugin/Forms/TreeViewer.cs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,6 +1248,7 @@ public static bool LengthOfStringInRegexMode(JNode[] nodes, char delim, char quo
12481248
int startIndex = 0;
12491249
string documentText = null;
12501250
utf8Lengths = new int[nodes.Length];
1251+
Encoding encoding = Npp.editor.GetCodePage();
12511252
for (int ii = 0; ii < nodes.Length; ii++)
12521253
{
12531254
JNode jnode = nodes[ii];
@@ -1269,7 +1270,7 @@ public static bool LengthOfStringInRegexMode(JNode[] nodes, char delim, char quo
12691270
if (documentText is null)
12701271
documentText = selectionEnd < 0
12711272
? Npp.editor.GetText()
1272-
: Npp.GetSlice(selectionStart, selectionEnd);
1273+
: Npp.GetSlice(selectionStart, selectionEnd, encoding);
12731274
int utf8Extra = 0;
12741275
for (; startIndex < documentText.Length && startIndex + utf8Extra < nodepos; startIndex++)
12751276
utf8Extra += JsonParser.ExtraUTF8Bytes(documentText[startIndex]);
@@ -1298,7 +1299,7 @@ public static bool LengthOfStringInRegexMode(JNode[] nodes, char delim, char quo
12981299
// even if a string doesn't *need* to be quoted, it could be quoted anyway, in which case we need to select the quotes
12991300
int nodeStart = selectionStart + jnode.position;
13001301
int quoteLen = 1 + JsonParser.ExtraUTF8Bytes(quote);
1301-
string firstChar = Npp.GetSlice(nodeStart, nodeStart + quoteLen);
1302+
string firstChar = Npp.GetSlice(nodeStart, nodeStart + quoteLen, encoding);
13021303
if (firstChar[0] == quote)
13031304
utf8Len += 2 * quoteLen;
13041305
}

JsonToolsNppPlugin/Main.cs

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -652,9 +652,10 @@ public static (ParserState parserState, JNode node, bool usesSelections, Documen
652652
if (wasAutotriggered && combinedLength > sizeThreshold)
653653
return (ParserState.OK, null, false, DocumentType.NONE);
654654
stopUsingSelections = false;
655+
Encoding encoding = Npp.editor.GetCodePage();
655656
foreach ((int start, int end) in selRanges)
656657
{
657-
string selRange = Npp.GetSlice(start, end);
658+
string selRange = Npp.GetSlice(start, end, encoding);
658659
JNode subJson;
659660
if (documentType == DocumentType.REGEX)
660661
{
@@ -959,6 +960,7 @@ public static void CompressJson()
959960
Npp.editor.BeginUndoAction();
960961
if (usesSelections)
961962
{
963+
Encoding encoding = Npp.editor.GetCodePage();
962964
var obj = (JObject)json;
963965
int delta = 0;
964966
pluginIsEditing = true;
@@ -976,7 +978,7 @@ public static void CompressJson()
976978
int newStart = start + delta;
977979
int newCount = Encoding.UTF8.GetByteCount(printed);
978980
Npp.editor.DeleteRange(newStart, oldLen);
979-
Npp.editor.InsertText(newStart, printed);
981+
Npp.editor.InsertText(newStart, printed, encoding);
980982
currentIndicator = ApplyAndSwapIndicator(currentIndicator, newStart, newCount);
981983
int newEnd = newStart + newCount;
982984
delta = newEnd - end;
@@ -1071,9 +1073,10 @@ public static void DumpSelectedTextAsJsonString()
10711073
else
10721074
{
10731075
var sb = new StringBuilder();
1076+
Encoding encoding = Npp.editor.GetCodePage();
10741077
foreach ((int start, int end) in selections)
10751078
{
1076-
string sel = Npp.GetSlice(start, end);
1079+
string sel = Npp.GetSlice(start, end, encoding);
10771080
JNode selNode = new JNode(sel);
10781081
sb.Append(selNode.ToString());
10791082
sb.Append("\r\n");
@@ -1096,10 +1099,11 @@ public static void DumpSelectedJsonStringsAsText()
10961099
var selections = SelectionManager.GetSelectedRanges();
10971100
selections.Sort(SelectionManager.StartEndCompareByStart);
10981101
var sb = new StringBuilder();
1102+
Encoding encoding = Npp.editor.GetCodePage();
10991103
JsonParser jsonParser = JsonParserFromSettings();
11001104
if (SelectionManager.NoTextSelected(selections))
11011105
{
1102-
string selStrValue = TryGetSelectedJsonStringValue(jsonParser);
1106+
string selStrValue = TryGetSelectedJsonStringValue(jsonParser, encoding);
11031107
if (selStrValue == null)
11041108
return;
11051109
sb.Append(selStrValue);
@@ -1108,7 +1112,7 @@ public static void DumpSelectedJsonStringsAsText()
11081112
{
11091113
foreach ((int start, int end) in selections)
11101114
{
1111-
string selStrValue = TryGetSelectedJsonStringValue(jsonParser, start, end);
1115+
string selStrValue = TryGetSelectedJsonStringValue(jsonParser, encoding, start, end);
11121116
if (selStrValue == null)
11131117
return;
11141118
sb.Append(selStrValue);
@@ -1119,11 +1123,11 @@ public static void DumpSelectedJsonStringsAsText()
11191123
Npp.editor.SetText(sb.ToString());
11201124
}
11211125

1122-
public static string TryGetSelectedJsonStringValue(JsonParser jsonParser, int start = -1, int end = -1)
1126+
public static string TryGetSelectedJsonStringValue(JsonParser jsonParser, Encoding encoding, int start = -1, int end = -1)
11231127
{
11241128
string text = start < 0 || end < 0
11251129
? Npp.editor.GetText()
1126-
: Npp.GetSlice(start, end);
1130+
: Npp.GetSlice(start, end, encoding);
11271131
try
11281132
{
11291133
JNode textNode = jsonParser.Parse(text);
@@ -1454,7 +1458,8 @@ private static string PathToPosition(KeyStyle style, char separator, int pos = -
14541458
(int start, int end) = SelectionManager.GetEnclosingRememberedSelection(pos, selectionRememberingIndicator1, selectionRememberingIndicator2);
14551459
if (start < 0)
14561460
return "";
1457-
string selText = Npp.GetSlice(start, end);
1461+
var encoding = Npp.editor.GetCodePage();
1462+
string selText = Npp.GetSlice(start, end, encoding);
14581463
var parser = JsonParserFromSettings();
14591464
JNode selJson = parser.Parse(selText);
14601465
if (parser.fatal)
@@ -1496,9 +1501,10 @@ public static void SelectEveryValidJson()
14961501
var startEnds = new List<string>();
14971502
int lastEnd = 0;
14981503
Predicate<char> isTryParseStart = c => settings.try_parse_start_chars.IndexOf(c) >= 0;
1504+
var encoding = Npp.editor.GetCodePage();
14991505
foreach ((int start, int end) in selections)
15001506
{
1501-
string text = Npp.GetSlice(start, end);
1507+
string text = Npp.GetSlice(start, end, encoding);
15021508
int ii = 0;
15031509
int len = text.Length;
15041510
int utf8Pos = start;
@@ -2016,7 +2022,7 @@ public static void OpenSortForm()
20162022
/// <returns></returns>
20172023
public static int EndOfJNodeAtPos(int startUtf8Pos, int end)
20182024
{
2019-
string slice = Npp.GetSlice(startUtf8Pos, end);
2025+
string slice = Npp.GetSlice(startUtf8Pos, end, Npp.editor.GetCodePage());
20202026
var parser = new JsonParser(LoggerLevel.JSON5);
20212027
try
20222028
{
@@ -2055,7 +2061,8 @@ public static void SelectAllChildren(IEnumerable<int> positions, bool isJsonLine
20552061
"Can't select all children", MessageBoxButtons.OK, MessageBoxIcon.Exclamation);
20562062
return;
20572063
}
2058-
string slice = Npp.GetSlice(minPos, len);
2064+
Encoding encoding = Npp.editor.GetCodePage();
2065+
string slice = Npp.GetSlice(minPos, len, encoding);
20592066
var parser = new JsonParser(LoggerLevel.JSON5);
20602067
int utf8ExtraBytes = 0;
20612068
int positionsIdx = 0;

JsonToolsNppPlugin/PluginInfrastructure/IScintillaGateway.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// NPP plugin platform for .Net v0.94.00 by Kasper B. Graversen etc.
22
using System;
3+
using System.Text;
34
using static Kbg.NppPluginNET.PluginInfrastructure.Win32;
45

56
namespace Kbg.NppPluginNET.PluginInfrastructure
@@ -15,7 +16,7 @@ public interface IScintillaGateway
1516
{
1617
int GetSelectionLength();
1718
void AppendTextAndMoveCursor(string text);
18-
void InsertTextAndMoveCursor(string text);
19+
void InsertTextAndMoveCursor(string text, Encoding encoding);
1920
void SelectCurrentLine();
2021
void ClearSelectionToCursor();
2122

@@ -40,7 +41,7 @@ public interface IScintillaGateway
4041
unsafe void AddStyledText(int length, Cells c);
4142

4243
/// <summary>Insert string at a position. (Scintilla feature 2003)</summary>
43-
unsafe void InsertText(int pos, string text);
44+
unsafe void InsertText(int pos, string text, Encoding encoding);
4445

4546
/// <summary>Change the text that is being inserted in response to SC_MOD_INSERTCHECK (Scintilla feature 2672)</summary>
4647
unsafe void ChangeInsertion(int length, string text);
@@ -761,7 +762,7 @@ public interface IScintillaGateway
761762
int GetLineEndPosition(int line);
762763

763764
/// <summary>Get the code page used to interpret the bytes of the document as characters. (Scintilla feature 2137)</summary>
764-
int GetCodePage();
765+
Encoding GetCodePage();
765766

766767
/// <summary>Get the foreground colour of the caret. (Scintilla feature 2138)</summary>
767768
Colour GetCaretFore();

JsonToolsNppPlugin/PluginInfrastructure/ScintillaGateway.cs

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ public class ScintillaGateway : IScintillaGateway
2525
/// * if length is 0, return the length and have no side effects<br></br>
2626
/// * if length is greater than 0, return the length and fill the buffer with length characters<br></br>
2727
/// This method gets the length if the length is 0, uses the second mode to fill a buffer,<br></br>
28-
/// and returns a string of the UTF8-decoded buffer with all trailing '\x00' chars stripped off.
28+
/// figures out the document's encoding,<br></br>
29+
/// and returns a string of the decoded buffer with all trailing '\x00' chars stripped off.
2930
/// </summary>
3031
/// <param name="msg">message to send</param>
3132
/// <param name="length">number of characters to retrieve (if 0, find out by sending message)</param>
@@ -35,6 +36,7 @@ private unsafe string GetNullStrippedStringFromMessageThatReturnsLength(SciMsg m
3536
if (length < 1)
3637
length = Win32.SendMessage(scintilla, msg, (IntPtr)Unused, (IntPtr)Unused).ToInt32();
3738
byte[] textBuffer = new byte[length];
39+
var encoding = GetCodePage();
3840
fixed (byte* textPtr = textBuffer)
3941
{
4042
Win32.SendMessage(scintilla, msg, (IntPtr)length, (IntPtr)textPtr);
@@ -43,21 +45,21 @@ private unsafe string GetNullStrippedStringFromMessageThatReturnsLength(SciMsg m
4345
// other than NULL can have any 0-valued bytes in UTF-8.
4446
// See https://en.wikipedia.org/wiki/UTF-8#Encoding
4547
for (; lastNullCharPos >= 0 && textBuffer[lastNullCharPos] == '\x00'; lastNullCharPos--) { }
46-
return Encoding.UTF8.GetString(textBuffer, 0, lastNullCharPos + 1);
48+
return encoding.GetString(textBuffer, 0, lastNullCharPos + 1);
4749
}
4850
}
4951

5052
/// <summary>
51-
/// the same byte[] buffer that would be returned by Encoding.UTF8.GetBytes(text),
53+
/// the same byte[] buffer that would be returned by encoding.GetBytes(text),<br></br>
5254
/// but with +1 length and a NULL byte at the end
5355
/// </summary>
5456
/// <param name="text"></param>
5557
/// <returns></returns>
56-
private byte[] GetNullTerminatedUTF8Bytes(string text)
58+
private byte[] GetNullTerminatedEncodedBytes(string text, Encoding encoding)
5759
{
58-
int length = Encoding.UTF8.GetByteCount(text);
60+
int length = encoding.GetByteCount(text);
5961
byte[] bytes = new byte[length + 1];
60-
int lengthWritten = Encoding.UTF8.GetBytes(text, 0, text.Length, bytes, 0);
62+
int lengthWritten = encoding.GetBytes(text, 0, text.Length, bytes, 0);
6163
//if (lengthWritten != length)
6264
// throw new Exception("not sure what we would do here");
6365
return bytes;
@@ -80,10 +82,10 @@ public void AppendTextAndMoveCursor(string text)
8082
GotoPos(GetCurrentPos() + text.Length);
8183
}
8284

83-
public void InsertTextAndMoveCursor(string text)
85+
public void InsertTextAndMoveCursor(string text, Encoding encoding)
8486
{
8587
var currentPos = GetCurrentPos();
86-
InsertText(currentPos, text);
88+
InsertText(currentPos, text, encoding);
8789
GotoPos(currentPos + text.Length);
8890
}
8991

@@ -145,9 +147,9 @@ public unsafe void AddStyledText(int length, Cells c)
145147
}
146148

147149
/// <summary>Insert string at a position. (Scintilla feature 2003)</summary>
148-
public unsafe void InsertText(int pos, string text)
150+
public unsafe void InsertText(int pos, string text, Encoding encoding)
149151
{
150-
fixed (byte* textPtr = GetNullTerminatedUTF8Bytes(text))
152+
fixed (byte* textPtr = GetNullTerminatedEncodedBytes(text, encoding))
151153
{
152154
Win32.SendMessage(scintilla, SciMsg.SCI_INSERTTEXT, (IntPtr) pos, (IntPtr) textPtr);
153155
}
@@ -1505,9 +1507,17 @@ public int GetLineEndPosition(int line)
15051507
}
15061508

15071509
/// <summary>Get the code page used to interpret the bytes of the document as characters. (Scintilla feature 2137)</summary>
1508-
public int GetCodePage()
1510+
public Encoding GetCodePage()
15091511
{
1510-
return (int)Win32.SendMessage(scintilla, SciMsg.SCI_GETCODEPAGE, (IntPtr) Unused, (IntPtr) Unused);
1512+
var cpNum = (int)Win32.SendMessage(scintilla, SciMsg.SCI_GETCODEPAGE, (IntPtr) Unused, (IntPtr) Unused);
1513+
try
1514+
{
1515+
return Encoding.GetEncoding(cpNum);
1516+
}
1517+
catch
1518+
{
1519+
return Encoding.UTF8;
1520+
}
15111521
}
15121522

15131523
/// <summary>Get the foreground colour of the caret. (Scintilla feature 2138)</summary>
@@ -1797,7 +1807,8 @@ public void Clear()
17971807
/// <summary>Replace the contents of the document with the argument text. (Scintilla feature 2181)</summary>
17981808
public unsafe void SetText(string text)
17991809
{
1800-
fixed (byte* textPtr = GetNullTerminatedUTF8Bytes(text))
1810+
Encoding encoding = GetCodePage();
1811+
fixed (byte* textPtr = GetNullTerminatedEncodedBytes(text, encoding))
18011812
{
18021813
Win32.SendMessage(scintilla, SciMsg.SCI_SETTEXT, (IntPtr) Unused, (IntPtr) textPtr);
18031814
}

JsonToolsNppPlugin/Properties/AssemblyInfo.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,5 @@
2828
// Build Number
2929
// Revision
3030
//
31-
[assembly: AssemblyVersion("8.4.0.1")]
32-
[assembly: AssemblyFileVersion("8.4.0.1")]
31+
[assembly: AssemblyVersion("8.4.0.2")]
32+
[assembly: AssemblyFileVersion("8.4.0.2")]

JsonToolsNppPlugin/Tests/TestRunner.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
using System;
55
using System.Collections.Generic;
66
using System.IO;
7+
using System.Text;
78
using System.Threading.Tasks;
89
using JSON_Tools.Utils;
910
using Kbg.NppPluginNET;
@@ -198,10 +199,10 @@ public static async Task RunAll()
198199
failures.Add(name);
199200
}
200201
}
201-
202+
Encoding encoding = Npp.editor.GetCodePage();
202203
if (skipped.Count > 0)
203-
Npp.editor.InsertText(header.Length + 2, "Tests skipped: " + string.Join(", ", skipped) + "\r\n");
204-
Npp.editor.InsertText(header.Length + 2, "Tests failed: " + string.Join(", ", failures) + "\r\n");
204+
Npp.editor.InsertText(header.Length + 2, "Tests skipped: " + string.Join(", ", skipped) + "\r\n", encoding);
205+
Npp.editor.InsertText(header.Length + 2, "Tests failed: " + string.Join(", ", failures) + "\r\n", encoding);
205206
}
206207
}
207208
}

JsonToolsNppPlugin/Tests/UserInterfaceTests.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
using System;
22
using System.Collections.Generic;
33
using System.Linq;
4+
using System.Text;
45
using System.Windows.Forms;
56
using JSON_Tools.Forms;
67
using JSON_Tools.JSON_Tools;
@@ -65,7 +66,8 @@ public static bool ExecuteFileManipulation(string command, List<string> messages
6566
var start = (int)args[0];
6667
text = (string)args[1];
6768
messages.Add($"insert {JNode.StrToString(text, false)} at {start}");
68-
Npp.editor.InsertText(start, text);
69+
Encoding encoding = Npp.editor.GetCodePage();
70+
Npp.editor.InsertText(start, text, encoding);
6971
break;
7072
case "delete_text":
7173
start = (int)args[0];

JsonToolsNppPlugin/Utils/Npp.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -256,14 +256,14 @@ public static void RemoveTrailingSOH()
256256
/// <param name="start"></param>
257257
/// <param name="end"></param>
258258
/// <returns></returns>
259-
public static string GetSlice(int start, int end)
259+
public static string GetSlice(int start, int end, Encoding encoding)
260260
{
261261
int len = end - start;
262262
IntPtr rangePtr = editor.GetRangePointer(start, len);
263263
string ansi = Marshal.PtrToStringAnsi(rangePtr, len);
264264
// TODO: figure out a way to do this that involves less memcopy for non-ASCII
265-
if (ansi.Any(c => c >= 128))
266-
return Encoding.UTF8.GetString(Encoding.Default.GetBytes(ansi));
265+
if (encoding.CodePage != Encoding.Default.CodePage && ansi.Any(c => c >= 128))
266+
return encoding.GetString(Encoding.Default.GetBytes(ansi));
267267
return ansi;
268268
}
269269

0 commit comments

Comments
 (0)