Skip to content

Commit b86e4c4

Browse files
committed
Fix parser to follow POSIX shell escaping rules
1 parent f4d53e7 commit b86e4c4

File tree

2 files changed

+267
-9
lines changed

2 files changed

+267
-9
lines changed

src/args-tokenizer.test.ts

Lines changed: 248 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,12 @@ test("escape spaces with backslashes", () => {
5353
expect(tokenizeArgs(`command space\\ `)).toEqual(["command", "space "]);
5454
});
5555

56-
test("ignore escaped newlines outside of quotes", () => {
56+
test("remove escaped newlines outside of single quotes", () => {
5757
expect(tokenizeArgs(`command \\\nargument`)).toEqual(["command", `argument`]);
58-
expect(tokenizeArgs(`command "\\\nargument"`)).toEqual([
58+
expect(tokenizeArgs(`command "\\\nargument"`)).toEqual(["command", `argument`,]);
59+
expect(tokenizeArgs(`command '\\\nargument'`)).toEqual([
5960
"command",
60-
`\nargument`,
61+
`\\\nargument`,
6162
]);
6263
});
6364

@@ -116,3 +117,247 @@ test("empty command", () => {
116117
expect(tokenizeArgs(``)).toEqual([]);
117118
expect(tokenizeArgs(` `)).toEqual([]);
118119
});
120+
121+
// --------------------------------------------------
122+
// Characters and character codes
123+
// --------------------------------------------------
124+
125+
// special characters
126+
const CHR_BS = "\\";
127+
// special character codes
128+
const ASC_NL = "\n".charCodeAt(0);
129+
const ASC_DQ = '"'.charCodeAt(0);
130+
const ASC_SQ = "'".charCodeAt(0);
131+
const ASC_DOLLAR = "$".charCodeAt(0);
132+
const ASC_AT = "@".charCodeAt(0);
133+
const ASC_BS = CHR_BS.charCodeAt(0);
134+
const ASC_BQ = "`".charCodeAt(0);
135+
// characters that vanish when escaped with a backslash
136+
// <backslash><newline> is a line continuation that should be removed
137+
const skip_escaped_unquoted = [ASC_NL];
138+
const skip_escaped_double = [ASC_NL];
139+
const skip_escaped_single = [ASC_SQ];
140+
// characters that are unescaped in a double quoting context
141+
const escaped_double = [ASC_DQ, ASC_DOLLAR, ASC_BQ, ASC_BS];
142+
143+
// --------------------------------------------------
144+
// Expected unescaped result crafted according to POSIX standard
145+
// --------------------------------------------------
146+
147+
// characters escaped with a backslash in 2 parts
148+
let chars_escaped_1: string[] = [];
149+
let chars_escaped_2: string[] = [];
150+
151+
// expected results for unescaped characters depending on context
152+
let chars_unescaped_unquoted: string[] = [];
153+
let chars_unescaped_double: string[] = [];
154+
let chars_unescaped_single: string[] = [];
155+
156+
let arg_string_unquoted: string;
157+
let arg_string_double: string;
158+
let arg_string_single: string;
159+
160+
let arg_tokens_unquoted: string[] = [];
161+
let arg_tokens_double: string[] = [];
162+
let arg_tokens_single: string[] = [];
163+
164+
/**
 * Records one backslash-escaped character together with the text that is
 * expected to remain after unescaping it in each quoting context.
 *
 * The escaped pair (`\` + character) is appended to `escaped_chars`; the
 * expected results are appended to the module-level arrays
 * `chars_unescaped_unquoted`, `chars_unescaped_double` and
 * `chars_unescaped_single`.
 *
 * @param ascii_code character code of the character to escape
 * @param escaped_chars array that collects the escaped input pairs
 */
function add_char_to_unescaped_arrays(ascii_code: number, escaped_chars: string[]) {
  const character = String.fromCharCode(ascii_code);
  const escapedPair = CHR_BS + character;
  escaped_chars.push(escapedPair);

  // Outside of quotes the backslash is always removed; characters listed in
  // skip_escaped_unquoted contribute nothing at all.
  const vanishesUnquoted = skip_escaped_unquoted.indexOf(ascii_code) >= 0;
  if (!vanishesUnquoted) {
    chars_unescaped_unquoted.push(character);
  }

  // Inside double quotes the backslash is removed only for the characters
  // listed in escaped_double; every other pair is kept verbatim. Characters
  // listed in skip_escaped_double contribute nothing at all.
  const vanishesInDouble = skip_escaped_double.indexOf(ascii_code) >= 0;
  if (!vanishesInDouble) {
    const backslashIsDropped = escaped_double.indexOf(ascii_code) >= 0;
    chars_unescaped_double.push(backslashIsDropped ? character : escapedPair);
  }

  // Inside single quotes nothing is unescaped. For a character listed in
  // skip_escaped_single (the single quote, which terminates single quoting)
  // only the backslash remains.
  const endsSingleQuoting = skip_escaped_single.indexOf(ascii_code) >= 0;
  chars_unescaped_single.push(endsSingleQuoting ? CHR_BS : escapedPair);
}
188+
189+
/**
 * Optional replacements for the expected token lists that
 * `chars_escaped_test_generate_strings` would otherwise derive itself.
 */
type Overrides = {
  /** expected tokens for the unquoted argument string */
  tokens_unquoted?: string[];
  /** expected tokens for the double-quoted argument string */
  tokens_double?: string[];
  /** expected tokens for the single-quoted argument string */
  tokens_single?: string[];
};

/**
 * Rebuilds the module-level test fixtures for the character codes
 * `start` to `end` (both inclusive).
 *
 * Every character in the range is escaped with a backslash. The escaped
 * pairs are joined into three argument strings (unquoted, double-quoted,
 * single-quoted) and the matching expected token lists are stored in
 * `arg_tokens_unquoted`, `arg_tokens_double` and `arg_tokens_single`.
 *
 * @param start first character code of the range
 * @param end last character code of the range
 * @param overrides expected token lists that replace the generated ones
 */
function chars_escaped_test_generate_strings(start: number, end: number, overrides: Overrides = {}) {
  chars_escaped_1 = [];
  chars_escaped_2 = [];
  chars_unescaped_unquoted = [];
  chars_unescaped_double = [];
  chars_unescaped_single = [];

  // The first part holds the characters up to and including the single
  // quote, clamped to `end` so that a range ending earlier is honoured.
  const first_part_end = Math.min(end, ASC_SQ);
  for (let ascii_code = start; ascii_code <= first_part_end; ascii_code++) {
    add_char_to_unescaped_arrays(ascii_code, chars_escaped_1);
  }
  // The second part holds the characters after the single quote, clamped to
  // `start` so that a range beginning later is honoured.
  const second_part_start = Math.max(start, ASC_SQ + 1);
  for (let ascii_code = second_part_start; ascii_code <= end; ascii_code++) {
    add_char_to_unescaped_arrays(ascii_code, chars_escaped_2);
  }

  const escaped_part_1 = chars_escaped_1.join("");
  const escaped_part_2 = chars_escaped_2.join("");

  arg_string_unquoted = escaped_part_1 + escaped_part_2;
  arg_string_double = '"' + escaped_part_1 + escaped_part_2 + '"';
  // Since a single quote cannot be a member of a single quoted string, the
  // escaped single quote at the end of the first part terminates single
  // quoting. To avoid a syntax error, a single quote is then prepended to
  // the second part. When the range does not contain the single quote,
  // quoting is never terminated early and no extra quote must be inserted.
  const contains_single_quote = start <= ASC_SQ && ASC_SQ <= end;
  const reopen_single_quote = contains_single_quote ? "'" : "";
  arg_string_single = "'" + escaped_part_1 + reopen_single_quote + escaped_part_2 + "'";

  arg_tokens_unquoted = overrides.tokens_unquoted ?? [chars_unescaped_unquoted.join("")];
  arg_tokens_double = overrides.tokens_double ?? [chars_unescaped_double.join("")];
  arg_tokens_single = overrides.tokens_single ?? [chars_unescaped_single.join("")];
}
232+
233+
// --------------------------------------------------
234+
// Expected unescaped result generated by /bin/sh
235+
// --------------------------------------------------
236+
237+
let shell_arg_token_unquoted = atob(`
238+
AQIDBAUGBwgJCwwNDg8QERITFBUWFxgZGhscHR4fICEiIyQlJicoKSorLC0uLzAxMjM0NTY3ODk6
239+
Ozw9Pj9AQUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVpbXF1eX2BhYmNkZWZnaGlqa2xtbm9wcXJz
240+
dHV2d3h5ent8fX5/
241+
`);
242+
243+
let shell_arg_token_double = atob(`
244+
XAFcAlwDXARcBVwGXAdcCFwJXAtcDFwNXA5cD1wQXBFcElwTXBRcFVwWXBdcGFwZXBpcG1wcXB1c
245+
HlwfXCBcISJcIyRcJVwmXCdcKFwpXCpcK1wsXC1cLlwvXDBcMVwyXDNcNFw1XDZcN1w4XDlcOlw7
246+
XDxcPVw+XD9cQFxBXEJcQ1xEXEVcRlxHXEhcSVxKXEtcTFxNXE5cT1xQXFFcUlxTXFRcVVxWXFdc
247+
WFxZXFpcW1xcXVxeXF9gXGFcYlxjXGRcZVxmXGdcaFxpXGpca1xsXG1cblxvXHBccVxyXHNcdFx1
248+
XHZcd1x4XHlcelx7XHxcfVx+XH8=
249+
`);
250+
251+
let shell_arg_token_single = atob(`
252+
XAFcAlwDXARcBVwGXAdcCFwJXApcC1wMXA1cDlwPXBBcEVwSXBNcFFwVXBZcF1wYXBlcGlwbXBxc
253+
HVweXB9cIFwhXCJcI1wkXCVcJlxcKFwpXCpcK1wsXC1cLlwvXDBcMVwyXDNcNFw1XDZcN1w4XDlc
254+
Olw7XDxcPVw+XD9cQFxBXEJcQ1xEXEVcRlxHXEhcSVxKXEtcTFxNXE5cT1xQXFFcUlxTXFRcVVxW
255+
XFdcWFxZXFpcW1xcXF1cXlxfXGBcYVxiXGNcZFxlXGZcZ1xoXGlcalxrXGxcbVxuXG9ccFxxXHJc
256+
c1x0XHVcdlx3XHhceVx6XHtcfFx9XH5cfw==
257+
`);
258+
259+
// function fold_string (str: string, width?: number) {
260+
// let lines = [];
261+
// if (typeof(width) === 'undefined') {
262+
// width = 76;
263+
// }
264+
// while (str) {
265+
// lines.push(str.substring(0, width));
266+
// str = str.substring(width);
267+
// }
268+
// return lines.join("\n");
269+
// }
270+
//
271+
// async function generate_shell_arg_tokens () {
272+
// // result.stdout - the stdout as a string
273+
// // result.stderr - the stderr as a string
274+
// // result.exitCode - the process exit code as a number
275+
// let result = await x('/bin/sh', ['-c', `pecho () { printf "%s" "\${*}"; }; pecho ` + arg_string_unquoted]);
276+
// console.log('let shell_arg_token_unquoted = atob(`' + "\n" + fold_string(btoa(result.stdout)) + '`)');
277+
// result = await x('/bin/sh', ['-c', `pecho () { printf "%s" "\${*}"; }; pecho ` + arg_string_double]);
278+
// console.log('let shell_arg_token_double = atob(`' + "\n" + fold_string(btoa(result.stdout)) + '`)');
279+
// result = await x('/bin/sh', ['-c', `pecho () { printf "%s" "\${*}"; }; pecho ` + arg_string_single]);
280+
// console.log('let shell_arg_token_single = atob(`' + "\n" + fold_string(btoa(result.stdout)) + '`)');
281+
// }
282+
283+
// import { x } from 'tinyexec';
284+
// // tinyexec does not handle NUL in argument strings, ASCII codes > 127 are messed up by UTF-8 output
285+
// chars_escaped_test_generate_strings(1, 127);
286+
// await generate_shell_arg_tokens();
287+
288+
/**
 * Renders an array of strings as a line-oriented listing, one character
 * (or backslash-escaped character) per line followed by its character code.
 *
 * Each string starts with a separator line. A backslash is held back and
 * printed as a prefix of the following character; a trailing backslash is
 * printed on a line of its own. Codes below 32 are shown in caret notation
 * and codes from 127 upwards as `\x` followed by the upper-case hex code.
 *
 * @param char_string_array strings to render
 * @returns the listing, with lines joined by newlines
 */
function pretty_print_character_string_array(char_string_array: string[]) {
  const lines: string[] = [];

  for (const text of char_string_array) {
    lines.push("--------------------------------------------------");

    // Holds a backslash that still waits for the character it escapes.
    let pendingBackslash = "";

    for (let position = 0; position < text.length; position += 1) {
      const rawChar = text.charAt(position);
      const code = rawChar.charCodeAt(0);

      if (pendingBackslash === "" && rawChar === CHR_BS) {
        pendingBackslash = rawChar;
        continue;
      }

      let shown = rawChar;
      if (code < 32) {
        // caret notation for control characters
        shown = "^" + String.fromCharCode(ASC_AT + code);
      } else if (code >= 127) {
        shown = "\\x" + code.toString(16).toUpperCase();
      }

      lines.push(pendingBackslash + shown + " " + code.toString());
      pendingBackslash = "";
    }

    if (pendingBackslash !== "") {
      lines.push(pendingBackslash);
    }
  }

  return lines.join("\n");
}
316+
317+
/**
 * Registers three tests that tokenize every backslash-escaped character in
 * the code range `start` to `end`: once unquoted, once inside double quotes
 * and once inside single quotes.
 *
 * @param start first character code of the range
 * @param end last character code of the range
 * @param suffix text appended to the test names; may be empty
 * @param overrides expected token lists that replace the generated ones
 */
function chars_escaped_test(start: number, end: number, suffix: string, overrides?: Overrides) {
  chars_escaped_test_generate_strings(start, end, overrides);

  // Snapshot the generated fixtures now. The test callbacks run only after
  // the whole file has been evaluated, so reading the module-level variables
  // inside them would pick up the data of the LAST chars_escaped_test() call
  // for every registered suite.
  const cases = [
    { context: "outside quoting context", input: arg_string_unquoted, expected: arg_tokens_unquoted },
    { context: "in double quoting context", input: arg_string_double, expected: arg_tokens_double },
    { context: "in single quoting context", input: arg_string_single, expected: arg_tokens_single },
  ];

  const label = suffix ? " " + suffix : "";

  for (const { context, input, expected } of cases) {
    test("all escaped characters " + context + label, () => {
      // Compare the pretty-printed listings so that a failure shows the
      // offending character and its code instead of raw control characters.
      expect(
        pretty_print_character_string_array(tokenizeArgs(input))
      ).toEqual(
        pretty_print_character_string_array(expected)
      );
    });
  }
}
354+
355+
// Expected unescaped result generated according to POSIX
356+
chars_escaped_test(0, 255, "(POSIX)");
357+
358+
// Expected unescaped result generated by /bin/sh
359+
chars_escaped_test(1, 127, "(/bin/sh)", {
360+
tokens_unquoted: [shell_arg_token_unquoted],
361+
tokens_double: [shell_arg_token_double],
362+
tokens_single: [shell_arg_token_single]
363+
});

src/args-tokenizer.ts

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
const spaceRegex = /\s/;
2+
const dqSpecialRegex = /[$`"\\]/;
23

34
type Options = {
45
loose?: boolean;
@@ -19,18 +20,30 @@ export const tokenizeArgs = (
1920
const char = argsString[index];
2021

2122
if (escaped) {
23+
// Backslashes are not recognized in single quotes, so `escaped`
24+
// is never true in this case.
2225
escaped = false;
23-
// escape newline inside of quotes
24-
// ignore newline elsewhere
25-
if (openningQuote || char !== "\n") {
26+
// In other regions, a newline and the preceding backslash
27+
// are always dropped.
28+
if (char !== "\n") {
29+
// In double quotes, special POSIX rules apply (see above).
30+
// For the characters <dollar-sign>, <backquote>,
31+
// <double-quote> and <backslash> the escaping backslash is
32+
// dropped. For all other characters the backslash is kept.
33+
if (openningQuote && ! dqSpecialRegex.test(char)) {
34+
currentToken += "\\";
35+
}
36+
// All other characters are kept as is.
2637
currentToken += char;
2738
}
2839
continue;
2940
}
3041

31-
if (char === "\\") {
32-
escaped = true;
33-
continue;
42+
if (openningQuote !== "'") {
43+
if (char === "\\") {
44+
escaped = true;
45+
continue;
46+
}
3447
}
3548

3649
if (openningQuote === undefined && spaceRegex.test(char)) {

0 commit comments

Comments
 (0)