51 changes: 51 additions & 0 deletions Lib/test/test_dstring.py
@@ -0,0 +1,51 @@
import unittest

_dstring_prefixes = "d db df dt dr drb drf drt".split()
_dstring_prefixes += [p.upper() for p in _dstring_prefixes]

class DStringTestCase(unittest.TestCase):
def assertAllRaise(self, exception_type, regex, error_strings):
for str in error_strings:
with self.subTest(str=str):
with self.assertRaisesRegex(exception_type, regex) as cm:
eval(str)

def test_single_quote(self):
exprs = [
f"{p}'hello, world'" for p in _dstring_prefixes
] + [
f'{p}"hello, world"' for p in _dstring_prefixes
]
self.assertAllRaise(SyntaxError, "d-string must be triple-quoted", exprs)

def test_empty_dstring(self):
exprs = [
f"{p}''''''" for p in _dstring_prefixes
] + [
f'{p}""""""' for p in _dstring_prefixes
]
self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs)

def test_simple_dstring(self):
cases = [
('{prefix}"""\n hello world\n """', "hello world\n"),
('{prefix}"""\n hello world\n """', " hello world\n"),
('{prefix}"""\n hello world\n"""', " hello world\n"),
('{prefix}"""\n hello world\\\n """', " hello world"),
('{prefix}"""\n hello world\\\n """', " hello world\\\n"),
]

for p in _dstring_prefixes:
bstring = 'b' in p.lower()
rstring = 'r' in p.lower()
for source, expected in cases:
source = source.format(prefix=p)
if rstring:
expected = expected.replace('\\', '\\\\').replace('\n', '\\n')
if bstring:
expected = expected.encode()
self.assertEqual(eval(source), expected)


if __name__ == '__main__':
unittest.main()
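Taken together, these cases pin down the intended d-string semantics. A rough pure-Python approximation (hypothetical dstring helper; note that textwrap.dedent ignores whitespace-only lines, so unlike this PR's algorithm it does not anchor the margin at the closing quote's indentation):

import textwrap

def dstring(text):
    # Approximate a d-string: the literal must begin with a newline,
    # which is dropped; the common indentation of the remaining lines
    # is then stripped.
    if not text.startswith('\n'):
        raise SyntaxError("d-string must start with a newline")
    return textwrap.dedent(text[1:])

assert dstring('\n    hello world\n    ') == 'hello world\n'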
4 changes: 2 additions & 2 deletions Lib/test/test_tokenize.py
@@ -3420,7 +3420,7 @@ def determine_valid_prefixes():
# some uppercase-only prefix is added.
for letter in itertools.chain(string.ascii_lowercase, string.ascii_uppercase):
try:
-eval(f'{letter}""')
+eval(f'{letter}"""\n"""') # d-string needs multiline
single_char_valid_prefixes.add(letter.lower())
except SyntaxError:
pass
@@ -3444,7 +3444,7 @@ def determine_valid_prefixes():
# because it's a valid expression: not ""
continue
try:
-eval(f'{p}""')
+eval(f'{p}"""\n"""') # d-string needs multiline

# No syntax error, so p is a valid string
# prefix.
3 changes: 2 additions & 1 deletion Lib/tokenize.py
Original file line number Diff line number Diff line change
@@ -86,7 +86,8 @@ def _all_string_prefixes():
# The valid string prefixes. Only contain the lower case versions,
# and don't contain any permutations (include 'fr', but not
# 'rf'). The various permutations will be generated.
-_valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'br', 'fr', 'tr']
+_valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'd', 'br', 'fr', 'tr',
+                          'bd', 'rd', 'fd', 'td', 'brd', 'frd', 'trd']
# if we add binary f-strings, add: ['fb', 'fbr']
result = {''}
for prefix in _valid_string_prefixes:
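For context, _all_string_prefixes expands each canonical prefix into all orderings and casings, as the comment says. A minimal sketch of that expansion (reconstructed for illustration, not copied from this diff):

import itertools

def all_string_prefixes(valid=('b', 'r', 'u', 'f', 't', 'd', 'br', 'fr', 'tr',
                               'bd', 'rd', 'fd', 'td', 'brd', 'frd', 'trd')):
    # Expand each canonical prefix into every ordering and every
    # per-character casing, so 'bd' also yields 'db', 'dB', 'DB', ...
    result = {''}
    for prefix in valid:
        for perm in itertools.permutations(prefix):
            for cased in itertools.product(*[(c, c.upper()) for c in perm]):
                result.add(''.join(cased))
    return result

assert {'d', 'D', 'rd', 'dR', 'BRD'} <= all_string_prefixes()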
6 changes: 3 additions & 3 deletions Objects/unicodeobject.c
@@ -13480,8 +13480,8 @@ of all lines in the [src, end).
It returns the length of the common leading whitespace and sets `output` to
point to the beginning of the common leading whitespace if length > 0.
*/
-static Py_ssize_t
-search_longest_common_leading_whitespace(
+Py_ssize_t
+_Py_search_longest_common_leading_whitespace(
const char *const src,
const char *const end,
const char **output)
@@ -13576,7 +13576,7 @@ _PyUnicode_Dedent(PyObject *unicode)
// [whitespace_start, whitespace_start + whitespace_len)
// describes the current longest common leading whitespace
const char *whitespace_start = NULL;
-Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
+Py_ssize_t whitespace_len = _Py_search_longest_common_leading_whitespace(
src, end, &whitespace_start);

if (whitespace_len == 0) {
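In Python terms, the renamed routine computes something like the following (an illustrative model, not the C implementation; treating blank lines as non-constraining is an assumption):

def longest_common_leading_whitespace(lines):
    # Longest run of leading spaces/tabs shared by every non-blank line.
    common = None
    for line in lines:
        if not line.strip(' \t'):
            continue  # assume blank lines do not constrain the indent
        ws = line[:len(line) - len(line.lstrip(' \t'))]
        if common is None:
            common = ws
        else:
            n = 0
            while n < min(len(common), len(ws)) and common[n] == ws[n]:
                n += 1
            common = common[:n]
        if not common:
            break
    return common or ''

assert longest_common_leading_whitespace(['    a', '        b', '', '    c']) == '    '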
218 changes: 209 additions & 9 deletions Parser/action_helpers.c
@@ -1292,24 +1292,146 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq

// Fstring stuff

static int
unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start, const char *line_end,
int is_raw, Token* token)
{
if (is_raw || memchr(line_start, '\\', line_end - line_start) == NULL) {
return PyUnicodeWriter_WriteUTF8(w, line_start, line_end - line_start);
}
else {
PyObject *line = _PyPegen_decode_string(p, 1, line_start, line_end - line_start, token);
if (line == NULL || PyUnicodeWriter_WriteStr(w, line) < 0) {
Py_XDECREF(line);
return -1;
}
Py_DECREF(line);
}
return 0;
}
Comment on lines +1295 to +1311
⚠️ Potential issue | 🔴 Critical

Critical: Wrong raw parameter passed to _PyPegen_decode_string.

Line 1303 passes 1 (raw=true) to _PyPegen_decode_string, but this is in the else branch where is_raw is known to be false. This means escape sequences like \n, \t, etc. won't be processed for non-raw d-strings - they'll be written as literal characters.

This is likely the root cause of the line continuation issue flagged in past reviews. When escape processing is skipped, backslash-newline continuation won't work.

Proposed fix
 static int
 unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start, const char *line_end,
                          int is_raw, Token* token)
 {
     if (is_raw || memchr(line_start, '\\', line_end - line_start) == NULL) {
         return PyUnicodeWriter_WriteUTF8(w, line_start, line_end - line_start);
     }
     else {
-        PyObject *line = _PyPegen_decode_string(p, 1, line_start, line_end - line_start, token);
+        PyObject *line = _PyPegen_decode_string(p, 0, line_start, line_end - line_start, token);
         if (line == NULL || PyUnicodeWriter_WriteStr(w, line) < 0) {
             Py_XDECREF(line);
             return -1;
         }
         Py_DECREF(line);
     }
     return 0;
 }
🤖 Prompt for AI Agents
In `Parser/action_helpers.c` around lines 1295-1311: unicodewriter_write_line
calls _PyPegen_decode_string with a hardcoded raw=1, preventing escape
processing for non-raw strings. Update the call to pass 0 (or the is_raw
variable) instead of 1 so that escapes such as backslash-newline are decoded,
keeping the raw parameter consistent with is_raw.
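The user-visible distinction at stake can be demonstrated with today's string literals (plain Python, no d-strings involved):

# Non-raw: the parser decodes '\n' into a real newline character.
s = eval('"a\\nb"')
assert s == 'a\nb' and len(s) == 3

# Raw -- what a hardcoded raw=1 effectively forces: the backslash and
# the 'n' survive as two literal characters.
r = eval('r"a\\nb"')
assert r == 'a\\nb' and len(r) == 4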


static PyObject*
_PyPegen_dedent_string_part(
Parser *p, const char *s, size_t len, const char *indent, Py_ssize_t indent_len,
int is_first, int is_raw, expr_ty constant, Token* token)
{
Py_ssize_t lineno = constant->lineno;
const char *line_start = s;
const char *s_end = s + len;

int _prev_call_invalid = p->call_invalid_rules;
if (!_prev_call_invalid && !is_raw) {
// _PyPegen_decode_string() and decode_bytes_with_escapes() may call
// warn_invalid_escape_sequence(), which may emit a warning or raise
// SyntaxError for invalid escape sequences.
// We need to call it before dedenting, since the SyntaxError needs the
// exact lineno and col_offset of the invalid escape sequence.
PyObject *t = _PyPegen_decode_string(p, 0, s, len, token);
if (t == NULL) {
return NULL;
}
Py_DECREF(t);
p->call_invalid_rules = 1;
}

PyUnicodeWriter *w = PyUnicodeWriter_Create(len);
if (w == NULL) {
return NULL;
}

if (is_first) {
assert (line_start[0] == '\n');
line_start++; // skip the first newline
}
else {
// Example: df"""
// first part {param} second part
// next line
// """"
// We don't need to dedent the first line in the non-first parts.
const char *line_end = memchr(line_start, '\n', s_end - line_start);
if (line_end) {
line_end++; // include the newline
}
else {
line_end = s_end;
}
if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
goto error;
}
line_start = line_end;
}

while (line_start < s + len) {
lineno++;

Py_ssize_t i = 0;
while (line_start + i < s_end && i < indent_len && line_start[i] == indent[i]) {
i++;
}

if (line_start[i] == '\0') { // found an empty line without newline.
break;
}
if (line_start[i] == '\n') { // found an empty line with newline.
if (PyUnicodeWriter_WriteChar(w, '\n') < 0) {
goto error;
}
line_start += i+1;
continue;
}
if (i < indent_len) { // found an invalid indent.
assert(line_start[i] != indent[i]);
RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i+1,
"d-string line missing valid indentation");
goto error;
}

// found an indented line; let's dedent it.
line_start += i;
const char *line_end = memchr(line_start, '\n', s_end - line_start);
if (line_end) {
line_end++; // include the newline
}
else {
line_end = s_end;
}
if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
goto error;
}
line_start = line_end;
}
p->call_invalid_rules = _prev_call_invalid;
return PyUnicodeWriter_Finish(w);

error:
p->call_invalid_rules = _prev_call_invalid;
PyUnicodeWriter_Discard(w);
return NULL;
}
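Stripped of escape decoding, line/column bookkeeping, and error locations, the per-line loop above behaves like this Python model (a hypothetical dedent_part sketch, not the PR's code):

def dedent_part(s, indent, is_first):
    # Drop the common indent from each line, pass whitespace-only lines
    # through as bare newlines, and reject lines with foreign indentation.
    out = []
    if is_first:
        assert s.startswith('\n')
        s = s[1:]  # skip the leading newline
    else:
        # A non-first part continues the line holding the previous
        # {expr}, so its first line is copied verbatim.
        head, sep, s = s.partition('\n')
        out.append(head + sep)
    while s:
        line, sep, s = s.partition('\n')
        if indent.startswith(line):  # line is only (part of) the indent
            out.append(sep)          # keep it as a bare newline
        elif line.startswith(indent):
            out.append(line[len(indent):] + sep)
        else:
            raise SyntaxError("d-string line missing valid indentation")
    return ''.join(out)

assert dedent_part('\n    a\n\n    b\n    ', '    ', True) == 'a\n\nb\n'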

static expr_ty
-_PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* token) {
+_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw,
+                             const char *indent, Py_ssize_t indent_len,
+                             expr_ty constant, Token* token)
+{
assert(PyUnicode_CheckExact(constant->v.Constant.value));

const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value);
if (bstr == NULL) {
return NULL;
}
is_raw = is_raw || strchr(bstr, '\\') == NULL;

size_t len;
if (strcmp(bstr, "{{") == 0 || strcmp(bstr, "}}") == 0) {
len = 1;
} else {
len = strlen(bstr);
PyObject *str = NULL;
if (indent_len > 0) {
str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent, indent_len,
is_first, is_raw, constant, token);
}
else {
str = _PyPegen_decode_string(p, is_raw, bstr, strlen(bstr), token);
}

-is_raw = is_raw || strchr(bstr, '\\') == NULL;
-PyObject *str = _PyPegen_decode_string(p, is_raw, bstr, len, token);
if (str == NULL) {
_Pypegen_raise_decode_error(p);
return NULL;
@@ -1323,6 +1445,14 @@ _PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* tok
p->arena);
}

/* defined in unicodeobject.c */
extern Py_ssize_t
_Py_search_longest_common_leading_whitespace(
const char *const src,
const char *const end,
const char **output
);

static asdl_expr_seq *
_get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b, enum string_kind_t string_kind)
{
Expand All @@ -1340,12 +1470,82 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
return NULL;
}
int is_raw = strpbrk(quote_str, "rR") != NULL;
int is_dedent = strpbrk(quote_str, "dD") != NULL;

asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena);
if (seq == NULL) {
return NULL;
}

const char *common_indent_start = NULL;
Py_ssize_t common_indent_len = 0;

if (is_dedent) {
if (total_items == 0) {
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
a,
"d-string must start with a newline"
);
return NULL;
}
expr_ty first_item = asdl_seq_GET(raw_expressions, 0);
if (first_item->kind != Constant_kind
|| PyUnicode_ReadChar(first_item->v.Constant.value, 0) != '\n') {
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
first_item,
"d-string must start with a newline"
);
return NULL;
}

// Instead of calculating the common indent across all parts separately,
// build a temporary string and calculate the common indent from that.
PyBytesWriter *w = PyBytesWriter_Create(0);
if (w == NULL) {
return NULL;
}

for (Py_ssize_t i = 0; i < n_items; i++) {
expr_ty item = asdl_seq_GET(raw_expressions, i);

if (item->kind == JoinedStr_kind) {
// Write a placeholder.
if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) {
PyBytesWriter_Discard(w);
return NULL;
}
continue;
}
if (item->kind == Constant_kind) {
Py_ssize_t blen;
const char *bstr = PyUnicode_AsUTF8AndSize(item->v.Constant.value, &blen);
if (bstr == NULL || PyBytesWriter_WriteBytes(w, bstr, blen) < 0) {
PyBytesWriter_Discard(w);
return NULL;
}
continue;
}
}
// Add a terminator to include the last line before the ending quote
if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) {
PyBytesWriter_Discard(w);
return NULL;
}

// TODO: instead of creating temp_bytes, we could search for the
// common indent in each part directly. But this needs a reimplementation
// of _Py_search_longest_common_leading_whitespace.
PyObject *temp_bytes = PyBytesWriter_Finish(w);
if (temp_bytes == NULL) {
return NULL;
}
_PyArena_AddPyObject(p->arena, temp_bytes);
const char *temp_str = PyBytes_AsString(temp_bytes);
Comment on lines +1501 to +1543
⚠️ Potential issue | 🟡 Minor

Check _PyArena_AddPyObject failure for temp_bytes.
If arena insertion fails, the error is ignored and temp_bytes leaks while an exception is set.

Proposed fix
-        _PyArena_AddPyObject(p->arena, temp_bytes);
+        if (_PyArena_AddPyObject(p->arena, temp_bytes) < 0) {
+            Py_DECREF(temp_bytes);
+            return NULL;
+        }
🤖 Prompt for AI Agents
In `@Parser/action_helpers.c` around lines 1475 - 1517, After creating temp_bytes
via PyBytesWriter_Finish, check the return value of
_PyArena_AddPyObject(p->arena, temp_bytes); if it fails (non-zero) then
DECREF/clean up temp_bytes (e.g., Py_DECREF(temp_bytes) or PyBytesWriter_Discard
equivalent) and return NULL instead of proceeding to use temp_bytes; ensure you
only call PyBytes_AsString(temp_bytes) after a successful _PyArena_AddPyObject
to avoid leaking temp_bytes on arena insertion failure and to propagate the
existing exception.

const char *temp_end = temp_str + PyBytes_GET_SIZE(temp_bytes);
common_indent_len = _Py_search_longest_common_leading_whitespace(
temp_str, temp_end, &common_indent_start);
}
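The placeholder trick above can be modeled in a few lines of Python (a hypothetical sketch mirroring the 'X' terminator convention; skipping blank lines is an assumption about the C helper's behavior):

import os.path

def dstring_common_indent(parts):
    # parts: literal text segments, with None marking each {expr} field.
    # 'X' placeholders keep interpolation-only lines from looking blank,
    # and a trailing 'X' stands in for the line holding the closing quote.
    probe = ''.join('X' if p is None else p for p in parts) + 'X'
    lines = probe.split('\n')[1:]  # a d-string starts with a newline
    indents = [ln[:len(ln) - len(ln.lstrip(' \t'))]
               for ln in lines if ln.strip(' \t')]
    return os.path.commonprefix(indents)

# d'''\n    hello {name}!\n    ''' -> common indent of four spaces
assert dstring_common_indent(['\n    hello ', None, '!\n    ']) == '    '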

Py_ssize_t index = 0;
for (Py_ssize_t i = 0; i < n_items; i++) {
expr_ty item = asdl_seq_GET(raw_expressions, i);
@@ -1377,7 +1577,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
}

if (item->kind == Constant_kind) {
-item = _PyPegen_decode_fstring_part(p, is_raw, item, b);
+item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, common_indent_start, common_indent_len, item, b);
if (item == NULL) {
return NULL;
}