forked from python/cpython
PEP 822: d-string draft implementation #108

Open: methane wants to merge 6 commits into main from peps/0822-d-string

Changes from all commits (6):

- c354b9c: first implementation of d-string (methane)
- 10a5073: fix test_tokenize (methane)
- 21cc35a: use least indent instead of closing quote indent (methane)
- 8104370: fix bugs (methane)
- b40003b: fix invalid escape sequences position (methane)
- 57a3451: improve tests (methane)
New test file (51 lines added). Note: the diff rendering collapsed runs of spaces inside the test cases; the indentation below is reconstructed so that each case is consistent with the "least indent" dedent rule.

```python
import unittest


# All valid d-string prefix spellings: d alone and combined with b/f/t/r,
# in lower and upper case.
_dstring_prefixes = "d db df dt dr drb drf drt".split()
_dstring_prefixes += [p.upper() for p in _dstring_prefixes]


class DStringTestCase(unittest.TestCase):
    def assertAllRaise(self, exception_type, regex, error_strings):
        for str in error_strings:
            with self.subTest(str=str):
                with self.assertRaisesRegex(exception_type, regex) as cm:
                    eval(str)

    def test_single_quote(self):
        exprs = [
            f"{p}'hello, world'" for p in _dstring_prefixes
        ] + [
            f'{p}"hello, world"' for p in _dstring_prefixes
        ]
        self.assertAllRaise(SyntaxError, "d-string must be triple-quoted", exprs)

    def test_empty_dstring(self):
        exprs = [
            f"{p}''''''" for p in _dstring_prefixes
        ] + [
            f'{p}""""""' for p in _dstring_prefixes
        ]
        self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs)

    def test_simple_dstring(self):
        cases = [
            ('{prefix}"""\n hello world\n """', "hello world\n"),
            ('{prefix}"""\n  hello world\n """', " hello world\n"),
            ('{prefix}"""\n hello world\n"""', " hello world\n"),
            ('{prefix}"""\n  hello world\\\n """', " hello world"),
            ('{prefix}"""\n  hello world\\\\\n """', " hello world\\\n"),
        ]

        for p in _dstring_prefixes:
            bstring = 'b' in p.lower()
            rstring = 'r' in p.lower()
            for source, expected in cases:
                source = source.format(prefix=p)
                if rstring:
                    expected = expected.replace('\\', '\\\\').replace('\n', '\\n')
                if bstring:
                    expected = expected.encode()
                self.assertEqual(eval(source), expected)


if __name__ == '__main__':
    unittest.main()
```
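
A pure-Python model of the dedent rule these cases encode may help in review. This is a sketch of my own, not code from the PR: it strips the longest common leading whitespace of the body's lines, where the line holding the closing quote counts as the last line. Escape processing (the two backslash cases) is left out.

```python
import os.path

def dedent_dstring_body(body: str) -> str:
    """Hypothetical model of d-string dedenting; not the PR's code.

    `body` is the raw text between the triple quotes,
    e.g. "\\n hello world\\n " for the first test case.
    """
    lines = body.split("\n")
    assert lines[0] == "", "d-string must start with a newline"
    lines = lines[1:]
    # Model the implementation's terminator: the line holding the closing
    # quote participates in the common-indent computation.
    lines[-1] += "X"
    indents = [ln[: len(ln) - len(ln.lstrip(" \t"))]
               for ln in lines if ln.strip()]
    common = os.path.commonprefix(indents)
    out = [ln[len(common):] if ln.strip() else "" for ln in lines]
    out[-1] = out[-1][:-1]  # drop the terminator again
    return "\n".join(out)

# The three non-backslash cases from test_simple_dstring:
assert dedent_dstring_body("\n hello world\n ") == "hello world\n"
assert dedent_dstring_body("\n  hello world\n ") == " hello world\n"
assert dedent_dstring_body("\n hello world\n") == " hello world\n"
```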
The parser changes. First, new helpers for writing and dedenting d-string lines, and a new signature for `_PyPegen_decode_fstring_part`:

```diff
@@ -1292,24 +1292,146 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq
 // Fstring stuff
 
+static int
+unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start, const char *line_end,
+                         int is_raw, Token* token)
+{
+    if (is_raw || memchr(line_start, '\\', line_end - line_start) == NULL) {
+        return PyUnicodeWriter_WriteUTF8(w, line_start, line_end - line_start);
+    }
+    else {
+        PyObject *line = _PyPegen_decode_string(p, 1, line_start, line_end - line_start, token);
+        if (line == NULL || PyUnicodeWriter_WriteStr(w, line) < 0) {
+            Py_XDECREF(line);
+            return -1;
+        }
+        Py_DECREF(line);
+    }
+    return 0;
+}
+
+static PyObject*
+_PyPegen_dedent_string_part(
+    Parser *p, const char *s, size_t len, const char *indent, Py_ssize_t indent_len,
+    int is_first, int is_raw, expr_ty constant, Token* token)
+{
+    Py_ssize_t lineno = constant->lineno;
+    const char *line_start = s;
+    const char *s_end = s + len;
+
+    int _prev_call_invalid = p->call_invalid_rules;
+    if (!_prev_call_invalid && !is_raw) {
+        // _PyPegen_decode_string() and decode_bytes_with_escapes() may call
+        // warn_invalid_escape_sequence(), which may emit a warning or raise a
+        // SyntaxError for invalid escape sequences.
+        // We need to call it before dedenting, since the SyntaxError needs the
+        // exact lineno and col_offset of the invalid escape sequence.
+        PyObject *t = _PyPegen_decode_string(p, 0, s, len, token);
+        if (t == NULL) {
+            return NULL;
+        }
+        Py_DECREF(t);
+        p->call_invalid_rules = 1;
+    }
+
+    PyUnicodeWriter *w = PyUnicodeWriter_Create(len);
+    if (w == NULL) {
+        return NULL;
+    }
+
+    if (is_first) {
+        assert(line_start[0] == '\n');
+        line_start++;  // skip the first newline
+    }
+    else {
+        // Example: df"""
+        //     first part {param} second part
+        //     next line
+        //     """
+        // We don't need to dedent the first line of non-first parts.
+        const char *line_end = memchr(line_start, '\n', s_end - line_start);
+        if (line_end) {
+            line_end++;  // include the newline
+        }
+        else {
+            line_end = s_end;
+        }
+        if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
+            goto error;
+        }
+        line_start = line_end;
+    }
+
+    while (line_start < s + len) {
+        lineno++;
+
+        Py_ssize_t i = 0;
+        while (line_start + i < s_end && i < indent_len && line_start[i] == indent[i]) {
+            i++;
+        }
+
+        if (line_start[i] == '\0') {  // found an empty line without a newline.
+            break;
+        }
+        if (line_start[i] == '\n') {  // found an empty line with a newline.
+            if (PyUnicodeWriter_WriteChar(w, '\n') < 0) {
+                goto error;
+            }
+            line_start += i + 1;
+            continue;
+        }
+        if (i < indent_len) {  // found an invalid indent.
+            assert(line_start[i] != indent[i]);
+            RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i + 1,
+                                       "d-string line missing valid indentation");
+            goto error;
+        }
+
+        // found an indented line; dedent it.
+        line_start += i;
+        const char *line_end = memchr(line_start, '\n', s_end - line_start);
+        if (line_end) {
+            line_end++;  // include the newline
+        }
+        else {
+            line_end = s_end;
+        }
+        if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
+            goto error;
+        }
+        line_start = line_end;
+    }
+    p->call_invalid_rules = _prev_call_invalid;
+    return PyUnicodeWriter_Finish(w);
+
+error:
+    p->call_invalid_rules = _prev_call_invalid;
+    PyUnicodeWriter_Discard(w);
+    return NULL;
+}
+
 static expr_ty
-_PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* token) {
+_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw,
+                             const char *indent, Py_ssize_t indent_len,
+                             expr_ty constant, Token* token)
+{
     assert(PyUnicode_CheckExact(constant->v.Constant.value));
 
     const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value);
     if (bstr == NULL) {
         return NULL;
     }
+    is_raw = is_raw || strchr(bstr, '\\') == NULL;
 
-    size_t len;
-    if (strcmp(bstr, "{{") == 0 || strcmp(bstr, "}}") == 0) {
-        len = 1;
-    } else {
-        len = strlen(bstr);
+    PyObject *str = NULL;
+    if (indent_len > 0) {
+        str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent, indent_len,
+                                          is_first, is_raw, constant, token);
+    }
+    else {
+        str = _PyPegen_decode_string(p, is_raw, bstr, strlen(bstr), token);
     }
 
-    is_raw = is_raw || strchr(bstr, '\\') == NULL;
-    PyObject *str = _PyPegen_decode_string(p, is_raw, bstr, len, token);
     if (str == NULL) {
         _Pypegen_raise_decode_error(p);
         return NULL;
```
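
The per-part loop above, restated as a hypothetical Python sketch (my names, not the PR's; escape decoding, lineno tracking, and exact error offsets are omitted). Each line after the first must either be blank or start with the common indent; matching lines are emitted with the indent removed:

```python
def dedent_part(part: str, indent: str, is_first: bool) -> str:
    """Hypothetical sketch of _PyPegen_dedent_string_part's loop."""
    lines = part.split("\n")
    out = []
    if is_first:
        assert lines[0] == ""  # a d-string body starts right after a newline
        lines = lines[1:]
    else:
        # The first line of a non-first part continues the source line that
        # held the preceding {interpolation}, so it is not dedented.
        out.append(lines.pop(0))
    for line in lines:
        # Count how far this line agrees with the common indent.
        i = 0
        while i < len(line) and i < len(indent) and line[i] == indent[i]:
            i += 1
        rest = line[i:]
        if rest == "":
            out.append("")  # blank line, or the indent before the closing quote
            continue
        if i < len(indent):
            raise SyntaxError("d-string line missing valid indentation")
        out.append(rest)
    return "\n".join(out)

print(repr(dedent_part("\n    hello\n      world\n    ", "    ", True)))
# 'hello\n  world\n'
```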
Next, a declaration for the common-whitespace search helper used below:

```diff
@@ -1323,6 +1445,14 @@ _PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* tok
                            p->arena);
 }
 
+/* defined in unicodeobject.c */
+extern Py_ssize_t
+_Py_search_longest_common_leading_whitespace(
+    const char *const src,
+    const char *const end,
+    const char **output
+);
+
 static asdl_expr_seq *
 _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b, enum string_kind_t string_kind)
 {
```
In `_get_resized_exprs`, the d prefix is detected and the common indent is computed over all parts before any part is decoded:

```diff
@@ -1340,12 +1470,82 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
         return NULL;
     }
     int is_raw = strpbrk(quote_str, "rR") != NULL;
+    int is_dedent = strpbrk(quote_str, "dD") != NULL;
 
     asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena);
     if (seq == NULL) {
         return NULL;
     }
 
+    const char *common_indent_start = NULL;
+    Py_ssize_t common_indent_len = 0;
+
+    if (is_dedent) {
+        if (total_items == 0) {
+            RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
+                a,
+                "d-string must start with a newline"
+            );
+            return NULL;
+        }
+        expr_ty first_item = asdl_seq_GET(raw_expressions, 0);
+        if (first_item->kind != Constant_kind
+            || PyUnicode_ReadChar(first_item->v.Constant.value, 0) != '\n') {
+            RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
+                first_item,
+                "d-string must start with a newline"
+            );
+            return NULL;
+        }
+
+        // Instead of calculating the common indent from all parts,
+        // build a temporary string and calculate the common indent from it.
+        PyBytesWriter *w = PyBytesWriter_Create(0);
+        if (w == NULL) {
+            return NULL;
+        }
+
+        for (Py_ssize_t i = 0; i < n_items; i++) {
+            expr_ty item = asdl_seq_GET(raw_expressions, i);
+
+            if (item->kind == JoinedStr_kind) {
+                // Write a placeholder.
+                if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) {
+                    PyBytesWriter_Discard(w);
+                    return NULL;
+                }
+                continue;
+            }
+            if (item->kind == Constant_kind) {
+                Py_ssize_t blen;
+                const char *bstr = PyUnicode_AsUTF8AndSize(item->v.Constant.value, &blen);
+                if (bstr == NULL || PyBytesWriter_WriteBytes(w, bstr, blen) < 0) {
+                    PyBytesWriter_Discard(w);
+                    return NULL;
+                }
+                continue;
+            }
+        }
+        // Add a terminator to include the last line before the closing quote.
+        if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) {
+            PyBytesWriter_Discard(w);
+            return NULL;
+        }
+
+        // TODO: instead of creating temp_bytes, we could search the common
+        // indent in each part directly, but that needs a reimplementation
+        // of _Py_search_longest_common_leading_whitespace.
+        PyObject *temp_bytes = PyBytesWriter_Finish(w);
+        if (temp_bytes == NULL) {
+            return NULL;
+        }
+        _PyArena_AddPyObject(p->arena, temp_bytes);
+        const char *temp_str = PyBytes_AsString(temp_bytes);
+        const char *temp_end = temp_str + PyBytes_GET_SIZE(temp_bytes);
+        common_indent_len = _Py_search_longest_common_leading_whitespace(
+            temp_str, temp_end, &common_indent_start);
+    }
+
     Py_ssize_t index = 0;
     for (Py_ssize_t i = 0; i < n_items; i++) {
         expr_ty item = asdl_seq_GET(raw_expressions, i);
```

Review comment on lines +1501 to +1543: the result of `_PyArena_AddPyObject` is ignored; it can fail and should be checked before `temp_bytes` is used.

Proposed fix:

```diff
-    _PyArena_AddPyObject(p->arena, temp_bytes);
+    if (_PyArena_AddPyObject(p->arena, temp_bytes) < 0) {
+        Py_DECREF(temp_bytes);
+        return NULL;
+    }
```
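
The placeholder trick above, sketched as hypothetical Python (my names, not the PR's): every interpolation collapses to a single non-whitespace byte "X", so an interpolated line never looks blank and its indentation still participates, and the trailing "X" makes the closing quote's indentation count as a final line:

```python
import os.path

def common_indent(parts) -> str:
    """Hypothetical sketch of the common-indent computation above.
    `parts` models the part list: str for Constant nodes, None standing in
    for JoinedStr {interpolations}."""
    temp = "".join("X" if p is None else p for p in parts)
    # Terminator: make the indent before the closing quote count as a line.
    temp += "X"
    # Longest common leading whitespace over non-blank lines, standing in
    # for _Py_search_longest_common_leading_whitespace().
    indents = [ln[: len(ln) - len(ln.lstrip(" \t"))]
               for ln in temp.split("\n") if ln.strip()]
    return os.path.commonprefix(indents) if indents else ""

# d"""                      (parts as the parser would see them)
#     a {x}
#       b
#     """
parts = ["\n    a ", None, "\n      b\n    "]
print(repr(common_indent(parts)))  # '    '
```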
Finally, each constant part is decoded with the computed indent; the first part is flagged so its leading newline is skipped:

```diff
@@ -1377,7 +1577,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
         }
 
         if (item->kind == Constant_kind) {
-            item = _PyPegen_decode_fstring_part(p, is_raw, item, b);
+            item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, common_indent_start, common_indent_len, item, b);
             if (item == NULL) {
                 return NULL;
             }
```
Review comment:

Critical: wrong `raw` parameter passed to `_PyPegen_decode_string`.

Line 1303 passes `1` (raw=true) to `_PyPegen_decode_string`, but this is in the `else` branch, where `is_raw` is known to be false. This means escape sequences like `\n`, `\t`, etc. won't be processed for non-raw d-strings; they'll be written out as literal characters.

This is likely the root cause of the line continuation issue flagged in past reviews. When escape processing is skipped, backslash-newline continuation won't work.

Proposed fix:

```diff
 static int
 unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start, const char *line_end,
                          int is_raw, Token* token)
 {
     if (is_raw || memchr(line_start, '\\', line_end - line_start) == NULL) {
         return PyUnicodeWriter_WriteUTF8(w, line_start, line_end - line_start);
     }
     else {
-        PyObject *line = _PyPegen_decode_string(p, 1, line_start, line_end - line_start, token);
+        PyObject *line = _PyPegen_decode_string(p, 0, line_start, line_end - line_start, token);
         if (line == NULL || PyUnicodeWriter_WriteStr(w, line) < 0) {
             Py_XDECREF(line);
             return -1;
         }
         Py_DECREF(line);
     }
     return 0;
 }
```
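
The practical difference the fix restores can be reproduced with the `unicode_escape` codec, which approximates what `_PyPegen_decode_string` does for non-raw parts (an illustration only, not the parser's actual code path):

```python
import codecs

# Source text of a non-raw d-string part, as the parser sees it:
part = "hello\\tworld\\\nnext"  # i.e.  hello\tworld\<newline>next

# What raw=0 should produce: escapes decoded, continuation consumed.
print(repr(codecs.decode(part, "unicode_escape")))  # 'hello\tworldnext'
# What the buggy raw=1 path emits: the text unchanged.
print(repr(part))  # 'hello\\tworld\\\nnext'
```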