Skip to content

Commit b51651a

Browse files
committed
Auto merge of #75642 - matklad:lexer-comments, r=petrochenkov
Move doc comment parsing to rustc_lexer. Plain comments are trivia, while doc comments are not, so it feels like this logic belongs in rustc_lexer. The specific reason to do this is the desire to use rustc_lexer in rustdoc for syntax highlighting, without duplicating the "is this a doc comment?" logic there. r? @ghost
2 parents ff5e0f1 + ccbe94b commit b51651a

File tree

4 files changed

+89
-97
lines changed

4 files changed

+89
-97
lines changed

src/librustc_ast/util/comments.rs

+4-44
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
use crate::ast::AttrStyle;
21
use rustc_span::source_map::SourceMap;
32
use rustc_span::{BytePos, CharPos, FileName, Pos, Symbol};
43

@@ -24,45 +23,6 @@ pub struct Comment {
2423
pub pos: BytePos,
2524
}
2625

27-
/// For a full line comment string returns its doc comment style if it's a doc comment
28-
/// and returns `None` if it's a regular comment.
29-
pub fn line_doc_comment_style(line_comment: &str) -> Option<AttrStyle> {
30-
let line_comment = line_comment.as_bytes();
31-
assert!(line_comment.starts_with(b"//"));
32-
match line_comment.get(2) {
33-
// `//!` is an inner line doc comment.
34-
Some(b'!') => Some(AttrStyle::Inner),
35-
Some(b'/') => match line_comment.get(3) {
36-
// `////` (more than 3 slashes) is not considered a doc comment.
37-
Some(b'/') => None,
38-
// Otherwise `///` is an outer line doc comment.
39-
_ => Some(AttrStyle::Outer),
40-
},
41-
_ => None,
42-
}
43-
}
44-
45-
/// For a full block comment string returns its doc comment style if it's a doc comment
46-
/// and returns `None` if it's a regular comment.
47-
pub fn block_doc_comment_style(block_comment: &str, terminated: bool) -> Option<AttrStyle> {
48-
let block_comment = block_comment.as_bytes();
49-
assert!(block_comment.starts_with(b"/*"));
50-
assert!(!terminated || block_comment.ends_with(b"*/"));
51-
match block_comment.get(2) {
52-
// `/*!` is an inner block doc comment.
53-
Some(b'!') => Some(AttrStyle::Inner),
54-
Some(b'*') => match block_comment.get(3) {
55-
// `/***` (more than 2 stars) is not considered a doc comment.
56-
Some(b'*') => None,
57-
// `/**/` is not considered a doc comment.
58-
Some(b'/') if block_comment.len() == 4 => None,
59-
// Otherwise `/**` is an outer block doc comment.
60-
_ => Some(AttrStyle::Outer),
61-
},
62-
_ => None,
63-
}
64-
}
65-
6626
/// Makes a doc string more presentable to users.
6727
/// Used by rustdoc and perhaps other tools, but not by rustc.
6828
pub fn beautify_doc_string(data: Symbol) -> String {
@@ -216,8 +176,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
216176
}
217177
}
218178
}
219-
rustc_lexer::TokenKind::BlockComment { terminated } => {
220-
if block_doc_comment_style(token_text, terminated).is_none() {
179+
rustc_lexer::TokenKind::BlockComment { doc_style, .. } => {
180+
if doc_style.is_none() {
221181
let code_to_the_right = match text[pos + token.len..].chars().next() {
222182
Some('\r' | '\n') => false,
223183
_ => true,
@@ -238,8 +198,8 @@ pub fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comme
238198
comments.push(Comment { style, lines, pos: pos_in_file })
239199
}
240200
}
241-
rustc_lexer::TokenKind::LineComment => {
242-
if line_doc_comment_style(token_text).is_none() {
201+
rustc_lexer::TokenKind::LineComment { doc_style } => {
202+
if doc_style.is_none() {
243203
comments.push(Comment {
244204
style: if code_to_the_left {
245205
CommentStyle::Trailing

src/librustc_ast/util/comments/tests.rs

-7
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,6 @@
11
use super::*;
22
use rustc_span::with_default_session_globals;
33

4-
#[test]
5-
fn line_doc_comments() {
6-
assert!(line_doc_comment_style("///").is_some());
7-
assert!(line_doc_comment_style("/// blah").is_some());
8-
assert!(line_doc_comment_style("////").is_none());
9-
}
10-
114
#[test]
125
fn test_block_doc_comment_1() {
136
with_default_session_globals(|| {

src/librustc_lexer/src/lib.rs

+30-5
Original file line number | Diff line number | Diff line change
@@ -51,12 +51,12 @@ impl Token {
5151
pub enum TokenKind {
5252
// Multi-char tokens:
5353
/// "// comment"
54-
LineComment,
54+
LineComment { doc_style: Option<DocStyle> },
5555
/// `/* block comment */`
5656
///
5757
/// Block comments can be recursive, so the sequence like `/* /* */`
5858
/// will not be considered terminated and will result in a parsing error.
59-
BlockComment { terminated: bool },
59+
BlockComment { doc_style: Option<DocStyle>, terminated: bool },
6060
/// Any whitespace characters sequence.
6161
Whitespace,
6262
/// "ident" or "continue"
@@ -129,6 +129,12 @@ pub enum TokenKind {
129129
Unknown,
130130
}
131131

132+
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
133+
pub enum DocStyle {
134+
Outer,
135+
Inner,
136+
}
137+
132138
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
133139
pub enum LiteralKind {
134140
/// "12_u8", "0o100", "0b120i99"
@@ -188,7 +194,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
188194
// a doc comment (due to `TokenKind::(Line,Block)Comment` ambiguity at lexer level),
189195
// then it may be valid Rust code, so consider it Rust code.
190196
let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok|
191-
!matches!(tok, TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment { .. })
197+
!matches!(tok, TokenKind::Whitespace | TokenKind::LineComment { .. } | TokenKind::BlockComment { .. })
192198
);
193199
if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
194200
// No other choice than to consider this a shebang.
@@ -410,13 +416,32 @@ impl Cursor<'_> {
410416
fn line_comment(&mut self) -> TokenKind {
411417
debug_assert!(self.prev() == '/' && self.first() == '/');
412418
self.bump();
419+
420+
let doc_style = match self.first() {
421+
// `//!` is an inner line doc comment.
422+
'!' => Some(DocStyle::Inner),
423+
// `////` (more than 3 slashes) is not considered a doc comment.
424+
'/' if self.second() != '/' => Some(DocStyle::Outer),
425+
_ => None,
426+
};
427+
413428
self.eat_while(|c| c != '\n');
414-
LineComment
429+
LineComment { doc_style }
415430
}
416431

417432
fn block_comment(&mut self) -> TokenKind {
418433
debug_assert!(self.prev() == '/' && self.first() == '*');
419434
self.bump();
435+
436+
let doc_style = match self.first() {
437+
// `/*!` is an inner block doc comment.
438+
'!' => Some(DocStyle::Inner),
439+
// `/***` (more than 2 stars) is not considered a doc comment.
440+
// `/**/` is not considered a doc comment.
441+
'*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer),
442+
_ => None,
443+
};
444+
420445
let mut depth = 1usize;
421446
while let Some(c) = self.bump() {
422447
match c {
@@ -438,7 +463,7 @@ impl Cursor<'_> {
438463
}
439464
}
440465

441-
BlockComment { terminated: depth == 0 }
466+
BlockComment { doc_style, terminated: depth == 0 }
442467
}
443468

444469
fn whitespace(&mut self) -> TokenKind {

src/librustc_parse/lexer/mod.rs

+55-41
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
1+
use rustc_ast::ast::AttrStyle;
12
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
2-
use rustc_ast::util::comments;
33
use rustc_data_structures::sync::Lrc;
44
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
55
use rustc_lexer::Base;
@@ -15,7 +15,7 @@ mod tokentrees;
1515
mod unescape_error_reporting;
1616
mod unicode_chars;
1717

18-
use rustc_lexer::unescape::Mode;
18+
use rustc_lexer::{unescape::Mode, DocStyle};
1919
use unescape_error_reporting::{emit_unescape_error, push_escaped_char};
2020

2121
#[derive(Clone, Debug)]
@@ -168,25 +168,23 @@ impl<'a> StringReader<'a> {
168168
/// symbols and runs additional validation.
169169
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> TokenKind {
170170
match token {
171-
rustc_lexer::TokenKind::LineComment => {
172-
let string = self.str_from(start);
173-
if let Some(attr_style) = comments::line_doc_comment_style(string) {
174-
self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
175-
// Opening delimiter of the length 3 is not included into the symbol.
176-
token::DocComment(CommentKind::Line, attr_style, Symbol::intern(&string[3..]))
177-
} else {
178-
token::Comment
171+
rustc_lexer::TokenKind::LineComment { doc_style } => {
172+
match doc_style {
173+
Some(doc_style) => {
174+
// Opening delimiter of the length 3 is not included into the symbol.
175+
let content_start = start + BytePos(3);
176+
let content = self.str_from(content_start);
177+
178+
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
179+
}
180+
None => token::Comment,
179181
}
180182
}
181-
rustc_lexer::TokenKind::BlockComment { terminated } => {
182-
let string = self.str_from(start);
183-
let attr_style = comments::block_doc_comment_style(string, terminated);
184-
183+
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
185184
if !terminated {
186-
let msg = if attr_style.is_some() {
187-
"unterminated block doc-comment"
188-
} else {
189-
"unterminated block comment"
185+
let msg = match doc_style {
186+
Some(_) => "unterminated block doc-comment",
187+
None => "unterminated block comment",
190188
};
191189
let last_bpos = self.pos;
192190
self.sess
@@ -199,18 +197,17 @@ impl<'a> StringReader<'a> {
199197
.emit();
200198
FatalError.raise();
201199
}
202-
203-
if let Some(attr_style) = attr_style {
204-
self.forbid_bare_cr(start, string, "bare CR not allowed in block doc-comment");
205-
// Opening delimiter of the length 3 and closing delimiter of the length 2
206-
// are not included into the symbol.
207-
token::DocComment(
208-
CommentKind::Block,
209-
attr_style,
210-
Symbol::intern(&string[3..string.len() - if terminated { 2 } else { 0 }]),
211-
)
212-
} else {
213-
token::Comment
200+
match doc_style {
201+
Some(doc_style) => {
202+
// Opening delimiter of the length 3 and closing delimiter of the length 2
203+
// are not included into the symbol.
204+
let content_start = start + BytePos(3);
205+
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
206+
let content = self.str_from_to(content_start, content_end);
207+
208+
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
209+
}
210+
None => token::Comment,
214211
}
215212
}
216213
rustc_lexer::TokenKind::Whitespace => token::Whitespace,
@@ -319,6 +316,34 @@ impl<'a> StringReader<'a> {
319316
}
320317
}
321318

319+
fn cook_doc_comment(
320+
&self,
321+
content_start: BytePos,
322+
content: &str,
323+
comment_kind: CommentKind,
324+
doc_style: DocStyle,
325+
) -> TokenKind {
326+
if content.contains('\r') {
327+
for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
328+
self.err_span_(
329+
content_start + BytePos(idx as u32),
330+
content_start + BytePos(idx as u32 + 1),
331+
match comment_kind {
332+
CommentKind::Line => "bare CR not allowed in doc-comment",
333+
CommentKind::Block => "bare CR not allowed in block doc-comment",
334+
},
335+
);
336+
}
337+
}
338+
339+
let attr_style = match doc_style {
340+
DocStyle::Outer => AttrStyle::Outer,
341+
DocStyle::Inner => AttrStyle::Inner,
342+
};
343+
344+
token::DocComment(comment_kind, attr_style, Symbol::intern(content))
345+
}
346+
322347
fn cook_lexer_literal(
323348
&self,
324349
start: BytePos,
@@ -472,17 +497,6 @@ impl<'a> StringReader<'a> {
472497
&self.src[self.src_index(start)..self.src_index(end)]
473498
}
474499

475-
fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
476-
let mut idx = 0;
477-
loop {
478-
idx = match s[idx..].find('\r') {
479-
None => break,
480-
Some(it) => idx + it + 1,
481-
};
482-
self.err_span_(start + BytePos(idx as u32 - 1), start + BytePos(idx as u32), errmsg);
483-
}
484-
}
485-
486500
fn report_raw_str_error(&self, start: BytePos, opt_err: Option<RawStrError>) {
487501
match opt_err {
488502
Some(RawStrError::InvalidStarter { bad_char }) => {

0 commit comments

Comments
 (0)