rust-lang · matklad · Oct 10, 2019
diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs
@@ -268,6 +268,7 @@ impl Cursor<'_> {
         loop {
             match self.nth_char(0) {
                 '\n' => break,
+                '\r' if self.nth_char(1) == '\n' => break,
                 EOF_CHAR if self.is_eof() => break,
                 _ => {
                     self.bump();
@@ -440,6 +441,7 @@ impl Cursor<'_> {
             match self.nth_char(0) {
                 '/' if !first => break,
                 '\n' if self.nth_char(1) != '\'' => break,
+                '\r' if self.nth_char(1) == '\n' => break,
                 EOF_CHAR if self.is_eof() => break,
                 '\'' => {
                     self.bump();

diff --git a/src/librustc_lexer/src/unescape.rs b/src/librustc_lexer/src/unescape.rs
@@ -128,7 +128,11 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
     if first_char != '\\' {
         return match first_char {
             '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
-            '\r' => Err(EscapeError::BareCarriageReturn),
+            '\r' => Err(if chars.clone().next() == Some('\n') {
+                EscapeError::EscapeOnlyChar
+            } else {
+                EscapeError::BareCarriageReturn
+            }),
             '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
             '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
             _ => {
@@ -240,15 +244,27 @@ where
 
         let unescaped_char = match first_char {
             '\\' => {
-                let second_char = chars.clone().next();
-                match second_char {
-                    Some('\n') => {
+                let (second_char, third_char) = {
+                    let mut chars = chars.clone();
+                    (chars.next(), chars.next())
+                };
+                match (second_char, third_char) {
+                    (Some('\n'), _) | (Some('\r'), Some('\n')) => {
                         skip_ascii_whitespace(&mut chars);
                         continue;
                     }
                     _ => scan_escape(first_char, &mut chars, mode),
                 }
             }
+            '\r' => {
+                let second_char = chars.clone().next();
+                if second_char == Some('\n') {
+                    chars.next();
+                    Ok('\n')
+                } else {
+                    scan_escape(first_char, &mut chars, mode)
+                }
+            }
             '\n' => Ok('\n'),
             '\t' => Ok('\t'),
             _ => scan_escape(first_char, &mut chars, mode),
@@ -282,11 +298,15 @@ where
     while let Some(curr) = chars.next() {
         let start = initial_len - chars.as_str().len() - curr.len_utf8();
 
-        let result = match curr {
-            '\r' => Err(EscapeError::BareCarriageReturnInRawString),
-            c if mode.is_bytes() && !c.is_ascii() =>
+        let result = match (curr, chars.clone().next()) {
+            ('\r', Some('\n')) => {
+                chars.next();
+                Ok('\n')
+            },
+            ('\r', _) => Err(EscapeError::BareCarriageReturnInRawString),
+            (c, _) if mode.is_bytes() && !c.is_ascii() =>
                 Err(EscapeError::NonAsciiCharInByteString),
-            c => Ok(c),
+            (c, _) => Ok(c),
         };
         let end = initial_len - chars.as_str().len();
 

diff --git a/src/librustc_lexer/src/unescape/tests.rs b/src/librustc_lexer/src/unescape/tests.rs
@@ -11,6 +11,7 @@ fn test_unescape_char_bad() {
     check(r"\", EscapeError::LoneSlash);
 
     check("\n", EscapeError::EscapeOnlyChar);
+    check("\r\n", EscapeError::EscapeOnlyChar);
     check("\t", EscapeError::EscapeOnlyChar);
     check("'", EscapeError::EscapeOnlyChar);
     check("\r", EscapeError::BareCarriageReturn);
@@ -30,7 +31,6 @@ fn test_unescape_char_bad() {
     check(r"\v", EscapeError::InvalidEscape);
     check(r"\💩", EscapeError::InvalidEscape);
     check(r"\●", EscapeError::InvalidEscape);
-    check("\\\r", EscapeError::InvalidEscape);
 
     check(r"\x", EscapeError::TooShortHexEscape);
     check(r"\x0", EscapeError::TooShortHexEscape);
@@ -116,9 +116,10 @@ fn test_unescape_str_good() {
 
     check("foo", "foo");
     check("", "");
-    check(" \t\n", " \t\n");
+    check(" \t\n\r\n", " \t\n\n");
 
     check("hello \\\n     world", "hello world");
+    check("hello \\\r\n     world", "hello world");
     check("thread's", "thread's")
 }
 
@@ -133,6 +134,7 @@ fn test_unescape_byte_bad() {
     check(r"\", EscapeError::LoneSlash);
 
     check("\n", EscapeError::EscapeOnlyChar);
+    check("\r\n", EscapeError::EscapeOnlyChar);
     check("\t", EscapeError::EscapeOnlyChar);
     check("'", EscapeError::EscapeOnlyChar);
     check("\r", EscapeError::BareCarriageReturn);
@@ -236,9 +238,10 @@ fn test_unescape_byte_str_good() {
 
     check("foo", b"foo");
     check("", b"");
-    check(" \t\n", b" \t\n");
+    check(" \t\n\r\n", b" \t\n\n");
 
     check("hello \\\n     world", b"hello world");
+    check("hello \\\r\n     world", b"hello world");
     check("thread's", b"thread's")
 }
 
@@ -250,6 +253,7 @@ fn test_unescape_raw_str() {
         assert_eq!(unescaped, expected);
     }
 
+    check("\r\n", &[(0..2, Ok('\n'))]);
     check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
     check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]);
 }
@@ -262,6 +266,7 @@ fn test_unescape_raw_byte_str() {
         assert_eq!(unescaped, expected);
     }
 
+    check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]);
     check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
     check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]);
     check(

diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
@@ -8,7 +8,9 @@ use syntax_pos::{BytePos, Pos, Span};
 use rustc_lexer::Base;
 use rustc_lexer::unescape;
 
+use std::borrow::Cow;
 use std::char;
+use std::iter;
 use std::convert::TryInto;
 use rustc_data_structures::sync::Lrc;
 use log::debug;
@@ -179,7 +181,18 @@ impl<'a> StringReader<'a> {
                 let string = self.str_from(start);
                 // comments with only more "/"s are not doc comments
                 let tok = if is_doc_comment(string) {
-                    self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
+                    let mut idx = 0;
+                    loop {
+                        idx = match string[idx..].find('\r') {
+                            None => break,
+                            Some(it) => idx + it + 1
+                        };
+                        if string[idx..].chars().next() != Some('\n') {
+                            self.err_span_(start + BytePos(idx as u32 - 1),
+                                            start + BytePos(idx as u32),
+                                            "bare CR not allowed in doc-comment");
+                        }
+                    }
                     token::DocComment(Symbol::intern(string))
                 } else {
                     token::Comment
@@ -204,10 +217,15 @@ impl<'a> StringReader<'a> {
                 }
 
                 let tok = if is_doc_comment {
-                    self.forbid_bare_cr(start,
-                                        string,
-                                        "bare CR not allowed in block doc-comment");
-                    token::DocComment(Symbol::intern(string))
+                    let has_cr = string.contains('\r');
+                    let string = if has_cr {
+                        self.translate_crlf(start,
+                                            string,
+                                            "bare CR not allowed in block doc-comment")
+                    } else {
+                        string.into()
+                    };
+                    token::DocComment(Symbol::intern(&string[..]))
                 } else {
                     token::Comment
                 };
@@ -473,16 +491,49 @@ impl<'a> StringReader<'a> {
         &self.src[self.src_index(start)..self.src_index(end)]
     }
 
-    fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
-        let mut idx = 0;
-        loop {
-            idx = match s[idx..].find('\r') {
-                None => break,
-                Some(it) => idx + it + 1
-            };
-            self.err_span_(start + BytePos(idx as u32 - 1),
-                           start + BytePos(idx as u32),
-                           errmsg);
+    /// Converts CRLF to LF in the given string, raising an error on bare CR.
+    fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
+        let mut chars = s.char_indices().peekable();
+        while let Some((i, ch)) = chars.next() {
+            if ch == '\r' {
+                if let Some((lf_idx, '\n')) = chars.peek() {
+                    return translate_crlf_(self, start, s, *lf_idx, chars, errmsg).into();
+                }
+                let pos = start + BytePos(i as u32);
+                let end_pos = start + BytePos((i + ch.len_utf8()) as u32);
+                self.err_span_(pos, end_pos, errmsg);
+            }
+        }
+        return s.into();
+
+        fn translate_crlf_(rdr: &StringReader<'_>,
+                           start: BytePos,
+                           s: &str,
+                           mut j: usize,
+                           mut chars: iter::Peekable<impl Iterator<Item = (usize, char)>>,
+                           errmsg: &str)
+                           -> String {
+            let mut buf = String::with_capacity(s.len());
+            // Skip first CR
+            buf.push_str(&s[.. j - 1]);
+            while let Some((i, ch)) = chars.next() {
+                if ch == '\r' {
+                    if j < i {
+                        buf.push_str(&s[j..i]);
+                    }
+                    let next = i + ch.len_utf8();
+                    j = next;
+                    if chars.peek().map(|(_, ch)| *ch) != Some('\n') {
+                        let pos = start + BytePos(i as u32);
+                        let end_pos = start + BytePos(next as u32);
+                        rdr.err_span_(pos, end_pos, errmsg);
+                    }
+                }
+            }
+            if j < s.len() {
+                buf.push_str(&s[j..]);
+            }
+            buf
         }
     }
 

diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs
@@ -1064,7 +1064,6 @@ impl SourceFile {
                mut src: String,
                start_pos: BytePos) -> Result<SourceFile, OffsetOverflowError> {
         remove_bom(&mut src);
-        normalize_newlines(&mut src);
 
         let src_hash = {
             let mut hasher: StableHasher<u128> = StableHasher::new();
@@ -1232,61 +1231,6 @@ fn remove_bom(src: &mut String) {
     }
 }
 
-
-/// Replaces `\r\n` with `\n` in-place in `src`.
-///
-/// Returns error if there's a lone `\r` in the string
-fn normalize_newlines(src: &mut String) {
-    if !src.as_bytes().contains(&b'\r') {
-        return;
-    }
-
-    // We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding.
-    // While we *can* call `as_mut_vec` and do surgery on the live string
-    // directly, let's rather steal the contents of `src`. This makes the code
-    // safe even if a panic occurs.
-
-    let mut buf = std::mem::replace(src, String::new()).into_bytes();
-    let mut gap_len = 0;
-    let mut tail = buf.as_mut_slice();
-    loop {
-        let idx = match find_crlf(&tail[gap_len..]) {
-            None => tail.len(),
-            Some(idx) => idx + gap_len,
-        };
-        tail.copy_within(gap_len..idx, 0);
-        tail = &mut tail[idx - gap_len..];
-        if tail.len() == gap_len {
-            break;
-        }
-        gap_len += 1;
-    }
-
-    // Account for removed `\r`.
-    // After `set_len`, `buf` is guaranteed to contain utf-8 again.
-    let new_len = buf.len() - gap_len;
-    unsafe {
-        buf.set_len(new_len);
-        *src = String::from_utf8_unchecked(buf);
-    }
-
-    fn find_crlf(src: &[u8]) -> Option<usize> {
-        let mut search_idx = 0;
-        while let Some(idx) = find_cr(&src[search_idx..]) {
-            if src[search_idx..].get(idx + 1) != Some(&b'\n') {
-                search_idx += idx + 1;
-                continue;
-            }
-            return Some(search_idx + idx);
-        }
-        None
-    }
-
-    fn find_cr(src: &[u8]) -> Option<usize> {
-        src.iter().position(|&b| b == b'\r')
-    }
-}
-
 // _____________________________________________________________________________
 // Pos, BytePos, CharPos
 //

diff --git a/src/libsyntax_pos/tests.rs b/src/libsyntax_pos/tests.rs
@@ -16,23 +16,3 @@ fn test_lookup_line() {
     assert_eq!(lookup_line(lines, BytePos(28)), 2);
     assert_eq!(lookup_line(lines, BytePos(29)), 2);
 }
-
-#[test]
-fn test_normalize_newlines() {
-    fn check(before: &str, after: &str) {
-        let mut actual = before.to_string();
-        normalize_newlines(&mut actual);
-        assert_eq!(actual.as_str(), after);
-    }
-    check("", "");
-    check("\n", "\n");
-    check("\r", "\r");
-    check("\r\r", "\r\r");
-    check("\r\n", "\n");
-    check("hello world", "hello world");
-    check("hello\nworld", "hello\nworld");
-    check("hello\r\nworld", "hello\nworld");
-    check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n");
-    check("\r\r\n", "\r\n");
-    check("hello\rworld", "hello\rworld");
-}