// Copyright 2014-2025 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8, WINDOWS_1252, X_USER_DEFINED};
use tendril::{fmt::Bytes, Tendril};

use crate::buffer_queue::BufferQueue;
| 14 | + |
/// How much trust is placed in the character encoding chosen for the input stream.
///
/// <https://html.spec.whatwg.org/#concept-encoding-confidence>
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Confidence {
    /// The encoding is provisional and may still be replaced
    /// (see the linked spec section for the exact definition).
    Tentative,
    /// The encoding is settled. This is the value `Decoder::change_the_encoding_to`
    /// stores whenever it decides to keep the current encoding.
    Certain,
    /// Per the linked spec section, used when no character encoding is needed at all.
    /// NOTE(review): nothing in this file constructs this variant; confirm against callers.
    Irrelevant,
}
| 22 | + |
/// Wraps an [`encoding_rs::Decoder`] together with the [`Confidence`] in its encoding.
pub struct Decoder {
    /// The underlying incremental decoder; also the source of truth for the current encoding.
    inner: encoding_rs::Decoder,
    /// How sure we are that `inner` uses the right encoding.
    confidence: Confidence,
}
| 27 | + |
| 28 | +impl Decoder { |
| 29 | + pub fn new(encoding: &'static Encoding, confidence: Confidence) -> Self { |
| 30 | + Self { |
| 31 | + inner: encoding.new_decoder(), |
| 32 | + confidence, |
| 33 | + } |
| 34 | + } |
| 35 | + |
| 36 | + pub fn confidence(&self) -> Confidence { |
| 37 | + self.confidence |
| 38 | + } |
| 39 | + |
| 40 | + /// Returns `None` if the encoding should not be changed and `Some(encoding)` if the current encoding |
| 41 | + /// should be changed to `encoding` |
| 42 | + pub fn change_the_encoding_to( |
| 43 | + &mut self, |
| 44 | + mut new_encoding: &'static Encoding, |
| 45 | + ) -> Option<&'static Encoding> { |
| 46 | + let current_encoding = self.inner.encoding(); |
| 47 | + // Step 1. If the encoding that is already being used to interpret the input stream is UTF-16BE/LE, |
| 48 | + // then set the confidence to certain and return. The new encoding is ignored; if it was anything |
| 49 | + // but the same encoding, then it would be clearly incorrect. |
| 50 | + if current_encoding == UTF_16BE || current_encoding == UTF_16BE { |
| 51 | + self.confidence = Confidence::Certain; |
| 52 | + return None; |
| 53 | + } |
| 54 | + |
| 55 | + // Step 2. If the new encoding is UTF-16BE/LE, then change it to UTF-8. |
| 56 | + if new_encoding == UTF_16BE || new_encoding == UTF_16BE { |
| 57 | + new_encoding = UTF_8; |
| 58 | + } |
| 59 | + |
| 60 | + // Step 3. If the new encoding is x-user-defined, then change it to windows-1252. |
| 61 | + if new_encoding == X_USER_DEFINED { |
| 62 | + new_encoding = WINDOWS_1252; |
| 63 | + } |
| 64 | + |
| 65 | + // Step 4. If the new encoding is identical or equivalent to the encoding that is already being used to interpret |
| 66 | + // the input stream, then set the confidence to certain and return. This happens when the encoding information found |
| 67 | + // in the file matches what the encoding sniffing algorithm determined to be the encoding, and in the second pass |
| 68 | + // through the parser if the first pass found that the encoding sniffing algorithm described in the earlier section |
| 69 | + // failed to find the right encoding. |
| 70 | + if current_encoding == new_encoding { |
| 71 | + self.confidence = Confidence::Certain; |
| 72 | + return None; |
| 73 | + } |
| 74 | + |
| 75 | + // Step 5. If all the bytes up to the last byte converted by the current decoder have the same |
| 76 | + // Unicode interpretations in both the current encoding and the new encoding, and if the user agent |
| 77 | + // supports changing the converter on the fly, then the user agent may change to the new converter |
| 78 | + // for the encoding on the fly. Set the document's character encoding and the encoding used to convert |
| 79 | + // the input stream to the new encoding, set the confidence to certain, and return. |
| 80 | + // NOTE: We don't support changing the converter on the fly |
| 81 | + |
| 82 | + // Step 6. Otherwise, restart the navigate algorithm, with historyHandling set to "replace" and |
| 83 | + // other inputs kept the same, but this time skip the encoding sniffing algorithm and instead just |
| 84 | + // set the encoding to the new encoding and the confidence to certain. Whenever possible, this should |
| 85 | + // be done without actually contacting the network layer (the bytes should be re-parsed from memory), |
| 86 | + // even if, e.g., the document is marked as not being cacheable. If this is not possible and contacting |
| 87 | + // the network layer would involve repeating a request that uses a method other than `GET`, then instead |
| 88 | + // set the confidence to certain and ignore the new encoding. The resource will be misinterpreted. |
| 89 | + // User agents may notify the user of the situation, to aid in application development. |
| 90 | + Some(new_encoding) |
| 91 | + } |
| 92 | + |
| 93 | + /// Decode the given chunk with the current encoding. The result will be pushed to the end |
| 94 | + /// of the input stream. |
| 95 | + pub fn decode(&mut self, chunk: &[u8], last: bool, output: &BufferQueue) { |
| 96 | + let mut remaining = chunk; |
| 97 | + loop { |
| 98 | + let mut out: Tendril<Bytes> = Tendril::new(); |
| 99 | + let max_len = self |
| 100 | + .inner |
| 101 | + .max_utf8_buffer_length_without_replacement(remaining.len()) |
| 102 | + .unwrap_or(8192) |
| 103 | + .min(8192); |
| 104 | + |
| 105 | + // SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement is going to initialize |
| 106 | + // part of the buffer. We are only going to access the initialized segment. |
| 107 | + unsafe { |
| 108 | + out.push_uninitialized(max_len as u32); |
| 109 | + } |
| 110 | + |
| 111 | + let (result, bytes_read, bytes_written) = self |
| 112 | + .inner |
| 113 | + .decode_to_utf8_without_replacement(&remaining, &mut out, last); |
| 114 | + |
| 115 | + if bytes_written > 0 { |
| 116 | + let bytes_chunk = out.subtendril(0, bytes_written as u32); |
| 117 | + |
| 118 | + // SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement writes valid utf8 |
| 119 | + let utf8_chunk = unsafe { bytes_chunk.reinterpret_without_validating() }; |
| 120 | + output.push_back(utf8_chunk); |
| 121 | + } |
| 122 | + |
| 123 | + if matches!(result, DecoderResult::Malformed(_, _)) { |
| 124 | + output.push_back("\u{FFFD}".into()); |
| 125 | + } |
| 126 | + |
| 127 | + remaining = &remaining[bytes_read..]; |
| 128 | + if remaining.is_empty() { |
| 129 | + return; |
| 130 | + } |
| 131 | + } |
| 132 | + } |
| 133 | +} |
0 commit comments