encoding_rs_io-0.1.7/.github/workflows/ci.yml010066400017500001731000000032611361726413000174420ustar0000000000000000name: ci on: pull_request: push: branches: - master schedule: - cron: '00 01 * * *' jobs: test: name: test runs-on: ${{ matrix.os }} strategy: matrix: build: - pinned - stable - beta - nightly - macos - win-msvc - win-gnu include: - build: pinned os: ubuntu-18.04 rust: 1.39.0 - build: stable os: ubuntu-18.04 rust: stable - build: beta os: ubuntu-18.04 rust: beta - build: nightly os: ubuntu-18.04 rust: nightly - build: macos os: macos-latest rust: stable - build: win-msvc os: windows-2019 rust: stable - build: win-gnu os: windows-2019 rust: stable-x86_64-gnu steps: - name: Checkout repository uses: actions/checkout@v1 with: fetch-depth: 1 - name: Install Rust uses: actions-rs/toolchain@v1 with: toolchain: ${{ matrix.rust }} override: true profile: minimal - run: cargo build --verbose - run: cargo doc --verbose - run: cargo test --verbose rustfmt: name: rustfmt runs-on: ubuntu-18.04 steps: - name: Checkout repository uses: actions/checkout@v1 with: fetch-depth: 1 - name: Install Rust uses: actions-rs/toolchain@v1 with: toolchain: stable override: true profile: minimal components: rustfmt - name: Install rustfmt run: rustup component add rustfmt - name: Check formatting run: | cargo fmt -- --check encoding_rs_io-0.1.7/.gitignore010064400017500000144000000000361332376601500147120ustar0000000000000000.*.swp tags target Cargo.lock encoding_rs_io-0.1.7/COPYING010064400017500000144000000004041332732722200137510ustar0000000000000000This project is licensed under either of * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) at your option. 
encoding_rs_io-0.1.7/Cargo.toml.orig010066400017500001731000000013271361726413700156260ustar0000000000000000[package] name = "encoding_rs_io" version = "0.1.7" #:version authors = ["Andrew Gallant "] description = "Streaming transcoding for encoding_rs" documentation = "https://docs.rs/encoding_rs_io" repository = "https://github.com/BurntSushi/encoding_rs_io" readme = "README.md" keywords = ["encoding", "transcoding", "stream", "io", "read"] license = "MIT OR Apache-2.0" categories = ["text-processing", "encoding", "web-programming", "email"] exclude = ["/ci/*", "/.travis.yml", "/appveyor.yml"] [badges] travis-ci = { repository = "BurntSushi/encoding_rs_io" } appveyor = { repository = "BurntSushi/encoding_rs_io" } [lib] bench = false [dependencies] encoding_rs = "0.8" [profile.release] debug = true encoding_rs_io-0.1.7/Cargo.toml0000644000000023341361726414000121240ustar00# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. 
If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] name = "encoding_rs_io" version = "0.1.7" authors = ["Andrew Gallant "] exclude = ["/ci/*", "/.travis.yml", "/appveyor.yml"] description = "Streaming transcoding for encoding_rs" documentation = "https://docs.rs/encoding_rs_io" readme = "README.md" keywords = ["encoding", "transcoding", "stream", "io", "read"] categories = ["text-processing", "encoding", "web-programming", "email"] license = "MIT OR Apache-2.0" repository = "https://github.com/BurntSushi/encoding_rs_io" [profile.release] debug = true [lib] bench = false [dependencies.encoding_rs] version = "0.8" [badges.appveyor] repository = "BurntSushi/encoding_rs_io" [badges.travis-ci] repository = "BurntSushi/encoding_rs_io" encoding_rs_io-0.1.7/LICENSE-APACHE010064400017500000144000000251371332732716700146640ustar0000000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. encoding_rs_io-0.1.7/LICENSE-MIT010064400017500000144000000020711311144420200143410ustar0000000000000000The MIT License (MIT) Copyright (c) 2015 Andrew Gallant Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. encoding_rs_io-0.1.7/README.md010066400017500001731000000102201361726413000141770ustar0000000000000000encoding_rs_io ============== This crate provides streaming adapters for the [`encoding_rs`](https://github.com/hsivonen/encoding_rs) crate. 
Adapters implement the standard library I/O traits and provide streaming transcoding support. [![Build status](https://github.com/BurntSushi/encoding_rs_io/workflows/ci/badge.svg)](https://github.com/BurntSushi/encoding_rs_io/actions) [![](http://meritbadge.herokuapp.com/encoding_rs_io)](https://crates.io/crates/encoding_rs_io) ### Documentation https://docs.rs/encoding_rs_io ### Usage Add this to your `Cargo.toml`: ```toml [dependencies] encoding_rs_io = "0.1" ``` and this to your crate root: ```rust extern crate encoding_rs_io; ``` ### Example This example shows how to create a decoder that transcodes UTF-16LE (the source, indicated by a BOM) to UTF-8 (the destination). ```rust extern crate encoding_rs; extern crate encoding_rs_io; use std::error::Error; use std::io::Read; use encoding_rs_io::DecodeReaderBytes; fn main() { example().unwrap(); } fn example() -> Result<(), Box<dyn Error>> { let source_data = &b"\xFF\xFEf\x00o\x00o\x00b\x00a\x00r\x00"[..]; // N.B. `source_data` can be any arbitrary io::Read implementation. let mut decoder = DecodeReaderBytes::new(source_data); let mut dest = String::new(); // decoder implements the io::Read trait, so it can easily be plugged // into any consumer expecting an arbitrary reader. decoder.read_to_string(&mut dest)?; assert_eq!(dest, "foobar"); Ok(()) } ``` ### Future work Currently, this crate only provides a way to get _possibly valid_ UTF-8 from some source encoding. There are other transformations that may be useful that we could include in this crate. Namely: * An encoder that accepts an arbitrary `std::io::Write` implementation and takes valid UTF-8 and transcodes it to a selected destination encoding. This encoder would implement `std::fmt::Write`. * A decoder that accepts an arbitrary `std::fmt::Write` implementation and takes arbitrary bytes and transcodes them from a selected source encoding to valid UTF-8. This decoder would implement `std::io::Write`. 
* An encoder that accepts an arbitrary `UnicodeRead` implementation and takes valid UTF-8 and transcodes it to a selected destination encoding. This encoder would implement `std::io::Read`. * A decoder that accepts an arbitrary `std::io::Read` implementation and takes arbitrary bytes and transcodes them from a selected source encoding to valid UTF-8. This decoder would implement the `UnicodeRead` trait. Where `UnicodeRead` is a hypothetical trait that does not yet exist. Its definition might look something like this: ```ignore trait UnicodeRead { fn read(&mut self, buf: &mut str) -> Result<usize>; } ``` Interestingly, of the above transformations, none of them correspond to `DecodeReaderBytes`. Namely, `DecodeReaderBytes` most closely corresponds to the last option, but instead of guaranteeing valid UTF-8 by implementing a trait like `UnicodeRead`, it instead implements `std::io::Read`, which pushes UTF-8 handling on to the caller. However, it turns out that this particular use case is important for operations like search, which can often be written in a way that doesn't assume UTF-8 validity but still benefit from it. It's not clear which of the above transformations is actually useful, but all of them could theoretically exist. There is more discussion on this topic here (and in particular, the above formulation was taken almost verbatim from Simon Sapin's comments): https://github.com/hsivonen/encoding_rs/issues/8 It is also perhaps worth stating that this crate very much intends on remaining coupled to `encoding_rs`, which helps restrict the scope, but may be too biased toward Web oriented encoding to solve grander encoding challenges. As such, it may very well be that this crate is actually a stepping stone to something with a larger scope. But first, we must learn. 
### License This project is licensed under either of * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) at your option. encoding_rs_io-0.1.7/rustfmt.toml010066400017500001731000000000541361726413000153250ustar0000000000000000max_width = 79 use_small_heuristics = "max" encoding_rs_io-0.1.7/src/lib.rs010066400017500001731000000760641361726413000146450ustar0000000000000000/*! This crate provides streaming transcoding by implementing Rust's I/O traits and delegating transcoding to the [`encoding_rs`](https://crates.io/crates/encoding_rs) crate. Currently, this crate only provides a means of transcoding from a source encoding (that is among the encodings supported by `encoding_rs`) to UTF-8 via an implementation of `std::io::Read`, where errors are handled by replacing invalid sequences with the Unicode replacement character. Future work may provide additional implementations for `std::io::Write` and/or implementations that make stronger guarantees about UTF-8 validity. # Example This example shows how to create a decoder that transcodes UTF-16LE (the source) to UTF-8 (the destination). ``` extern crate encoding_rs; extern crate encoding_rs_io; use std::error::Error; use std::io::Read; use encoding_rs_io::DecodeReaderBytes; # fn main() { example().unwrap(); } fn example() -> Result<(), Box<dyn Error>> { let source_data = &b"\xFF\xFEf\x00o\x00o\x00b\x00a\x00r\x00"[..]; // N.B. `source_data` can be any arbitrary io::Read implementation. let mut decoder = DecodeReaderBytes::new(source_data); let mut dest = String::new(); // decoder implements the io::Read trait, so it can easily be plugged // into any consumer expecting an arbitrary reader. decoder.read_to_string(&mut dest)?; assert_eq!(dest, "foobar"); Ok(()) } ``` # Future work Currently, this crate only provides a way to get _possibly valid_ UTF-8 from some source encoding. 
There are other transformations that may be useful that we could include in this crate. Namely: * An encoder that accepts an arbitrary `std::io::Write` implementation and takes valid UTF-8 and transcodes it to a selected destination encoding. This encoder would implement `std::fmt::Write`. * A decoder that accepts an arbitrary `std::fmt::Write` implementation and takes arbitrary bytes and transcodes them from a selected source encoding to valid UTF-8. This decoder would implement `std::io::Write`. * An encoder that accepts an arbitrary `UnicodeRead` implementation and takes valid UTF-8 and transcodes it to a selected destination encoding. This encoder would implement `std::io::Read`. * A decoder that accepts an arbitrary `std::io::Read` implementation and takes arbitrary bytes and transcodes them from a selected source encoding to valid UTF-8. This decoder would implement the `UnicodeRead` trait. Where `UnicodeRead` is a hypothetical trait that does not yet exist. Its definition might look something like this: ```ignore trait UnicodeRead { fn read(&mut self, buf: &mut str) -> Result<usize>; } ``` Interestingly, of the above transformations, none of them correspond to `DecodeReaderBytes`. Namely, `DecodeReaderBytes` most closely corresponds to the last option, but instead of guaranteeing valid UTF-8 by implementing a trait like `UnicodeRead`, it instead implements `std::io::Read`, which pushes UTF-8 handling on to the caller. However, it turns out that this particular use case is important for operations like search, which can often be written in a way that doesn't assume UTF-8 validity but still benefit from it. It's not clear which of the above transformations is actually useful, but all of them could theoretically exist. 
There is more discussion on this topic here (and in particular, the above formulation was taken almost verbatim from Simon Sapin's comments): https://github.com/hsivonen/encoding_rs/issues/8 It is also perhaps worth stating that this crate very much intends on remaining coupled to `encoding_rs`, which helps restrict the scope, but may be too biased toward Web oriented encoding to solve grander encoding challenges. As such, it may very well be that this crate is actually a stepping stone to something with a larger scope. But first, we must learn. */ extern crate encoding_rs; use std::fmt; use std::io::{self, Read}; use encoding_rs::{Decoder, Encoding, UTF_8}; use util::{BomPeeker, TinyTranscoder}; mod util; /// A builder for constructing a byte oriented transcoder to UTF-8. #[derive(Clone, Debug)] pub struct DecodeReaderBytesBuilder { encoding: Option<&'static Encoding>, utf8_passthru: bool, bom_override: bool, strip_bom: bool, bom_sniffing: bool, } impl Default for DecodeReaderBytesBuilder { fn default() -> DecodeReaderBytesBuilder { DecodeReaderBytesBuilder::new() } } impl DecodeReaderBytesBuilder { /// Create a new decoder builder with a default configuration. /// /// By default, no explicit encoding is used, but if a UTF-8 or UTF-16 /// BOM is detected, then an appropriate encoding is automatically /// detected and transcoding is performed (where invalid sequences map to /// the Unicode replacement codepoint). pub fn new() -> DecodeReaderBytesBuilder { DecodeReaderBytesBuilder { encoding: None, utf8_passthru: false, bom_override: false, strip_bom: false, bom_sniffing: true, } } /// Build a new decoder that wraps the given reader. pub fn build(&self, rdr: R) -> DecodeReaderBytes> { self.build_with_buffer(rdr, vec![0; 8 * (1 << 10)]).unwrap() } /// Build a new decoder that wraps the given reader and uses the given /// buffer internally for transcoding. /// /// This is useful for cases where it is advantageuous to amortize /// allocation. 
Namely, this method permits reusing a buffer for /// subsequent decoders. /// /// This returns an error if the buffer is smaller than 4 bytes (which is /// too small to hold maximum size of a single UTF-8 encoded codepoint). pub fn build_with_buffer>( &self, rdr: R, mut buffer: B, ) -> io::Result> { if buffer.as_mut().len() < 4 { let msg = format!( "DecodeReaderBytesBuilder: buffer of size {} is too small", buffer.as_mut().len(), ); return Err(io::Error::new(io::ErrorKind::Other, msg)); } let encoding = self.encoding.map(|enc| enc.new_decoder_with_bom_removal()); // No need to do BOM detection if we opt out of it or have an explicit // encoding. let has_detected = !self.bom_sniffing || (!self.bom_override && encoding.is_some()); let peeker = if self.strip_bom { BomPeeker::without_bom(rdr) } else { BomPeeker::with_bom(rdr) }; Ok(DecodeReaderBytes { rdr: peeker, decoder: encoding, tiny: TinyTranscoder::new(), utf8_passthru: self.utf8_passthru, buf: buffer, buflen: 0, pos: 0, has_detected: has_detected, exhausted: false, }) } /// Set an explicit encoding to be used by this decoder. /// /// When an explicit encoding is set, BOM sniffing is disabled and the /// encoding provided will be used unconditionally. Errors in the encoded /// bytes are replaced by the Unicode replacement codepoint. /// /// By default, no explicit encoding is set. pub fn encoding( &mut self, encoding: Option<&'static Encoding>, ) -> &mut DecodeReaderBytesBuilder { self.encoding = encoding; self } /// Enable UTF-8 passthru, even when a UTF-8 BOM is observed. /// /// When an explicit encoding is not set (thereby invoking automatic /// encoding detection via BOM sniffing), then a UTF-8 BOM will cause /// UTF-8 transcoding to occur. In particular, if the source contains /// invalid UTF-8 sequences, then they are replaced with the Unicode /// replacement codepoint. /// /// This transcoding may not be desirable. 
For example, the caller may /// already have its own UTF-8 handling where invalid UTF-8 is /// appropriately handled, in which case, doing an extra transcoding /// step is extra and unnecessary work. Enabling this option will prevent /// that extra transcoding step from occurring. In this case, the bytes /// emitted by the reader are passed through unchanged (including the BOM) /// and the caller will be responsible for handling any invalid UTF-8. /// /// # Example /// /// This example demonstrates the effect of enabling this option on data /// that includes a UTF-8 BOM but also, interestingly enough, subsequently /// includes invalid UTF-8. /// /// ``` /// extern crate encoding_rs; /// extern crate encoding_rs_io; /// /// use std::error::Error; /// use std::io::Read; /// /// use encoding_rs_io::DecodeReaderBytesBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let source_data = &b"\xEF\xBB\xBFfoo\xFFbar"[..]; /// let mut decoder = DecodeReaderBytesBuilder::new() /// .utf8_passthru(true) /// .build(source_data); /// /// let mut dest = vec![]; /// decoder.read_to_end(&mut dest)?; /// // Without the passthru option, you'd get "foo\u{FFFD}bar". /// assert_eq!(dest, b"\xEF\xBB\xBFfoo\xFFbar"); /// Ok(()) /// } /// ``` pub fn utf8_passthru( &mut self, yes: bool, ) -> &mut DecodeReaderBytesBuilder { self.utf8_passthru = yes; self } /// Whether or not to always strip a BOM if one is found. /// /// When this is enabled, if a BOM is found at the beginning of a stream, /// then it is ignored. This applies even when `utf8_passthru` is enabled /// or if `bom_sniffing` is disabled. /// /// This is disabled by default. /// /// # Example /// /// This example shows how to remove the BOM if it's present even when /// `utf8_passthru` is enabled. 
/// /// ``` /// extern crate encoding_rs; /// extern crate encoding_rs_io; /// /// use std::error::Error; /// use std::io::Read; /// /// use encoding_rs_io::DecodeReaderBytesBuilder; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { /// let source_data = &b"\xEF\xBB\xBFfoo\xFFbar"[..]; /// let mut decoder = DecodeReaderBytesBuilder::new() /// .utf8_passthru(true) /// .strip_bom(true) /// .build(source_data); /// /// let mut dest = vec![]; /// decoder.read_to_end(&mut dest)?; /// // If `strip_bom` wasn't enabled, then this would include the BOM. /// assert_eq!(dest, b"foo\xFFbar"); /// Ok(()) /// } /// ``` pub fn strip_bom(&mut self, yes: bool) -> &mut DecodeReaderBytesBuilder { self.strip_bom = yes; self } /// Give the highest precedent to the BOM, if one is found. /// /// When this is enabled, and if a BOM is found, then the encoding /// indicated by that BOM is used even if an explicit encoding has been /// set via the `encoding` method. /// /// This does not override `utf8_passthru`. /// /// This is disabled by default. pub fn bom_override( &mut self, yes: bool, ) -> &mut DecodeReaderBytesBuilder { self.bom_override = yes; self } /// Enable BOM sniffing /// /// When this is enabled and an explicit encoding is not set, the decoder /// will try to detect the encoding with BOM. /// /// When this is disabled and an explicit encoding is not set, the decoder /// will treat the input as raw bytes. The bytes will be passed through /// unchanged, including any BOM that may be present. /// /// This is enabled by default. pub fn bom_sniffing( &mut self, yes: bool, ) -> &mut DecodeReaderBytesBuilder { self.bom_sniffing = yes; self } } /// An implementation of `io::Read` that transcodes to UTF-8 in a streaming /// fashion. 
/// /// The high level goal of this decoder is to provide access to byte streams /// that are assumed to be UTF-8 unless an encoding is otherwise specified /// (either via a BOM or via an explicit designation of an encoding). /// /// When no explicit source encoding is specified (via /// `DecodeReaderBytesBuilder`), the source encoding is determined by /// inspecting the BOM from the stream read from `R`, if one exists. If a /// UTF-16 BOM exists, then the source stream is transcoded to UTF-8 with /// invalid UTF-16 sequences translated to the Unicode replacement character. /// Similarly if a UTF-8 BOM is seen. In all other cases, the source of the /// underlying reader is passed through unchanged _as if_ it were UTF-8. /// /// Since this particular reader does not guarantee providing valid UTF-8 to /// the caller, the caller must be prepared to handle invalid UTF-8 itself. /// /// `R` is the type of the underlying reader and `B` is the type of an internal /// buffer used to store the results of transcoding. Callers may elect to reuse /// the internal buffer via the `DecodeReaderBytesBuilder::build_with_buffer` /// constructor. pub struct DecodeReaderBytes { /// The underlying reader, wrapped in a peeker for reading a BOM if one /// exists. rdr: BomPeeker, /// The underlying text decoder derived from the BOM or an explicitly /// specified encoding, if one exists. decoder: Option, /// A "tiny transcoder" for use when a caller provides a buffer that is /// too small to write at least one UTF-8 encoded codepoint to. tiny: TinyTranscoder, /// When enabled, if a UTF-8 BOM is observed, then the bytes are passed /// through from the underlying reader as-is instead of passing through /// the UTF-8 transcoder (which will replace invalid sequences with the /// REPLACEMENT CHARACTER). utf8_passthru: bool, /// The internal buffer to store transcoded bytes before they are read by /// callers. buf: B, /// The current position in `buf`. Subsequent reads start here. 
pos: usize, /// The number of transcoded bytes in `buf`. Subsequent reads end here. buflen: usize, /// Whether BOM detection has been performed yet or not. has_detected: bool, /// Whether the underlying reader has been exhausted or not. exhausted: bool, } impl> io::Read for DecodeReaderBytes { fn read(&mut self, buf: &mut [u8]) -> io::Result { self.detect()?; if self.decoder.is_none() { self.rdr.read(buf) } else { self.transcode(buf) } } } impl DecodeReaderBytes> { /// Create a new transcoder that converts a source stream to valid UTF-8 /// via BOM sniffing. /// /// To explicitly control the encoding, UTF-8 passthru or amortize /// allocation, use the /// [`DecodeReaderBytesBuilder`](struct.DecodeReaderBytesBuilder.html) /// constructor. /// /// When a BOM is found (which must correspond to UTF-8, UTF-16LE or /// UTF-16BE), then transcoding to UTF-8 is performed and any invalid /// sequences in the source data are seamlessly replaced by the Unicode /// replacement character. /// /// When no BOM is found (and no other encoding is specified via the /// builder), the underlying bytes are passed through as-is. pub fn new(rdr: R) -> DecodeReaderBytes> { DecodeReaderBytesBuilder::new().build(rdr) } } impl> DecodeReaderBytes { /// Transcode the inner stream to UTF-8 in `buf`. This assumes that there /// is a decoder capable of transcoding the inner stream to UTF-8. This /// returns the number of bytes written to `buf`. /// /// When this function returns, exactly one of the following things will /// be true: /// /// 1. A non-zero number of bytes were written to `buf`. /// 2. The underlying reader reached EOF (or `buf` is empty). /// 3. An error is returned: the internal buffer ran out of room. /// 4. An I/O error occurred. 
fn transcode(&mut self, buf: &mut [u8]) -> io::Result { if self.exhausted || buf.is_empty() { return Ok(0); } let nwrite = self.tiny.read(buf)?; if nwrite > 0 { // We could technically mush on if the caller provided buffer is // big enough, but to keep things we simple, we satisfy the // contract and quit. return Ok(nwrite); } if self.pos >= self.buflen { self.fill()?; } if buf.len() < 4 { return self.tiny_transcode(buf); } loop { let (_, nin, nout, _) = self.decoder.as_mut().unwrap().decode_to_utf8( &self.buf.as_mut()[self.pos..self.buflen], buf, false, ); self.pos += nin; // If we've written at least one byte to the caller-provided // buffer, then our mission is complete. if nout > 0 { return Ok(nout); } // Otherwise, we know that our internal buffer has insufficient // data to transcode at least one char, so we attempt to refill it. self.fill()?; // ... but quit on EOF. if self.buflen == 0 { let (_, _, nout, _) = self .decoder .as_mut() .unwrap() .decode_to_utf8(&[], buf, true); return Ok(nout); } } } /// Like transcode, but deals with the case where the caller provided /// buffer is less than 4. fn tiny_transcode(&mut self, buf: &mut [u8]) -> io::Result { assert!(buf.len() < 4, "have a small caller buffer"); loop { let (nin, nout) = self.tiny.transcode( self.decoder.as_mut().unwrap(), &self.buf.as_mut()[self.pos..self.buflen], false, ); self.pos += nin; if nout > 0 { // We've satisfied the contract of writing at least one byte, // so we're done. The tiny transcoder is guaranteed to yield // a non-zero number of bytes. return self.tiny.read(buf); } // Otherwise, we know that our internal buffer has insufficient // data to transcode at least one char, so we attempt to refill it. self.fill()?; // ... but quit on EOF. if self.buflen == 0 { self.tiny.transcode(self.decoder.as_mut().unwrap(), &[], true); return self.tiny.read(buf); } } } /// Peeks at the underlying reader to look for a BOM. 
If one exists, then /// an appropriate decoder is created corresponding to the detected BOM. fn detect(&mut self) -> io::Result<()> { if self.has_detected { return Ok(()); } self.has_detected = true; let bom = self.rdr.peek_bom()?; if let Some(encoding) = bom.encoding() { // If we got a UTF-8 BOM, and the decoder was configured for // passing through UTF-8, then don't build a decoder at all. if encoding == UTF_8 && self.utf8_passthru { return Ok(()); } self.decoder = Some(encoding.new_decoder_with_bom_removal()); } Ok(()) } /// Fill the internal buffer from the underlying reader. /// /// If there are unread bytes in the internal buffer, then we move them /// to the beginning of the internal buffer and fill the remainder. /// /// If the internal buffer is too small to read additional bytes, then an /// error is returned. fn fill(&mut self) -> io::Result<()> { if self.pos < self.buflen { // Despite my best efforts, I could not seem to actually exercise // this code path in tests. Namely, this code path occurs when the // decoder can't make any progress and also doesn't consume all of // the input. Since I'm not sure how to trigger that case, this // code path is actually untested! // We can assert this because we require that the caller provided // buffer be at least 4 bytes big. 
assert!(
                self.buflen < self.buf.as_mut().len(),
                "internal buffer should never be exhausted"
            );
            // Shift the unread tail to the front of the buffer. The ranges
            // may overlap, which `copy_within` handles.
            let buf = self.buf.as_mut();
            buf.copy_within(self.pos..self.buflen, 0);
            self.buflen -= self.pos;
        } else {
            self.buflen = 0;
        }
        self.pos = 0;
        self.buflen +=
            self.rdr.read(&mut self.buf.as_mut()[self.buflen..])?;
        // Nothing buffered and nothing read: the reader is at EOF.
        if self.buflen == 0 {
            self.exhausted = true;
        }
        Ok(())
    }
}

impl<R: fmt::Debug, B: fmt::Debug> fmt::Debug for DecodeReaderBytes<R, B> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut fmter = f.debug_struct("DecodeReaderBytes");
        fmter
            .field("rdr", &self.rdr)
            .field("tiny", &self.tiny)
            .field("utf8_passthru", &self.utf8_passthru)
            .field("buf", &self.buf)
            .field("pos", &self.pos)
            .field("buflen", &self.buflen)
            .field("has_detected", &self.has_detected)
            .field("exhausted", &self.exhausted);
        // Because `encoding_rs::Decoder` doesn't impl `fmt::Debug`.
        if let Some(ref d) = self.decoder {
            let msg = format!("Some(<Decoder for {}>)", d.encoding().name());
            fmter.field("decoder", &msg);
        } else {
            fmter.field("decoder", &"None");
        }
        fmter.finish()
    }
}

#[cfg(test)]
mod tests {
    use std::io::Read;

    use encoding_rs::{self, Encoding};

    use super::{DecodeReaderBytes, DecodeReaderBytesBuilder};

    /// Read everything from `rdr` into a `String`, panicking on failure.
    fn read_to_string<R: Read>(mut rdr: R) -> String {
        let mut s = String::new();
        rdr.read_to_string(&mut s).unwrap();
        s
    }

    // In cases where all we have is a bom, we expect the bytes to be
    // passed through unchanged.
#[test]
    fn trans_utf16_bom() {
        let srcbuf = vec![0xFF, 0xFE];
        let mut dstbuf = vec![0; 8 * (1 << 10)];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        let n = rdr.read(&mut dstbuf).unwrap();
        assert_eq!(&*srcbuf, &dstbuf[..n]);

        let srcbuf = vec![0xFE, 0xFF];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        let n = rdr.read(&mut dstbuf).unwrap();
        assert_eq!(&*srcbuf, &dstbuf[..n]);

        // A lone UTF-8 BOM is consumed by the decoder, leaving nothing.
        let srcbuf = vec![0xEF, 0xBB, 0xBF];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        let n = rdr.read(&mut dstbuf).unwrap();
        assert_eq!(n, 0);

        // ... unless passthru is on, in which case it is emitted as-is.
        let srcbuf = vec![0xEF, 0xBB, 0xBF];
        let mut rdr = DecodeReaderBytesBuilder::new()
            .utf8_passthru(true)
            .build(&*srcbuf);
        let n = rdr.read(&mut dstbuf).unwrap();
        assert_eq!(&*srcbuf, &dstbuf[..n]);
    }

    // Test basic UTF-16 decoding.
    #[test]
    fn trans_utf16_basic() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        assert_eq!("a", read_to_string(&mut rdr));

        let srcbuf = vec![0xFE, 0xFF, 0x00, 0x61];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        assert_eq!("a", read_to_string(&mut rdr));
    }

    // Same as above, but with `strip_bom` enabled on the builder.
    #[test]
    fn trans_utf16_basic_without_bom() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
        let mut rdr =
            DecodeReaderBytesBuilder::new().strip_bom(true).build(&*srcbuf);
        assert_eq!("a", read_to_string(&mut rdr));

        let srcbuf = vec![0xFE, 0xFF, 0x00, 0x61];
        let mut rdr =
            DecodeReaderBytesBuilder::new().strip_bom(true).build(&*srcbuf);
        assert_eq!("a", read_to_string(&mut rdr));
    }

    // Test the BOM override.
    #[test]
    fn trans_utf16_bom_override() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
        let mut rdr = DecodeReaderBytesBuilder::new()
            .bom_override(true)
            .encoding(Some(encoding_rs::UTF_8))
            .build(&*srcbuf);
        assert_eq!("a", read_to_string(&mut rdr));
    }

    // Test basic UTF-16 decoding with a small buffer.
#[test]
    fn trans_utf16_smallbuf() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        let mut tmp = [0u8; 1];

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 1);
        assert_eq!(tmp, [b'a'; 1]);

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 1);
        assert_eq!(tmp, [b'b'; 1]);

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 1);
        assert_eq!(tmp, [b'c'; 1]);

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 0);
    }

    // Test incomplete UTF-16 decoding. This ensures we see a replacement char
    // if the stream ends with an unpaired code unit.
    #[test]
    fn trans_utf16_incomplete() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x00];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        assert_eq!("a\u{FFFD}", read_to_string(&mut rdr));
    }

    // Test transcoding with a minimal buffer but a large caller buffer.
    #[test]
    fn trans_utf16_minimal_buffer_normal_caller_buffer() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
            0x62, 0x00,
            0x63, 0x00,
            0x64, 0x00,
            0x65, 0x00,
            0x66, 0x00,
            0x67, 0x00,
            0x68, 0x00,
        ];
        let mut rdr = DecodeReaderBytesBuilder::new()
            .build_with_buffer(&*srcbuf, vec![0; 4])
            .unwrap();
        let got = read_to_string(&mut rdr);
        assert_eq!(got, "abcdefgh");
    }

    // Test transcoding with a minimal buffer and a minimal caller buffer.
    #[test]
    fn trans_utf16_minimal_buffers() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00];
        let mut rdr = DecodeReaderBytesBuilder::new()
            .build_with_buffer(&*srcbuf, vec![0; 4])
            .unwrap();
        let mut tmp = [0u8; 1];

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 1);
        assert_eq!(tmp, [b'a'; 1]);

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 1);
        assert_eq!(tmp, [b'b'; 1]);

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 1);
        assert_eq!(tmp, [b'c'; 1]);

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 0);
    }

    // Test transcoding using byte oriented APIs.
#[test]
    fn trans_utf16_byte_api() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
            0x62, 0x00,
            0x63, 0x00,
            0x64, 0x00,
            0x65, 0x00,
            0x66, 0x00,
            0x67, 0x00,
            0x68, 0x00,
        ];
        let rdr = DecodeReaderBytes::new(&*srcbuf);
        let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
        assert_eq!(got, b"abcdefgh");
    }

    // With sniffing disabled and no explicit encoding, the input (BOM
    // included) is passed through untouched.
    #[test]
    fn trans_utf16_no_sniffing() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
        ];
        let rdr = DecodeReaderBytesBuilder::new()
            .bom_sniffing(false)
            .build(&*srcbuf);
        let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
        assert_eq!(got, srcbuf);
    }

    // `strip_bom` still removes the BOM when sniffing is disabled; the
    // remaining bytes are not transcoded.
    #[test]
    fn trans_utf16_no_sniffing_strip_bom() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
        ];
        let rdr = DecodeReaderBytesBuilder::new()
            .bom_sniffing(false)
            .strip_bom(true)
            .build(&*srcbuf);
        let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
        assert_eq!(got, &[0x61, 0x00]);
    }

    // An explicit encoding is honored even when sniffing is disabled.
    #[test]
    fn trans_utf16_no_sniffing_encoding_override() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
        ];
        let rdr = DecodeReaderBytesBuilder::new()
            .bom_sniffing(false)
            .encoding(Some(encoding_rs::UTF_16LE))
            .build(&*srcbuf);
        let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
        assert_eq!(got, b"a");
    }

    #[test]
    fn trans_utf16_no_sniffing_encoding_override_strip_bom() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
        ];
        let rdr = DecodeReaderBytesBuilder::new()
            .bom_sniffing(false)
            .strip_bom(true)
            .encoding(Some(encoding_rs::UTF_16LE))
            .build(&*srcbuf);
        let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
        assert_eq!(got, b"a");
    }

    // Test transcoding with a minimal buffer using byte oriented APIs.
#[test] fn trans_utf16_minimal_buffer_byte_api() { #[rustfmt::skip] let srcbuf = vec![ 0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00, 0x64, 0x00, 0x65, 0x00, 0x66, 0x00, 0x67, 0x00, 0x68, 0x00, ]; let rdr = DecodeReaderBytesBuilder::new() .build_with_buffer(&*srcbuf, vec![0; 4]) .unwrap(); let got: Vec = rdr.bytes().map(|res| res.unwrap()).collect(); assert_eq!(got, b"abcdefgh"); } // Test a buffer that is too small. #[test] fn buffer_too_small() { let res = DecodeReaderBytesBuilder::new() .build_with_buffer(&[][..], vec![0; 3]); assert!(res.is_err()); } macro_rules! test_trans_simple { ($name:ident, $enc:expr, $srcbytes:expr, $dst:expr) => { #[test] fn $name() { let srcbuf = &$srcbytes[..]; let enc = Encoding::for_label($enc.as_bytes()); let mut rdr = DecodeReaderBytesBuilder::new() .encoding(enc) .build(&*srcbuf); assert_eq!($dst, read_to_string(&mut rdr)); } }; } // This isn't exhaustive obviously, but it lets us test base level support. test_trans_simple!(trans_simple_auto, "does not exist", b"\xD0\x96", "Ж"); test_trans_simple!(trans_simple_utf8, "utf-8", b"\xD0\x96", "Ж"); test_trans_simple!(trans_simple_utf16le, "utf-16le", b"\x16\x04", "Ж"); test_trans_simple!(trans_simple_utf16be, "utf-16be", b"\x04\x16", "Ж"); test_trans_simple!(trans_simple_chinese, "chinese", b"\xA7\xA8", "Ж"); test_trans_simple!(trans_simple_korean, "korean", b"\xAC\xA8", "Ж"); test_trans_simple!( trans_simple_big5_hkscs, "big5-hkscs", b"\xC7\xFA", "Ж" ); test_trans_simple!(trans_simple_gbk, "gbk", b"\xA7\xA8", "Ж"); test_trans_simple!(trans_simple_sjis, "sjis", b"\x84\x47", "Ж"); test_trans_simple!(trans_simple_eucjp, "euc-jp", b"\xA7\xA8", "Ж"); test_trans_simple!(trans_simple_latin1, "latin1", b"\xA9", "©"); } encoding_rs_io-0.1.7/src/util.rs010066400017500001731000000350541361726413000150460ustar0000000000000000use std::cmp; use std::io; use encoding_rs::{CoderResult, Decoder, Encoding}; /// This is the minimum amount of space that a decoder-to-utf8-with-replacement /// will use 
for any state and any input. const TINY_BUFFER_SIZE: usize = 7; /// A tiny transcoder performs transcoding incrementally even when a caller /// provided buffer is not large enough. /// /// This use case comes up when implementing streaming transcoding in cases /// where it is permissible to provide incomplete UTF-8 sequences to the /// caller (e.g., when decoding into a `&[u8]` where the caller must be capable /// of handling invalid UTF-8). In particular, this type specifically handles /// cases where a caller provided buffer is too small to store a full UTF-8 /// sequence. Thus, this type should be used in cases where the caller provided /// buffer has length 3 or fewer. /// /// This could likely be done with better performance by allocating a larger /// buffer for these cases, but we instead opt to handle this without /// allocation under the assumption that tiny caller provided buffers are /// probably a pathological case. #[derive(Clone, Debug)] pub struct TinyTranscoder { /// This is where we store the results of a transcoding. Since we are /// always decoding to UTF-8, 7 bytes is sufficient to represent any /// codepoint. partial: [u8; TINY_BUFFER_SIZE], /// The number of bytes written in `partial`. len: usize, /// The position in `partial` at which the next byte should be read. pos: usize, } impl TinyTranscoder { /// Create a new tiny transcoder that is ready for use. pub fn new() -> TinyTranscoder { TinyTranscoder { partial: [0; TINY_BUFFER_SIZE], len: 0, pos: 0 } } /// Transcode the contents of `src` into this buffer using the provided /// decoder, and return the number of bytes consumed in `src` and the /// number of bytes written to this transcoder. /// /// The results of transcoding can be read using the TinyTranscoder's /// `io::Read` implementation. /// /// If `last` is true, then this signals to the decoder that we've reached /// EOF and `src` must be empty. Otherwise, if `last` is false, then /// `src` must be non-empty. 
Violating either of these constraits will /// cause a panic. /// /// Finally, if this transcoder still has unconsumed bytes from a previous /// transcode, then this panics. Callers must consume all bytes from a /// previous transcoding before performing another one. pub fn transcode( &mut self, decoder: &mut Decoder, src: &[u8], last: bool, ) -> (usize, usize) { assert!(self.as_slice().is_empty(), "transcoder has unconsumed bytes"); if last { assert!(src.is_empty(), "src must be empty when last==true"); } let (res, nin, nout, _) = decoder.decode_to_utf8(src, &mut self.partial[..], last); if last { assert_eq!( res, CoderResult::InputEmpty, "input should be exhausted", ); } self.pos = 0; self.len = nout; (nin, nout) } /// Return the the bytes remaining to be read as a slice. fn as_slice(&self) -> &[u8] { &self.partial[self.pos..self.len] } } impl io::Read for TinyTranscoder { fn read(&mut self, buf: &mut [u8]) -> io::Result { if self.pos >= self.len { return Ok(0); } let mut count = 0; for (src, dst) in self.as_slice().iter().zip(buf) { *dst = *src; count += 1; } self.pos += count; Ok(count) } } /// `BomPeeker` wraps `R` and satisfies the `io::Read` interface while also /// providing a peek at the BOM if one exists. Peeking at the BOM does not /// advance the reader. #[derive(Debug)] pub struct BomPeeker { rdr: R, strip: bool, bom: Option, nread: usize, } impl BomPeeker { /// Create a new BomPeeker that includes the BOM in calls to `read`. /// /// The first three bytes can be read using the `peek_bom` method, but /// will not advance the reader. pub fn with_bom(rdr: R) -> BomPeeker { BomPeeker { rdr: rdr, strip: false, bom: None, nread: 0 } } /// Create a new BomPeeker that never includes the BOM in calls to `read`. pub fn without_bom(rdr: R) -> BomPeeker { BomPeeker { rdr: rdr, strip: true, bom: None, nread: 0 } } /// Peek at the first three bytes of the underlying reader. /// /// This does not advance the reader provided by `BomPeeker`. 
/// /// If the underlying reader does not have at least two bytes available, /// then `None` is returned. pub fn peek_bom(&mut self) -> io::Result { if let Some(bom) = self.bom { return Ok(bom); } // If the underlying reader fails or panics, make sure we set at least // an empty BOM so that we don't end up here again.. self.bom = Some(PossibleBom::new()); // OK, try to read the BOM. let mut buf = [0u8; 3]; let bom_len = read_full(&mut self.rdr, &mut buf)?; self.bom = Some(PossibleBom { bytes: buf, len: bom_len }); Ok(self.bom.unwrap()) } } impl io::Read for BomPeeker { fn read(&mut self, buf: &mut [u8]) -> io::Result { if self.nread < 3 { let bom = self.peek_bom()?; // If we don't have a valid BOM (e.g., no encoding for it), then // we always pass through the first 3 bytes. Otherwise, if we have // a valid BOM, we only pass it thru if we don't want to strip it. let bom = bom.as_slice(!self.strip); if self.nread < bom.len() { let rest = &bom[self.nread..]; let len = cmp::min(buf.len(), rest.len()); buf[..len].copy_from_slice(&rest[..len]); self.nread += len; return Ok(len); } } let nread = self.rdr.read(buf)?; self.nread += nread; Ok(nread) } } /// A PossibleBom is a sequence of bytes at the beginning of a stream that /// may represent an actual BOM. To detect the BOM, this must contain at /// least 3 bytes. /// /// If this is a valid UTF-8 or UTF-16 BOM, then an encoding_rs decoder can /// be built from the BOM. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct PossibleBom { bytes: [u8; 3], len: usize, } impl PossibleBom { /// Build a new empty BOM. fn new() -> PossibleBom { PossibleBom { bytes: [0; 3], len: 0 } } /// Return the BOM as a normal slice. /// /// If `bom` is true, then this includes any leading BOM bytes. Otherwise, /// this only includes non-BOM bytes. fn as_slice(&self, bom: bool) -> &[u8] { let slice = &self.bytes[0..self.len]; if bom || slice.len() <= 1 { slice } else if &slice[0..2] == b"\xFF\xFE" || &slice[0..2] == b"\xFE\xFF" { &slice[2..] 
} else if slice == b"\xEF\xBB\xBF" {
            &[]
        } else {
            slice
        }
    }

    /// If this is a valid UTF-8 or UTF-16 BOM, return its corresponding
    /// encoding. Otherwise, return `None`.
    ///
    /// `None` is also returned whenever fewer than three bytes were peeked.
    pub fn encoding(&self) -> Option<&'static Encoding> {
        let bom = self.as_slice(true);
        if bom.len() < 3 {
            return None;
        }
        Encoding::for_bom(bom).map(|(enc, _)| enc)
    }
}

/// Like `io::Read::read_exact`, except it never returns `UnexpectedEof` and
/// instead returns the number of bytes read if EOF is seen before filling
/// `buf`.
///
/// Reads that fail with `ErrorKind::Interrupted` are retried; any other
/// error is returned to the caller.
pub fn read_full<R: io::Read>(
    mut rdr: R,
    mut buf: &mut [u8],
) -> io::Result<usize> {
    let mut nread = 0;
    while !buf.is_empty() {
        match rdr.read(buf) {
            Ok(0) => break,
            Ok(n) => {
                nread += n;
                // Move `buf` through a temporary so the remaining tail can be
                // re-bound without holding two mutable borrows at once.
                let tmp = buf;
                buf = &mut tmp[n..];
            }
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
            Err(e) => return Err(e),
        }
    }
    Ok(nread)
}

#[cfg(test)]
mod tests {
    use super::{BomPeeker, PossibleBom, TinyTranscoder};
    use encoding_rs::Encoding;
    use std::io::Read;

    #[test]
    fn tiny_utf16_normal() {
        let enc = Encoding::for_label(b"utf-16le").unwrap();
        let mut dec = enc.new_decoder_with_bom_removal();
        let mut bytes = &b"f\x00o\x00o\x00b\x00a\x00r\x00b\x00a\x00z\x00"[..];
        let mut tiny = TinyTranscoder::new();
        let mut tmp = [0u8; 1];

        // The tiny buffer holds 7 bytes, so only the first 7 code units
        // (14 input bytes) are consumed by the first call.
        let (nin, nout) = tiny.transcode(&mut dec, bytes, false);
        assert_eq!(nin, 14);
        assert_eq!(nout, 7);
        bytes = &bytes[nin..];

        assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
        assert_eq!(tmp, [b'f'; 1]);
        assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
        assert_eq!(tmp, [b'o'; 1]);
        assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
        assert_eq!(tmp, [b'o'; 1]);
        assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
        assert_eq!(tmp, [b'b'; 1]);
        assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
        assert_eq!(tmp, [b'a'; 1]);
        assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
        assert_eq!(tmp, [b'r'; 1]);
        assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
        assert_eq!(tmp, [b'b'; 1]);

        let (nin, nout) = tiny.transcode(&mut dec, bytes, false);
        assert_eq!(nin, 4);
        assert_eq!(nout, 2);
        bytes = &bytes[nin..];
assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
        assert_eq!(tmp, [b'a'; 1]);
        assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
        assert_eq!(tmp, [b'z'; 1]);

        // Signalling EOF with nothing pending yields no further output.
        let (nin, nout) = tiny.transcode(&mut dec, bytes, true);
        assert_eq!(nin, 0);
        assert_eq!(nout, 0);

        assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
    }

    // A single stray byte is half of a UTF-16 code unit: nothing is emitted
    // until EOF is signalled, at which point the three bytes of U+FFFD
    // (EF BF BD) come out one at a time.
    #[test]
    fn tiny_utf16_invalid() {
        let enc = Encoding::for_label(b"utf-16le").unwrap();
        let mut dec = enc.new_decoder_with_bom_removal();
        let mut bytes = &b"\x00"[..];
        let mut tiny = TinyTranscoder::new();
        let mut tmp = [0u8; 1];

        let (nin, nout) = tiny.transcode(&mut dec, bytes, false);
        assert_eq!(nin, 1);
        assert_eq!(nout, 0);
        assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
        bytes = &bytes[nin..];

        let (nin, nout) = tiny.transcode(&mut dec, bytes, true);
        assert_eq!(nin, 0);
        assert_eq!(nout, 3);

        assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
        assert_eq!(tmp, [b'\xEF'; 1]);
        assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
        assert_eq!(tmp, [b'\xBF'; 1]);
        assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
        assert_eq!(tmp, [b'\xBD'; 1]);

        assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
    }

    // An empty stream peeks as an empty `PossibleBom` and reads as EOF.
    #[test]
    fn peeker_empty() {
        let buf = [];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(PossibleBom::new(), peeker.peek_bom().unwrap());

        let mut tmp = [0; 100];
        assert_eq!(0, peeker.read(&mut tmp).unwrap());
    }

    // Streams shorter than three bytes: peeking records what was available
    // and `read` still returns those bytes.
    #[test]
    fn peeker_one() {
        let buf = [1];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 0, 0], len: 1 },
            peeker.peek_bom().unwrap()
        );

        let mut tmp = [0; 100];
        assert_eq!(1, peeker.read(&mut tmp).unwrap());
        assert_eq!(1, tmp[0]);
        assert_eq!(0, peeker.read(&mut tmp).unwrap());
    }

    #[test]
    fn peeker_two() {
        let buf = [1, 2];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 0], len: 2 },
            peeker.peek_bom().unwrap()
        );

        let mut tmp = [0; 100];
        assert_eq!(2, peeker.read(&mut tmp).unwrap());
        assert_eq!(1, tmp[0]);
        assert_eq!(2, tmp[1]);
        assert_eq!(0, peeker.read(&mut tmp).unwrap());
    }

    #[test]
    fn peeker_three() {
        let buf = [1, 2, 3];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 3], len: 3 },
            peeker.peek_bom().unwrap()
        );

        let mut tmp = [0; 100];
        assert_eq!(3, peeker.read(&mut tmp).unwrap());
        assert_eq!(1, tmp[0]);
        assert_eq!(2, tmp[1]);
        assert_eq!(3, tmp[2]);
        assert_eq!(0, peeker.read(&mut tmp).unwrap());
    }

    // With more than three bytes, the first `read` returns only the peeked
    // bytes; the remainder arrives on the following call.
    #[test]
    fn peeker_four() {
        let buf = [1, 2, 3, 4];
        let mut peeker = BomPeeker::with_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 3], len: 3 },
            peeker.peek_bom().unwrap()
        );

        let mut tmp = [0; 100];
        assert_eq!(3, peeker.read(&mut tmp).unwrap());
        assert_eq!(1, tmp[0]);
        assert_eq!(2, tmp[1]);
        assert_eq!(3, tmp[2]);
        assert_eq!(1, peeker.read(&mut tmp).unwrap());
        assert_eq!(4, tmp[0]);
        assert_eq!(0, peeker.read(&mut tmp).unwrap());
    }

    // Reading through a one-byte buffer (after a zero-length read) yields
    // every byte in order.
    #[test]
    fn peeker_one_at_a_time() {
        let buf = [1, 2, 3, 4];
        let mut peeker = BomPeeker::with_bom(&buf[..]);

        let mut tmp = [0; 1];
        assert_eq!(0, peeker.read(&mut tmp[..0]).unwrap());
        assert_eq!(0, tmp[0]);
        assert_eq!(1, peeker.read(&mut tmp).unwrap());
        assert_eq!(1, tmp[0]);
        assert_eq!(1, peeker.read(&mut tmp).unwrap());
        assert_eq!(2, tmp[0]);
        assert_eq!(1, peeker.read(&mut tmp).unwrap());
        assert_eq!(3, tmp[0]);
        assert_eq!(1, peeker.read(&mut tmp).unwrap());
        assert_eq!(4, tmp[0]);
    }

    // `without_bom` hides a UTF-8 BOM from `read` while `peek_bom` still
    // reports it.
    #[test]
    fn peeker_without_bom() {
        let buf = [b'\xEF', b'\xBB', b'\xBF', b'a'];
        let mut peeker = BomPeeker::without_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [b'\xEF', b'\xBB', b'\xBF'], len: 3 },
            peeker.peek_bom().unwrap()
        );

        let mut tmp = [0; 100];
        assert_eq!(1, peeker.read(&mut tmp).unwrap());
        assert_eq!(b'a', tmp[0]);
        assert_eq!(0, peeker.read(&mut tmp).unwrap());
    }

    // `without_bom` leaves leading bytes alone when they are not a BOM.
    #[test]
    fn peeker_without_bom_nobom() {
        let buf = [1, 2, 3, 4];
        let mut peeker = BomPeeker::without_bom(&buf[..]);
        assert_eq!(
            PossibleBom { bytes: [1, 2, 3], len: 3 },
            peeker.peek_bom().unwrap()
        );

        let mut tmp = [0; 100];
        assert_eq!(3, peeker.read(&mut tmp).unwrap());
        assert_eq!(1, tmp[0]);
        assert_eq!(2, tmp[1]);
        assert_eq!(3, tmp[2]);
        assert_eq!(1, peeker.read(&mut tmp).unwrap());
        assert_eq!(4, tmp[0]);
        assert_eq!(0, peeker.read(&mut tmp).unwrap());
    }
}
encoding_rs_io-0.1.7/.cargo_vcs_info.json0000644000000001121361726414000141160ustar00{ "git": { "sha1": "ad0a122d5c42dd2375917d32ad50fbbc7be3213d" } }