|
| 1 | +# bytes2chars |
| 2 | + |
| 3 | +lazily decodes utf-8 [`char`]s from bytes |
| 4 | + |
| 5 | +provides lazy, fallible analogs to [`str::Chars`] ([`Utf8Chars`]) and [`str::CharIndices`] ([`Utf8CharIndices`]), as well as a lower-level push-based [`Utf8Decoder`] |
| 6 | + |
| 7 | +[`str::Chars`]: core::str::Chars |
| 8 | +[`str::CharIndices`]: core::str::CharIndices |
| 9 | +[`Utf8Chars`]: crate::Utf8Chars |
| 10 | +[`Utf8CharIndices`]: crate::Utf8CharIndices |
| 11 | + |
| 12 | +## design goals |
| 13 | + |
| 14 | +- rich errors—what went wrong and where |
| 15 | +- lazy |
| 16 | +- `no_std` |
| 17 | +- performance |
| 18 | + |
| 19 | +## quick start |
| 20 | + |
| 21 | +prefer iterators like [`Utf8CharIndices`] or [`Utf8Chars`] if you have access to a byte iterator. [`Utf8Chars`] still tracks byte positions for error context, so it's purely a convenience wrapper |
| 22 | + |
| 23 | +if you receive bytes in chunks, use the push-based [`Utf8Decoder`] |
| 24 | + |
| 25 | +## examples |
| 26 | + |
| 27 | +### iterator api |
| 28 | + |
| 29 | +```rust |
| 30 | +# use bytes2chars::{Result, Utf8CharIndices, Utf8Chars}; |
| 31 | +# fn main() -> Result<()> { |
| 32 | +let input = b"\xF0\x9F\xA6\x80 rust".iter().copied(); |
| 33 | + |
| 34 | +// decode into an iterator of chars and their positions |
| 35 | +let indexed = Utf8CharIndices::from(input.clone()).collect::<Result<Vec<_>>>()?; |
| 36 | +let expected = vec![(0, '🦀'), (4, ' '), (5, 'r'), (6, 'u'), (7, 's'), (8, 't')]; |
| 37 | +assert_eq!(indexed, expected); |
| 38 | + |
| 39 | +// convenience wrapper to decode into an iterator of chars |
| 40 | +let chars = Utf8Chars::from(input).collect::<Result<String>>()?; |
| 41 | +assert_eq!(chars, "🦀 rust"); |
| 42 | +# Ok(()) |
| 43 | +# } |
| 44 | +``` |
| 45 | + |
| 46 | +### error handling |
| 47 | + |
| 48 | +```rust |
| 49 | +# use bytes2chars::{Error, ErrorKind, Result, Utf8Chars}; |
| 50 | +# fn main() -> Result<()> { |
| 51 | +let err = Utf8Chars::from(b"hello \x80 world".iter().copied()) |
| 52 | + .collect::<Result<String>>() |
| 53 | + .unwrap_err(); |
| 54 | + |
| 55 | +assert_eq!(err, Error { range: 6..7, kind: ErrorKind::InvalidLead(0x80) }); |
| 56 | +assert_eq!( |
| 57 | + err.to_string(), |
| 58 | + "invalid utf-8 at bytes 6..7: byte 0x80 cannot start a UTF-8 sequence" |
| 59 | +); |
| 60 | +# Ok(()) |
| 61 | +# } |
| 62 | +``` |
| 63 | + |
| 64 | +### push-based decoder |
| 65 | + |
| 66 | +```rust |
| 67 | +# use bytes2chars::Utf8Decoder; |
| 68 | +# fn main() -> bytes2chars::Result<()> { |
| 69 | +let mut decoder = Utf8Decoder::new(0); |
| 70 | +assert_eq!(decoder.push(0xF0), None); // accumulating |
| 71 | +assert_eq!(decoder.push(0x9F), None); |
| 72 | +assert_eq!(decoder.push(0xA6), None); |
| 73 | +assert_eq!(decoder.push(0x80), Some(Ok((0, '🦀')))); // complete |
| 74 | +decoder.finish()?; // check for truncated sequence |
| 75 | +# Ok(()) |
| 76 | +# } |
| 77 | +``` |
| 78 | + |
| 79 | +## alternatives |
| 80 | + |
| 81 | +### [`std::str::from_utf8`](https://doc.rust-lang.org/std/str/fn.from_utf8.html) |
| 82 | + |
| 83 | +eager, and its error provides a range but not a particular cause |
| 84 | + |
| 85 | +### [`utf8-decode`](https://docs.rs/utf8-decode/latest/utf8_decode/index.html) |
| 86 | + |
| 87 | +also lazy. its error provides a range but not a particular cause. does not provide a push-based decoder |
0 commit comments