Skip to content

Commit 5d6d68e

Browse files
committed
feat: add bytes2chars crate
1 parent 1a548bf commit 5d6d68e

7 files changed

Lines changed: 1012 additions & 14 deletions

File tree

Cargo.lock

Lines changed: 25 additions & 13 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ path = "tests/integration/main.rs"
2222

2323

2424
[workspace]
25-
members = ["crates/ui", "crates/parse", "xtask"]
25+
members = ["crates/ui", "crates/parse", "xtask", "crates/bytes2chars"]
2626

2727
[workspace.dependencies]
2828
jjpwrgem-parse = { path = "crates/parse", version = "0.3.0" }

crates/bytes2chars/Cargo.toml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
[package]
2+
name = "bytes2chars"
3+
version = "0.1.0"
4+
edition = "2024"
5+
rust-version = "1.85"
6+
description = "lazy utf-8 decoder iterator with rich errors"
7+
license = "MIT"
8+
repository = "https://github.com/20jasper/jjpwrgem"
9+
documentation = "https://docs.rs/bytes2chars"
10+
keywords = ["utf8", "unicode", "iterator", "decoder"]
11+
categories = ["encoding", "no-std"]
12+
13+
[lints.rust]
14+
missing_debug_implementations = "warn"
15+
missing_docs = "warn"
16+
17+
[lints.rustdoc]
18+
broken_intra_doc_links = "warn"
19+
20+
[lints.clippy]
21+
cast_lossless = "warn"
22+
doc_markdown = "warn"
23+
missing_errors_doc = "warn"
24+
missing_panics_doc = "warn"
25+
undocumented_unsafe_blocks = "warn"
26+
unreadable_literal = "warn"
27+
28+
[dependencies]
29+
displaydoc.workspace = true

crates/bytes2chars/README.md

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# bytes2chars
2+
3+
lazily decodes utf-8 [`char`]s from bytes
4+
5+
provides lazy, fallible analogs to [`str::Chars`] ([`Utf8Chars`]) and [`str::CharIndices`] ([`Utf8CharIndices`]), as well as a lower-level push-based [`Utf8Decoder`]
6+
7+
[`str::Chars`]: core::str::Chars
8+
[`str::CharIndices`]: core::str::CharIndices
9+
[`Utf8Chars`]: crate::Utf8Chars
10+
[`Utf8CharIndices`]: crate::Utf8CharIndices
11+
12+
## design goals
13+
14+
- rich errors—what went wrong and where
15+
- lazy
16+
- `no_std`
17+
- performance
18+
19+
## quick start
20+
21+
prefer iterators like [`Utf8CharIndices`] or [`Utf8Chars`] if you have access to a byte iterator. [`Utf8Chars`] still tracks byte offsets internally so that errors can report where they occurred, so it's purely a convenience wrapper that drops the index from each item
22+
23+
if you receive bytes in chunks, use the push-based [`Utf8Decoder`]
24+
25+
## examples
26+
27+
### iterator api
28+
29+
```rust
30+
# use bytes2chars::{Result, Utf8CharIndices, Utf8Chars};
31+
# fn main() -> Result<()> {
32+
let input = b"\xF0\x9F\xA6\x80 rust".iter().copied();
33+
34+
// decode into an iterator of chars and their positions
35+
let indexed = Utf8CharIndices::from(input.clone()).collect::<Result<Vec<_>>>()?;
36+
let expected = vec![(0, '🦀'), (4, ' '), (5, 'r'), (6, 'u'), (7, 's'), (8, 't')];
37+
assert_eq!(indexed, expected);
38+
39+
// convenience wrapper to decode into an iterator of chars
40+
let chars = Utf8Chars::from(input).collect::<Result<String>>()?;
41+
assert_eq!(chars, "🦀 rust");
42+
# Ok(())
43+
# }
44+
```
45+
46+
### error handling
47+
48+
```rust
49+
# use bytes2chars::{Error, ErrorKind, Result, Utf8Chars};
50+
# fn main() -> Result<()> {
51+
let err = Utf8Chars::from(b"hello \x80 world".iter().copied())
52+
.collect::<Result<String>>()
53+
.unwrap_err();
54+
55+
assert_eq!(err, Error { range: 6..7, kind: ErrorKind::InvalidLead(0x80) });
56+
assert_eq!(
57+
err.to_string(),
58+
"invalid utf-8 at bytes 6..7: byte 0x80 cannot start a UTF-8 sequence"
59+
);
60+
# Ok(())
61+
# }
62+
```
63+
64+
### push-based decoder
65+
66+
```rust
67+
# use bytes2chars::Utf8Decoder;
68+
# fn main() -> bytes2chars::Result<()> {
69+
let mut decoder = Utf8Decoder::new(0);
70+
assert_eq!(decoder.push(0xF0), None); // accumulating
71+
assert_eq!(decoder.push(0x9F), None);
72+
assert_eq!(decoder.push(0xA6), None);
73+
assert_eq!(decoder.push(0x80), Some(Ok((0, '🦀')))); // complete
74+
decoder.finish()?; // check for truncated sequence
75+
# Ok(())
76+
# }
77+
```
78+
79+
## alternatives
80+
81+
### [`std::str::from_utf8`](https://doc.rust-lang.org/std/str/fn.from_utf8.html)
82+
83+
eager, and its error context provides a range but not a particular cause
84+
85+
### [`utf8-decode`](https://docs.rs/utf8-decode/latest/utf8_decode/index.html)
86+
87+
also lazy. error provides a range but not a particular cause. does not provide a push-based decoder

0 commit comments

Comments
 (0)