Trait encode_unicode::SliceExt [−][src]
Methods for iterating over u8
and u16
slices as UTF-8 or UTF-16 characters.
The iterators are slightly faster than the similar methods in IterExt
because they can “push back” items for free after errors and don’t need a
separate buffer that must be checked on every call to .next()
.
Required methods
fn utf8char_indices(&self) -> Utf8CharDecoder<'_>ⓘNotable traits for Utf8CharDecoder<'a>
impl<'a> Iterator for Utf8CharDecoder<'a> type Item = (usize, Result<Utf8Char, InvalidUtf8Slice>, usize);
where
Self::Output: Borrow<[u8]>,
[src]
Notable traits for Utf8CharDecoder<'a>
impl<'a> Iterator for Utf8CharDecoder<'a> type Item = (usize, Result<Utf8Char, InvalidUtf8Slice>, usize);
Self::Output: Borrow<[u8]>,
Decode u8
slices as UTF-8 and iterate over the codepoints as Utf8Char
s.
Examples
Get the index and error type of the first error:
use encode_unicode::{SliceExt, Utf8Char}; use encode_unicode::error::InvalidUtf8Slice; let slice = b"ab\0\xe0\xbc\xa9 \xf3\x80\x77"; let result = slice.utf8char_indices() .map(|(offset,r,length)| r.map_err(|e| (offset,e,length) ) ) .collect::<Result<String,(usize,InvalidUtf8Slice,usize)>>(); assert_eq!(result, Err((7, InvalidUtf8Slice::TooShort(4), 1)));
use encode_unicode::{SliceExt, Utf8Char}; use std::error::Error; let slice = b"\xf0\xbf\xbf\xbfXY\xdd\xbb\xe1\x80\x99quux123"; let mut fixed_size = [Utf8Char::default(); 8]; for (cp_i, (byte_index, r, _)) in slice.utf8char_indices().enumerate().take(8) { match r { Ok(u8c) => fixed_size[cp_i] = u8c, Err(e) => panic!("Invalid codepoint at index {} ({})", cp_i, e.description()), } } let chars = ['\u{3ffff}', 'X', 'Y', '\u{77b}', '\u{1019}', 'q', 'u', 'u']; assert_eq!(fixed_size, chars);
use encode_unicode::{SliceExt, Utf8Char}; use encode_unicode::error::InvalidUtf8Slice::*; use encode_unicode::error::{InvalidUtf8, InvalidUtf8FirstByte, InvalidCodepoint}; let bytes = b"\xfa-\xf4\x8f\xee\xa1\x8f-\xed\xa9\x87\xf0\xcc\xbb"; let mut errors = Vec::new(); let mut lengths = Vec::new(); let mut string = String::new(); for (offset,result,length) in bytes.utf8char_indices() { lengths.push((offset,length)); let c = result.unwrap_or_else(|error| { errors.push((offset,error)); Utf8Char::from('\u{fffd}') // replacement character }); string.push_str(c.as_str()); } assert_eq!(string, "�-��\u{e84f}-����\u{33b}"); assert_eq!(lengths, [(0,1), (1,1), (2,1), (3,1), (4,3), (7,1), (8,1), (9,1), (10,1), (11,1), (12,2)]); assert_eq!(errors, [ ( 0, Utf8(InvalidUtf8::FirstByte(InvalidUtf8FirstByte::TooLongSeqence))), ( 2, Utf8(InvalidUtf8::NotAContinuationByte(2))), ( 3, Utf8(InvalidUtf8::FirstByte(InvalidUtf8FirstByte::ContinuationByte))), ( 8, Codepoint(InvalidCodepoint::Utf16Reserved)), ( 9, Utf8(InvalidUtf8::FirstByte(InvalidUtf8FirstByte::ContinuationByte))), (10, Utf8(InvalidUtf8::FirstByte(InvalidUtf8FirstByte::ContinuationByte))), (11, TooShort(4)), // (but it was not the last element returned!) ]);
fn utf16char_indices(&self) -> Utf16CharDecoder<'_>ⓘNotable traits for Utf16CharDecoder<'a>
impl<'a> Iterator for Utf16CharDecoder<'a> type Item = (usize, Result<Utf16Char, Utf16PairError>, usize);
where
Self::Output: Borrow<[u16]>,
[src]
Notable traits for Utf16CharDecoder<'a>
impl<'a> Iterator for Utf16CharDecoder<'a> type Item = (usize, Result<Utf16Char, Utf16PairError>, usize);
Self::Output: Borrow<[u16]>,
Decode u16
slices as UTF-16 and iterate over the codepoints as Utf16Char
s.
The iterator produces (usize,Result<Utf16Char,Utf16PairError>,usize)
,
and the slice is validated as you go.
The first usize
contains the offset from the start of the slice and
the last usize
contains the length of the codepoint or error.
The length is either 1 or 2, and always 1 for errors.
Examples
use encode_unicode::{SliceExt, Utf8Char}; let slice = &['a' as u16, 0xdf00, 0xd83c, 0xdca0][..]; let mut errors = Vec::new(); let string = slice.utf16char_indices().map(|(offset,r,_)| match r { Ok(u16c) => Utf8Char::from(u16c), Err(_) => { errors.push(offset); Utf8Char::from('\u{fffd}') // REPLACEMENT_CHARACTER } }).collect::<String>(); assert_eq!(string, "a�🂠"); assert_eq!(errors, [1]);
Search for a codepoint and return its unit and codepoint index.
use encode_unicode::{SliceExt, Utf16Char}; let slice = [0xd875,/*'𝕏'*/ 0xdd4f, '≈' as u16, '2' as u16]; let position = slice.utf16char_indices() .enumerate() .find(|&(_,(_,r,_))| r == Ok(Utf16Char::from('≈')) ) .map(|(codepoint, (offset, _, _))| (codepoint, offset) ); assert_eq!(position, Some((1,2)));
Error types:
use encode_unicode::{SliceExt, Utf16Char}; use encode_unicode::error::Utf16PairError::*; let slice = [0xdcba, 0xdeff, 0xd8be, 0xdeee, 'λ' as u16, 0xdab1, 0xdab1]; let mut iter = slice.utf16char_indices(); assert_eq!(iter.next(), Some((0, Err(UnexpectedTrailingSurrogate), 1))); assert_eq!(iter.next(), Some((1, Err(UnexpectedTrailingSurrogate), 1))); assert_eq!(iter.next(), Some((2, Ok(Utf16Char::from('\u{3faee}')), 2))); assert_eq!(iter.next(), Some((4, Ok(Utf16Char::from('λ')), 1))); assert_eq!(iter.next(), Some((5, Err(UnmatchedLeadingSurrogate), 1))); assert_eq!(iter.next(), Some((6, Err(Incomplete), 1))); assert_eq!(iter.next(), None); assert_eq!(iter.as_slice(), [])
Implementors
impl<S: ?Sized + Index<RangeFull>> SliceExt for S
[src]
fn utf8char_indices(&self) -> Utf8CharDecoder<'_>ⓘNotable traits for Utf8CharDecoder<'a>
impl<'a> Iterator for Utf8CharDecoder<'a> type Item = (usize, Result<Utf8Char, InvalidUtf8Slice>, usize);
where
Self::Output: Borrow<[u8]>,
[src]
Notable traits for Utf8CharDecoder<'a>
impl<'a> Iterator for Utf8CharDecoder<'a> type Item = (usize, Result<Utf8Char, InvalidUtf8Slice>, usize);
Self::Output: Borrow<[u8]>,
fn utf16char_indices(&self) -> Utf16CharDecoder<'_>ⓘNotable traits for Utf16CharDecoder<'a>
impl<'a> Iterator for Utf16CharDecoder<'a> type Item = (usize, Result<Utf16Char, Utf16PairError>, usize);
where
Self::Output: Borrow<[u16]>,
[src]
Notable traits for Utf16CharDecoder<'a>
impl<'a> Iterator for Utf16CharDecoder<'a> type Item = (usize, Result<Utf16Char, Utf16PairError>, usize);
Self::Output: Borrow<[u16]>,