Skip to content

Commit a782c3c

Browse files
feat: add support for preserving characters when decoding
1 parent 5505565 commit a782c3c

File tree

1 file changed

+104
-13
lines changed

1 file changed

+104
-13
lines changed

percent_encoding/src/lib.rs

+104-13
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ use core::{fmt, mem, ops, slice, str};
6666
/// /// https://url.spec.whatwg.org/#fragment-percent-encode-set
6767
/// const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');
6868
/// ```
69-
#[derive(Debug, PartialEq, Eq)]
69+
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
7070
pub struct AsciiSet {
7171
mask: [Chunk; ASCII_RANGE_LEN / BITS_PER_CHUNK],
7272
}
@@ -79,7 +79,7 @@ const BITS_PER_CHUNK: usize = 8 * mem::size_of::<Chunk>();
7979

8080
impl AsciiSet {
8181
/// An empty set.
82-
pub const EMPTY: AsciiSet = AsciiSet {
82+
pub const EMPTY: &'static AsciiSet = &AsciiSet {
8383
mask: [0; ASCII_RANGE_LEN / BITS_PER_CHUNK],
8484
};
8585

@@ -108,7 +108,7 @@ impl AsciiSet {
108108
}
109109

110110
/// Return the union of two sets.
111-
pub const fn union(&self, other: Self) -> Self {
111+
pub const fn union(&self, other: &Self) -> Self {
112112
let mask = [
113113
self.mask[0] | other.mask[0],
114114
self.mask[1] | other.mask[1],
@@ -128,15 +128,31 @@ impl AsciiSet {
128128
impl ops::Add for AsciiSet {
129129
type Output = Self;
130130

131-
fn add(self, other: Self) -> Self {
131+
fn add(self, other: Self) -> Self::Output {
132+
self.union(&other)
133+
}
134+
}
135+
136+
impl ops::Add for &AsciiSet {
137+
type Output = AsciiSet;
138+
139+
fn add(self, other: Self) -> Self::Output {
132140
self.union(other)
133141
}
134142
}
135143

136144
impl ops::Not for AsciiSet {
137145
type Output = Self;
138146

139-
fn not(self) -> Self {
147+
fn not(self) -> Self::Output {
148+
self.complement()
149+
}
150+
}
151+
152+
impl ops::Not for &AsciiSet {
153+
type Output = AsciiSet;
154+
155+
fn not(self) -> Self::Output {
140156
self.complement()
141157
}
142158
}
@@ -268,7 +284,7 @@ pub fn percent_encode_byte(byte: u8) -> &'static str {
268284
/// assert_eq!(percent_encode(b"foo bar?", NON_ALPHANUMERIC).to_string(), "foo%20bar%3F");
269285
/// ```
270286
#[inline]
271-
pub fn percent_encode<'a>(input: &'a [u8], ascii_set: &'static AsciiSet) -> PercentEncode<'a> {
287+
pub fn percent_encode<'a>(input: &'a [u8], ascii_set: &'a AsciiSet) -> PercentEncode<'a> {
272288
PercentEncode {
273289
bytes: input,
274290
ascii_set,
@@ -287,15 +303,15 @@ pub fn percent_encode<'a>(input: &'a [u8], ascii_set: &'static AsciiSet) -> Perc
287303
/// assert_eq!(utf8_percent_encode("foo bar?", NON_ALPHANUMERIC).to_string(), "foo%20bar%3F");
288304
/// ```
289305
#[inline]
290-
pub fn utf8_percent_encode<'a>(input: &'a str, ascii_set: &'static AsciiSet) -> PercentEncode<'a> {
306+
pub fn utf8_percent_encode<'a>(input: &'a str, ascii_set: &'a AsciiSet) -> PercentEncode<'a> {
291307
percent_encode(input.as_bytes(), ascii_set)
292308
}
293309

294310
/// The return type of [`percent_encode`] and [`utf8_percent_encode`].
295311
#[derive(Clone)]
296312
pub struct PercentEncode<'a> {
297313
bytes: &'a [u8],
298-
ascii_set: &'static AsciiSet,
314+
ascii_set: &'a AsciiSet,
299315
}
300316

301317
impl<'a> Iterator for PercentEncode<'a> {
@@ -372,6 +388,19 @@ pub fn percent_decode_str(input: &str) -> PercentDecode<'_> {
372388
percent_decode(input.as_bytes())
373389
}
374390

391+
/// Percent-decode the given string, leaving untouched any percent-encoded sequence whose decoded byte is in `ascii_set`.
392+
///
393+
/// <https://url.spec.whatwg.org/#string-percent-decode>
394+
///
395+
/// See [`percent_decode`] regarding the return type.
396+
#[inline]
397+
pub fn percent_decode_str_with_set<'a>(
398+
input: &'a str,
399+
ascii_set: &'a AsciiSet,
400+
) -> PercentDecode<'a> {
401+
percent_decode_with_set(input.as_bytes(), ascii_set)
402+
}
403+
375404
/// Percent-decode the given bytes.
376405
///
377406
/// <https://url.spec.whatwg.org/#percent-decode>
@@ -394,13 +423,44 @@ pub fn percent_decode_str(input: &str) -> PercentDecode<'_> {
394423
pub fn percent_decode(input: &[u8]) -> PercentDecode<'_> {
395424
PercentDecode {
396425
bytes: input.iter(),
426+
ascii_set: None,
427+
}
428+
}
429+
430+
/// Percent-decode the given bytes, leaving untouched any percent-encoded sequence whose decoded byte is in `ascii_set`.
431+
///
432+
/// <https://url.spec.whatwg.org/#percent-decode>
433+
///
434+
/// Any sequence of `%` followed by two hexadecimal digits is decoded, except when the decoded byte is in the given [`AsciiSet`].
435+
/// The return type:
436+
///
437+
/// * Implements `Into<Cow<[u8]>>` borrowing `input` when no percent-encoded sequence needs to be decoded,
438+
/// * Implements `Iterator<Item = u8>` and therefore has a `.collect::<Vec<u8>>()` method,
439+
/// * Has `decode_utf8()` and `decode_utf8_lossy()` methods.
440+
///
441+
/// # Examples
442+
///
443+
/// ```
444+
/// use percent_encoding::{percent_decode_with_set, NON_ALPHANUMERIC};
445+
///
446+
/// assert_eq!(percent_decode_with_set(b"%66oo%20bar%3f", &!NON_ALPHANUMERIC).decode_utf8().unwrap(), "%66oo bar?");
447+
/// ```
448+
#[inline]
449+
pub fn percent_decode_with_set<'a>(
450+
input: &'a [u8],
451+
ascii_set: &'a AsciiSet,
452+
) -> PercentDecode<'a> {
453+
PercentDecode {
454+
bytes: input.iter(),
455+
ascii_set: Some(ascii_set),
397456
}
398457
}
399458

400459
/// The return type of [`percent_decode`] and [`percent_decode_with_set`].
401460
#[derive(Clone, Debug)]
402461
pub struct PercentDecode<'a> {
403462
bytes: slice::Iter<'a, u8>,
463+
ascii_set: Option<&'a AsciiSet>,
404464
}
405465

406466
fn after_percent_sign(iter: &mut slice::Iter<'_, u8>) -> Option<u8> {
@@ -411,13 +471,35 @@ fn after_percent_sign(iter: &mut slice::Iter<'_, u8>) -> Option<u8> {
411471
Some(h as u8 * 0x10 + l as u8)
412472
}
413473

474+
fn after_percent_sign_lookahead<'a>(
475+
iter: &mut slice::Iter<'a, u8>,
476+
) -> Option<(u8, slice::Iter<'a, u8>)> {
477+
let mut cloned_iter = iter.clone();
478+
let h = char::from(*cloned_iter.next()?).to_digit(16)?;
479+
let l = char::from(*cloned_iter.next()?).to_digit(16)?;
480+
Some((h as u8 * 0x10 + l as u8, cloned_iter))
481+
}
482+
414483
impl<'a> Iterator for PercentDecode<'a> {
415484
type Item = u8;
416485

417486
fn next(&mut self) -> Option<u8> {
418487
self.bytes.next().map(|&byte| {
419-
if byte == b'%' {
420-
after_percent_sign(&mut self.bytes).unwrap_or(byte)
488+
if byte != b'%' {
489+
return byte;
490+
}
491+
492+
let Some((decoded_byte, iter)) = after_percent_sign_lookahead(&mut self.bytes) else {
493+
return byte;
494+
};
495+
496+
let should_decode = self
497+
.ascii_set
498+
.map_or(true, |ascii_set| !ascii_set.contains(decoded_byte));
499+
500+
if should_decode {
501+
self.bytes = iter;
502+
decoded_byte
421503
} else {
422504
byte
423505
}
@@ -447,11 +529,20 @@ impl<'a> PercentDecode<'a> {
447529
let mut bytes_iter = self.bytes.clone();
448530
while bytes_iter.any(|&b| b == b'%') {
449531
if let Some(decoded_byte) = after_percent_sign(&mut bytes_iter) {
532+
if let Some(ascii_set) = self.ascii_set {
533+
if ascii_set.contains(decoded_byte) {
534+
continue;
535+
}
536+
}
537+
450538
let initial_bytes = self.bytes.as_slice();
451539
let unchanged_bytes_len = initial_bytes.len() - bytes_iter.len() - 3;
452540
let mut decoded = initial_bytes[..unchanged_bytes_len].to_owned();
453541
decoded.push(decoded_byte);
454-
decoded.extend(PercentDecode { bytes: bytes_iter });
542+
decoded.extend(PercentDecode {
543+
bytes: bytes_iter,
544+
ascii_set: self.ascii_set,
545+
});
455546
return Some(decoded);
456547
}
457548
}
@@ -542,8 +633,8 @@ mod tests {
542633
/// useful for defining sets in a modular way.
543634
#[test]
544635
fn union() {
545-
const A: AsciiSet = AsciiSet::EMPTY.add(b'A');
546-
const B: AsciiSet = AsciiSet::EMPTY.add(b'B');
636+
const A: &AsciiSet = &AsciiSet::EMPTY.add(b'A');
637+
const B: &AsciiSet = &AsciiSet::EMPTY.add(b'B');
547638
const UNION: AsciiSet = A.union(B);
548639
const EXPECTED: AsciiSet = AsciiSet::EMPTY.add(b'A').add(b'B');
549640
assert_eq!(UNION, EXPECTED);

0 commit comments

Comments
 (0)