From 893c4fe9a7728ad6318c4368f2d27004ae40fe1e Mon Sep 17 00:00:00 2001 From: Jonathan Kew Date: Fri, 1 Mar 2024 11:38:24 +0000 Subject: [PATCH 1/4] Use a simplified codepath if no bidi isolation controls are present. If there aren't any bidi isolation control characters in the paragraph, each level run directly corresponds to one isolating run sequence. In this case we can take a simplified codepath to process them. --- src/lib.rs | 68 ++++++++++++----- src/prepare.rs | 200 ++++++++++++++++++++++++++++++++----------------- src/utf16.rs | 24 +++--- 3 files changed, 192 insertions(+), 100 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index dfe3b22..8d1067c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -248,8 +248,14 @@ struct InitialInfoExt<'text> { /// Parallel to base.paragraphs, records whether each paragraph is "pure LTR" that /// requires no further bidi processing (i.e. there are no RTL characters or bidi - /// control codes present). - pure_ltr: Vec, + /// control codes present), and whether any bidi isolation controls are present. + flags: Vec, +} + +#[derive(PartialEq, Debug)] +struct ParagraphInfoFlags { + is_pure_ltr: bool, + has_isolate_controls: bool, } impl<'text> InitialInfoExt<'text> { @@ -269,12 +275,12 @@ impl<'text> InitialInfoExt<'text> { default_para_level: Option, ) -> InitialInfoExt<'a> { let mut paragraphs = Vec::::new(); - let mut pure_ltr = Vec::::new(); - let (original_classes, _, _) = compute_initial_info( + let mut flags = Vec::::new(); + let (original_classes, _, _, _) = compute_initial_info( data_source, text, default_para_level, - Some((&mut paragraphs, &mut pure_ltr)), + Some((&mut paragraphs, &mut flags)), ); InitialInfoExt { @@ -283,7 +289,7 @@ impl<'text> InitialInfoExt<'text> { original_classes, paragraphs, }, - pure_ltr, + flags, } } } @@ -299,8 +305,8 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( data_source: &D, text: &'a T, default_para_level: Option, - mut split_paragraphs: Option<(&mut Vec, &mut Vec)>, -) -> (Vec, Level, bool) { + mut split_paragraphs: Option<(&mut Vec, &mut Vec)>, +) -> (Vec, Level, bool, bool) { let mut original_classes = Vec::with_capacity(text.len()); // The stack contains the starting code unit index for each nested isolate we're inside. @@ -310,8 +316,8 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( let mut isolate_stack = Vec::new(); debug_assert!( - if let Some((ref paragraphs, ref pure_ltr)) = split_paragraphs { - paragraphs.is_empty() && pure_ltr.is_empty() + if let Some((ref paragraphs, ref flags)) = split_paragraphs { + paragraphs.is_empty() && flags.is_empty() } else { true } @@ -323,6 +329,8 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( // Per-paragraph flag: can subsequent processing be skipped? Set to false if any // RTL characters or bidi control characters are encountered in the paragraph. let mut is_pure_ltr = true; + // Set to true if any bidi isolation controls are present in the paragraph. + let mut has_isolate_controls = false; #[cfg(feature = "flame_it")] flame::start("compute_initial_info(): iter text.char_indices()"); @@ -341,7 +349,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( match class { B => { - if let Some((ref mut paragraphs, ref mut pure_ltr)) = split_paragraphs { + if let Some((ref mut paragraphs, ref mut flags)) = split_paragraphs { // P1. Split the text into separate paragraphs. The paragraph separator is kept // with the previous paragraph. let para_end = i + len; @@ -350,7 +358,10 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( // P3. If no character is found in p2, set the paragraph level to zero. level: para_level.unwrap_or(LTR_LEVEL), }); - pure_ltr.push(is_pure_ltr); + flags.push(ParagraphInfoFlags { + is_pure_ltr, + has_isolate_controls, + }); // Reset state for the start of the next paragraph. para_start = para_end; // TODO: Support defaulting to direction of previous paragraph @@ -358,6 +369,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( // para_level = default_para_level; is_pure_ltr = true; + has_isolate_controls = false; isolate_stack.clear(); } } @@ -394,6 +406,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( RLI | LRI | FSI => { is_pure_ltr = false; + has_isolate_controls = true; isolate_stack.push(i); } @@ -405,15 +418,18 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( } } - if let Some((paragraphs, pure_ltr)) = split_paragraphs { + if let Some((paragraphs, flags)) = split_paragraphs { if para_start < text.len() { paragraphs.push(ParagraphInfo { range: para_start..text.len(), level: para_level.unwrap_or(LTR_LEVEL), }); - pure_ltr.push(is_pure_ltr); + flags.push(ParagraphInfoFlags { + is_pure_ltr, + has_isolate_controls, + }); } - debug_assert_eq!(paragraphs.len(), pure_ltr.len()); + debug_assert_eq!(paragraphs.len(), flags.len()); } debug_assert_eq!(original_classes.len(), text.len()); @@ -424,6 +440,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( original_classes, para_level.unwrap_or(LTR_LEVEL), is_pure_ltr, + has_isolate_controls, ) } @@ -482,20 +499,21 @@ impl<'text> BidiInfo<'text> { text: &'a str, default_para_level: Option, ) -> BidiInfo<'a> { - let InitialInfoExt { base, pure_ltr, .. } = + let InitialInfoExt { base, flags, .. } = InitialInfoExt::new_with_data_source(data_source, text, default_para_level); let mut levels = Vec::::with_capacity(text.len()); let mut processing_classes = base.original_classes.clone(); - for (para, is_pure_ltr) in base.paragraphs.iter().zip(pure_ltr.iter()) { + for (para, flags) in base.paragraphs.iter().zip(flags.iter()) { let text = &text[para.range.clone()]; let original_classes = &base.original_classes[para.range.clone()]; compute_bidi_info_for_para( data_source, para, - *is_pure_ltr, + flags.is_pure_ltr, + flags.has_isolate_controls, text, original_classes, &mut processing_classes, @@ -720,7 +738,7 @@ impl<'text> ParagraphBidiInfo<'text> { ) -> ParagraphBidiInfo<'a> { // Here we could create a ParagraphInitialInfo struct to parallel the one // used by BidiInfo, but there doesn't seem any compelling reason for it. - let (original_classes, paragraph_level, is_pure_ltr) = + let (original_classes, paragraph_level, is_pure_ltr, has_isolate_controls) = compute_initial_info(data_source, text, default_para_level, None); let mut levels = Vec::::with_capacity(text.len()); @@ -738,6 +756,7 @@ impl<'text> ParagraphBidiInfo<'text> { data_source, ¶_info, is_pure_ltr, + has_isolate_controls, text, &original_classes, &mut processing_classes, @@ -1066,6 +1085,7 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized> data_source: &D, para: &ParagraphInfo, is_pure_ltr: bool, + has_isolate_controls: bool, text: &'a T, original_classes: &[BidiClass], processing_classes: &mut [BidiClass], @@ -1088,7 +1108,14 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized> processing_classes, ); - let sequences = prepare::isolating_run_sequences(para.level, original_classes, levels); + let mut sequences = prepare::IsolatingRunSequenceVec::new(); + prepare::isolating_run_sequences( + para.level, + original_classes, + levels, + has_isolate_controls, + &mut sequences, + ); for sequence in &sequences { implicit::resolve_weak(text, sequence, processing_classes); implicit::resolve_neutral( @@ -1100,6 +1127,7 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized> processing_classes, ); } + implicit::resolve_levels(processing_classes, levels); assign_levels_to_removed_chars(para.level, original_classes, levels); diff --git a/src/prepare.rs b/src/prepare.rs index f864229..688ff9e 100644 --- a/src/prepare.rs +++ b/src/prepare.rs @@ -33,6 +33,11 @@ pub struct IsolatingRunSequence { pub eos: BidiClass, // End-of-sequence type. } +#[cfg(feature = "smallvec")] +pub type IsolatingRunSequenceVec = SmallVec<[IsolatingRunSequence; 8]>; +#[cfg(not(feature = "smallvec"))] +pub type IsolatingRunSequenceVec = Vec; + /// Compute the set of isolating run sequences. /// /// An isolating run sequence is a maximal sequence of level runs such that for all level runs @@ -45,9 +50,61 @@ pub fn isolating_run_sequences( para_level: Level, original_classes: &[BidiClass], levels: &[Level], -) -> Vec { + has_isolate_controls: bool, + isolating_run_sequences: &mut IsolatingRunSequenceVec, +) { let runs = level_runs(levels, original_classes); + // Per http://www.unicode.org/reports/tr9/#BD13: + // "In the absence of isolate initiators, each isolating run sequence in a paragraph + // consists of exactly one level run, and each level run constitutes a separate + // isolating run sequence." + // We can take a simplified path to handle this case. + if !has_isolate_controls { + isolating_run_sequences.reserve_exact(runs.len()); + for run in runs { + // Determine the `sos` and `eos` class for the sequence. + // + + let run_levels = &levels[run.clone()]; + let run_classes = &original_classes[run.clone()]; + let seq_level = run_levels[run_classes + .iter() + .position(|c| not_removed_by_x9(c)) + .unwrap_or(0)]; + + let end_level = run_levels[run_classes + .iter() + .rposition(|c| not_removed_by_x9(c)) + .unwrap_or(run.end - run.start - 1)]; + + // Get the level of the last non-removed char before the run. + let pred_level = match original_classes[..run.start] + .iter() + .rposition(not_removed_by_x9) + { + Some(idx) => levels[idx], + None => para_level, + }; + + // Get the level of the next non-removed char after the run. + let succ_level = match original_classes[run.end..] + .iter() + .position(not_removed_by_x9) + { + Some(idx) => levels[run.end + idx], + None => para_level, + }; + + isolating_run_sequences.push(IsolatingRunSequence { + runs: vec![run], + sos: max(seq_level, pred_level).bidi_class(), + eos: max(end_level, succ_level).bidi_class(), + }); + } + return; + } + // Compute the set of isolating run sequences. // let mut sequences = Vec::with_capacity(runs.len()); @@ -98,79 +155,77 @@ pub fn isolating_run_sequences( // Determine the `sos` and `eos` class for each sequence. // - sequences - .into_iter() - .map(|sequence: Vec| { - assert!(!sequence.is_empty()); + for sequence in sequences { + assert!(!sequence.is_empty()); - let mut result = IsolatingRunSequence { - runs: sequence, - sos: L, - eos: L, - }; + let start_of_seq = sequence[0].start; + let runs_len = sequence.len(); + let end_of_seq = sequence[runs_len - 1].end; + + let mut result = IsolatingRunSequence { + runs: sequence, + sos: L, + eos: L, + }; - let start_of_seq = result.runs[0].start; - let runs_len = result.runs.len(); - let end_of_seq = result.runs[runs_len - 1].end; - - // > (not counting characters removed by X9) - let seq_level = levels[result - .iter_forwards_from(start_of_seq, 0) - .find(|i| not_removed_by_x9(&original_classes[*i])) - .unwrap_or(start_of_seq)]; - - // XXXManishearth the spec talks of a start and end level, - // but for a given IRS the two should be equivalent, yes? - let end_level = levels[result - .iter_backwards_from(end_of_seq, runs_len - 1) - .find(|i| not_removed_by_x9(&original_classes[*i])) - .unwrap_or(end_of_seq - 1)]; - - #[cfg(test)] - for idx in result.runs.clone().into_iter().flatten() { - if not_removed_by_x9(&original_classes[idx]) { - assert_eq!(seq_level, levels[idx]); - } + // > (not counting characters removed by X9) + let seq_level = levels[result + .iter_forwards_from(start_of_seq, 0) + .find(|i| not_removed_by_x9(&original_classes[*i])) + .unwrap_or(start_of_seq)]; + + // XXXManishearth the spec talks of a start and end level, + // but for a given IRS the two should be equivalent, yes? + let end_level = levels[result + .iter_backwards_from(end_of_seq, runs_len - 1) + .find(|i| not_removed_by_x9(&original_classes[*i])) + .unwrap_or(end_of_seq - 1)]; + + #[cfg(test)] + for idx in result.runs.clone().into_iter().flatten() { + if not_removed_by_x9(&original_classes[idx]) { + assert_eq!(seq_level, levels[idx]); } + } + + // Get the level of the last non-removed char before the runs. + let pred_level = match original_classes[..start_of_seq] + .iter() + .rposition(not_removed_by_x9) + { + Some(idx) => levels[idx], + None => para_level, + }; + + // Get the last non-removed character to check if it is an isolate initiator. + // The spec calls for an unmatched one, but matched isolate initiators + // will never be at the end of a level run (otherwise there would be more to the run). + // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check. + let last_non_removed = original_classes[..end_of_seq] + .iter() + .copied() + .rev() + .find(not_removed_by_x9) + .unwrap_or(BN); - // Get the level of the last non-removed char before the runs. - let pred_level = match original_classes[..start_of_seq] + // Get the level of the next non-removed char after the runs. + let succ_level = if matches!(last_non_removed, RLI | LRI | FSI) { + para_level + } else { + match original_classes[end_of_seq..] .iter() - .rposition(not_removed_by_x9) + .position(not_removed_by_x9) { - Some(idx) => levels[idx], + Some(idx) => levels[end_of_seq + idx], None => para_level, - }; + } + }; - // Get the last non-removed character to check if it is an isolate initiator. - // The spec calls for an unmatched one, but matched isolate initiators - // will never be at the end of a level run (otherwise there would be more to the run). - // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check. - let last_non_removed = original_classes[..end_of_seq] - .iter() - .copied() - .rev() - .find(not_removed_by_x9) - .unwrap_or(BN); - - // Get the level of the next non-removed char after the runs. - let succ_level = if matches!(last_non_removed, RLI | LRI | FSI) { - para_level - } else { - match original_classes[end_of_seq..] - .iter() - .position(not_removed_by_x9) - { - Some(idx) => levels[end_of_seq + idx], - None => para_level, - } - }; + result.sos = max(seq_level, pred_level).bidi_class(); + result.eos = max(end_level, succ_level).bidi_class(); - result.sos = max(seq_level, pred_level).bidi_class(); - result.eos = max(end_level, succ_level).bidi_class(); - result - }) - .collect() + isolating_run_sequences.push(result); + } } impl IsolatingRunSequence { @@ -276,7 +331,8 @@ mod tests { let classes = &[L, RLE, L, PDF, RLE, L, PDF, L]; let levels = &[0, 1, 1, 1, 1, 1, 1, 0]; let para_level = Level::ltr(); - let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); + let mut sequences = IsolatingRunSequenceVec::new(); + isolating_run_sequences(para_level, classes, &Level::vec(levels), false, &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); assert_eq!( sequences.iter().map(|s| s.runs.clone()).collect::>(), @@ -289,7 +345,8 @@ mod tests { let classes = &[L, RLI, L, PDI, RLI, L, PDI, L]; let levels = &[0, 0, 1, 0, 0, 1, 0, 0]; let para_level = Level::ltr(); - let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); + let mut sequences = IsolatingRunSequenceVec::new(); + isolating_run_sequences(para_level, classes, &Level::vec(levels), true, &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); assert_eq!( sequences.iter().map(|s| s.runs.clone()).collect::>(), @@ -302,7 +359,8 @@ mod tests { let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI, L]; let levels = &[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0]; let para_level = Level::ltr(); - let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); + let mut sequences = IsolatingRunSequenceVec::new(); + isolating_run_sequences(para_level, classes, &Level::vec(levels), true, &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); assert_eq!( sequences.iter().map(|s| s.runs.clone()).collect::>(), @@ -321,7 +379,8 @@ mod tests { let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF, L]; let levels = &[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0]; let para_level = Level::ltr(); - let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); + let mut sequences = IsolatingRunSequenceVec::new(); + isolating_run_sequences(para_level, classes, &Level::vec(levels), false, &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); // text1 @@ -380,7 +439,8 @@ mod tests { let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI, L]; let levels = &[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0]; let para_level = Level::ltr(); - let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); + let mut sequences = IsolatingRunSequenceVec::new(); + isolating_run_sequences(para_level, classes, &Level::vec(levels), true, &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); // text1·RLI·PDI·RLI·PDI·text6 diff --git a/src/utf16.rs b/src/utf16.rs index 02f83aa..11b386f 100644 --- a/src/utf16.rs +++ b/src/utf16.rs @@ -18,7 +18,9 @@ use crate::{ compute_bidi_info_for_para, compute_initial_info, level, para_direction, reorder_levels, reorder_visual, visual_runs_for_line, }; -use crate::{BidiClass, BidiDataSource, Direction, Level, LevelRun, ParagraphInfo}; +use crate::{ + BidiClass, BidiDataSource, Direction, Level, LevelRun, ParagraphInfo, ParagraphInfoFlags, +}; #[cfg(feature = "hardcoded-data")] use crate::HardcodedBidiData; @@ -83,7 +85,7 @@ struct InitialInfoExt<'text> { /// Parallel to base.paragraphs, records whether each paragraph is "pure LTR" that /// requires no further bidi processing (i.e. there are no RTL characters or bidi /// control codes present). - pure_ltr: Vec, + flags: Vec, } impl<'text> InitialInfoExt<'text> { @@ -103,12 +105,12 @@ impl<'text> InitialInfoExt<'text> { default_para_level: Option, ) -> InitialInfoExt<'a> { let mut paragraphs = Vec::::new(); - let mut pure_ltr = Vec::::new(); - let (original_classes, _, _) = compute_initial_info( + let mut flags = Vec::::new(); + let (original_classes, _, _, _) = compute_initial_info( data_source, text, default_para_level, - Some((&mut paragraphs, &mut pure_ltr)), + Some((&mut paragraphs, &mut flags)), ); InitialInfoExt { @@ -117,7 +119,7 @@ impl<'text> InitialInfoExt<'text> { original_classes, paragraphs, }, - pure_ltr, + flags, } } } @@ -177,20 +179,21 @@ impl<'text> BidiInfo<'text> { text: &'a [u16], default_para_level: Option, ) -> BidiInfo<'a> { - let InitialInfoExt { base, pure_ltr, .. } = + let InitialInfoExt { base, flags, .. } = InitialInfoExt::new_with_data_source(data_source, text, default_para_level); let mut levels = Vec::::with_capacity(text.len()); let mut processing_classes = base.original_classes.clone(); - for (para, is_pure_ltr) in base.paragraphs.iter().zip(pure_ltr.iter()) { + for (para, flags) in base.paragraphs.iter().zip(flags.iter()) { let text = &text[para.range.clone()]; let original_classes = &base.original_classes[para.range.clone()]; compute_bidi_info_for_para( data_source, para, - *is_pure_ltr, + flags.is_pure_ltr, + flags.has_isolate_controls, text, original_classes, &mut processing_classes, @@ -411,7 +414,7 @@ impl<'text> ParagraphBidiInfo<'text> { ) -> ParagraphBidiInfo<'a> { // Here we could create a ParagraphInitialInfo struct to parallel the one // used by BidiInfo, but there doesn't seem any compelling reason for it. - let (original_classes, paragraph_level, is_pure_ltr) = + let (original_classes, paragraph_level, is_pure_ltr, has_isolate_controls) = compute_initial_info(data_source, text, default_para_level, None); let mut levels = Vec::::with_capacity(text.len()); @@ -429,6 +432,7 @@ impl<'text> ParagraphBidiInfo<'text> { data_source, ¶_info, is_pure_ltr, + has_isolate_controls, text, &original_classes, &mut processing_classes, From 77ca01f088129d7d2e22209e3216688da60130f6 Mon Sep 17 00:00:00 2001 From: Jonathan Kew Date: Fri, 1 Mar 2024 20:10:15 +0000 Subject: [PATCH 2/4] Collect level runs during explicit::compute(). We can easily accumulate the level runs as part of the initial explicit::compute() pass over the text; this avoids the need for a separate pass over the levels array at the beginning of prepare::isolating_run_sequences to collect them. --- src/explicit.rs | 24 ++++++++++++++++++++++ src/lib.rs | 5 ++++- src/prepare.rs | 53 +++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 73 insertions(+), 9 deletions(-) diff --git a/src/explicit.rs b/src/explicit.rs index 3c98bf4..3da6d96 100644 --- a/src/explicit.rs +++ b/src/explicit.rs @@ -19,12 +19,16 @@ use super::char_data::{ BidiClass::{self, *}, }; use super::level::Level; +use super::prepare::removed_by_x9; +use super::LevelRunVec; use super::TextSource; /// Compute explicit embedding levels for one paragraph of text (X1-X8). /// /// `processing_classes[i]` must contain the `BidiClass` of the char at byte index `i`, /// for each char in `text`. +/// +/// `runs` returns the list of level runs (BD7) of the text. #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn compute<'a, T: TextSource<'a> + ?Sized>( text: &'a T, @@ -32,6 +36,7 @@ pub fn compute<'a, T: TextSource<'a> + ?Sized>( original_classes: &[BidiClass], levels: &mut [Level], processing_classes: &mut [BidiClass], + runs: &mut LevelRunVec, ) { assert_eq!(text.len(), original_classes.len()); @@ -51,6 +56,9 @@ pub fn compute<'a, T: TextSource<'a> + ?Sized>( let mut overflow_embedding_count = 0u32; let mut valid_isolate_count = 0u32; + let mut current_run_level = Level::ltr(); + let mut current_run_start = 0; + for (i, len) in text.indices_lengths() { let last = stack.last().unwrap(); @@ -182,6 +190,22 @@ pub fn compute<'a, T: TextSource<'a> + ?Sized>( levels[i + j] = levels[i]; processing_classes[i + j] = processing_classes[i]; } + + // Check if we need to start a new level run. + if i == 0 { + current_run_level = levels[i]; + } else { + if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level { + // End the last run and start a new one. + runs.push(current_run_start..i); + current_run_level = levels[i]; + current_run_start = i; + } + } + } + + if levels.len() > current_run_start { + runs.push(current_run_start..levels.len()); } } diff --git a/src/lib.rs b/src/lib.rs index 8d1067c..8b99ca1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,7 +88,7 @@ mod prepare; pub use crate::char_data::{BidiClass, UNICODE_VERSION}; pub use crate::data_source::BidiDataSource; pub use crate::level::{Level, LTR_LEVEL, RTL_LEVEL}; -pub use crate::prepare::LevelRun; +pub use crate::prepare::{LevelRun, LevelRunVec}; #[cfg(feature = "hardcoded-data")] pub use crate::char_data::{bidi_class, HardcodedBidiData}; @@ -1099,6 +1099,7 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized> let processing_classes = &mut processing_classes[para.range.clone()]; let levels = &mut levels[para.range.clone()]; + let mut level_runs = LevelRunVec::new(); explicit::compute( text, @@ -1106,6 +1107,7 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized> original_classes, levels, processing_classes, + &mut level_runs, ); let mut sequences = prepare::IsolatingRunSequenceVec::new(); @@ -1113,6 +1115,7 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized> para.level, original_classes, levels, + level_runs, has_isolate_controls, &mut sequences, ); diff --git a/src/prepare.rs b/src/prepare.rs index 688ff9e..adebfde 100644 --- a/src/prepare.rs +++ b/src/prepare.rs @@ -25,6 +25,11 @@ use super::BidiClass::{self, *}; /// Represented as a range of byte indices. pub type LevelRun = Range; +#[cfg(feature = "smallvec")] +pub type LevelRunVec = SmallVec<[LevelRun; 8]>; +#[cfg(not(feature = "smallvec"))] +pub type LevelRunVec = Vec; + /// Output of `isolating_run_sequences` (steps X9-X10) #[derive(Debug, PartialEq)] pub struct IsolatingRunSequence { @@ -50,11 +55,10 @@ pub fn isolating_run_sequences( para_level: Level, original_classes: &[BidiClass], levels: &[Level], + runs: LevelRunVec, has_isolate_controls: bool, isolating_run_sequences: &mut IsolatingRunSequenceVec, ) { - let runs = level_runs(levels, original_classes); - // Per http://www.unicode.org/reports/tr9/#BD13: // "In the absence of isolate initiators, each isolating run sequence in a paragraph // consists of exactly one level run, and each level run constitutes a separate @@ -97,7 +101,7 @@ pub fn isolating_run_sequences( }; isolating_run_sequences.push(IsolatingRunSequence { - runs: vec![run], + runs: vec![run.clone()], sos: max(seq_level, pred_level).bidi_class(), eos: max(end_level, succ_level).bidi_class(), }); @@ -272,6 +276,9 @@ impl IsolatingRunSequence { /// Finds the level runs in a paragraph. /// /// +/// +/// Only used for tests; runs are identified during explicit::compute. +#[cfg(test)] fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec { assert_eq!(levels.len(), original_classes.len()); @@ -332,7 +339,13 @@ mod tests { let levels = &[0, 1, 1, 1, 1, 1, 1, 0]; let para_level = Level::ltr(); let mut sequences = IsolatingRunSequenceVec::new(); - isolating_run_sequences(para_level, classes, &Level::vec(levels), false, &mut sequences); + isolating_run_sequences( + para_level, + classes, + &Level::vec(levels), + level_runs(&Level::vec(levels), classes).into(), + false, + &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); assert_eq!( sequences.iter().map(|s| s.runs.clone()).collect::>(), @@ -346,7 +359,13 @@ mod tests { let levels = &[0, 0, 1, 0, 0, 1, 0, 0]; let para_level = Level::ltr(); let mut sequences = IsolatingRunSequenceVec::new(); - isolating_run_sequences(para_level, classes, &Level::vec(levels), true, &mut sequences); + isolating_run_sequences( + para_level, + classes, + &Level::vec(levels), + level_runs(&Level::vec(levels), classes).into(), + true, + &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); assert_eq!( sequences.iter().map(|s| s.runs.clone()).collect::>(), @@ -360,7 +379,13 @@ mod tests { let levels = &[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0]; let para_level = Level::ltr(); let mut sequences = IsolatingRunSequenceVec::new(); - isolating_run_sequences(para_level, classes, &Level::vec(levels), true, &mut sequences); + isolating_run_sequences( + para_level, + classes, + &Level::vec(levels), + level_runs(&Level::vec(levels), classes).into(), + true, + &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); assert_eq!( sequences.iter().map(|s| s.runs.clone()).collect::>(), @@ -380,7 +405,13 @@ mod tests { let levels = &[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0]; let para_level = Level::ltr(); let mut sequences = IsolatingRunSequenceVec::new(); - isolating_run_sequences(para_level, classes, &Level::vec(levels), false, &mut sequences); + isolating_run_sequences( + para_level, + classes, + &Level::vec(levels), + level_runs(&Level::vec(levels), classes).into(), + false, + &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); // text1 @@ -440,7 +471,13 @@ mod tests { let levels = &[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0]; let para_level = Level::ltr(); let mut sequences = IsolatingRunSequenceVec::new(); - isolating_run_sequences(para_level, classes, &Level::vec(levels), true, &mut sequences); + isolating_run_sequences( + para_level, + classes, + &Level::vec(levels), + level_runs(&Level::vec(levels), classes).into(), + true, + &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); // text1·RLI·PDI·RLI·PDI·text6 From 7233d8d73193089ba0a6f22ac872ba9287562574 Mon Sep 17 00:00:00 2001 From: Jonathan Kew Date: Mon, 4 Mar 2024 09:29:02 +0000 Subject: [PATCH 3/4] Remove unnecessary .clone() --- src/prepare.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prepare.rs b/src/prepare.rs index adebfde..a353a16 100644 --- a/src/prepare.rs +++ b/src/prepare.rs @@ -101,7 +101,7 @@ pub fn isolating_run_sequences( }; isolating_run_sequences.push(IsolatingRunSequence { - runs: vec![run.clone()], + runs: vec![run], sos: max(seq_level, pred_level).bidi_class(), eos: max(end_level, succ_level).bidi_class(), }); From aa06343e7927318e7e8cfffd4d5a741ec5c5cce7 Mon Sep 17 00:00:00 2001 From: Jonathan Kew Date: Mon, 4 Mar 2024 10:02:52 +0000 Subject: [PATCH 4/4] Add comments re level-run processing. --- src/explicit.rs | 9 +++++++-- src/lib.rs | 31 +++++++++++++++++++++++++++++++ src/prepare.rs | 2 +- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/src/explicit.rs b/src/explicit.rs index 3da6d96..5760ab8 100644 --- a/src/explicit.rs +++ b/src/explicit.rs @@ -23,7 +23,8 @@ use super::prepare::removed_by_x9; use super::LevelRunVec; use super::TextSource; -/// Compute explicit embedding levels for one paragraph of text (X1-X8). +/// Compute explicit embedding levels for one paragraph of text (X1-X8), and identify +/// level runs (BD7) for use when determining Isolating Run Sequences (X10). /// /// `processing_classes[i]` must contain the `BidiClass` of the char at byte index `i`, /// for each char in `text`. @@ -191,10 +192,13 @@ pub fn compute<'a, T: TextSource<'a> + ?Sized>( processing_classes[i + j] = processing_classes[i]; } - // Check if we need to start a new level run. + // Identify level runs to be passed to prepare::isolating_run_sequences(). if i == 0 { + // Initialize for the first (or only) run. current_run_level = levels[i]; } else { + // Check if we need to start a new level run. + // if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level { // End the last run and start a new one. runs.push(current_run_start..i); @@ -204,6 +208,7 @@ pub fn compute<'a, T: TextSource<'a> + ?Sized>( } } + // Append the trailing level run, if non-empty. if levels.len() > current_run_start { runs.push(current_run_start..levels.len()); } diff --git a/src/lib.rs b/src/lib.rs index 8b99ca1..4899275 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1580,6 +1580,24 @@ mod tests { #[cfg(feature = "hardcoded-data")] fn test_process_text() { let tests = vec![ + ( + // text + "", + // base level + Some(RTL_LEVEL), + // levels + Level::vec(&[]), + // original_classes + vec![], + // paragraphs + vec![], + // levels_u16 + Level::vec(&[]), + // original_classes_u16 + vec![], + // paragraphs_u16 + vec![], + ), ( // text "abc123", @@ -1741,6 +1759,19 @@ mod tests { paragraphs: t.4.clone(), } ); + // If it was empty, also test that ParagraphBidiInfo handles it safely. + if t.4.len() == 0 { + assert_eq!( + ParagraphBidiInfo::new(t.0, t.1), + ParagraphBidiInfo { + text: t.0, + original_classes: t.3.clone(), + levels: t.2.clone(), + paragraph_level: RTL_LEVEL, + is_pure_ltr: true, + } + ) + } // If it was a single paragraph, also test ParagraphBidiInfo. if t.4.len() == 1 { assert_eq!( diff --git a/src/prepare.rs b/src/prepare.rs index a353a16..f7b35ad 100644 --- a/src/prepare.rs +++ b/src/prepare.rs @@ -277,7 +277,7 @@ impl IsolatingRunSequence { /// /// /// -/// Only used for tests; runs are identified during explicit::compute. +/// This is only used by tests; normally level runs are identified during explicit::compute. #[cfg(test)] fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec { assert_eq!(levels.len(), original_classes.len());