diff --git a/examples/rust/src/main.rs b/examples/rust/src/main.rs index 38551b9..8909eb5 100644 --- a/examples/rust/src/main.rs +++ b/examples/rust/src/main.rs @@ -1,24 +1,33 @@ -use parattice::PaRattice; -use parattice::LatticeKMP; use parattice::Lattice; +use parattice::LatticeKMP; +use parattice::PaRattice; use std::fs::File; -use std::io::prelude::*; use std::io; +use std::io::prelude::*; use std::path::Path; fn main() -> Result<(), io::Error> { // initialization - let paradict - = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], - vec!["hematopoietic", "stem", "cell"]], - vec![vec!["造血", "幹", "細胞", "移植"], - vec!["hematopoietic", "stem", "cell", "transplantation"]], - vec![vec!["stem", "cell"], vec!["幹", "細胞"]], - vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], - vec!["stem", "cell", "rescue"]], - vec![vec!["rescue"], vec!["救命"]], - vec![vec!["blood"], vec!["血液"]]]; + let paradict = vec![ + vec![ + vec!["blood", "stem", "cell"], + vec!["造血", "幹", "細胞"], + vec!["hematopoietic", "stem", "cell"], + ], + vec![ + vec!["造血", "幹", "細胞", "移植"], + vec!["hematopoietic", "stem", "cell", "transplantation"], + ], + vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + vec![ + vec!["幹", "細胞", "移植"], + vec!["rescue", "transplant"], + vec!["stem", "cell", "rescue"], + ], + vec![vec!["rescue"], vec!["救命"]], + vec![vec!["blood"], vec!["血液"]], + ]; let parattice = PaRattice::new(paradict); // lattice generation diff --git a/src/externs.rs b/src/externs.rs index d3f123b..59ebbb5 100644 --- a/src/externs.rs +++ b/src/externs.rs @@ -1,40 +1,41 @@ -use crate::parattice::PaRattice; use crate::lattice::Lattice; use crate::lattice_kmp::LatticeKMP; +use crate::parattice::PaRattice; use libc::c_char; use std::ffi::CStr; use std::ffi::CString; use std::mem; -use std::ptr; use std::slice; use std::str; #[no_mangle] -pub unsafe extern fn parattice_free_string(s: *mut c_char) { +pub unsafe extern "C" fn parattice_free_string(s: *mut c_char) { if !s.is_null() { 
CString::from_raw(s); } } #[no_mangle] -pub unsafe extern fn parattice_free_bytes(bytes: *mut u8, length: usize) { +pub unsafe extern "C" fn parattice_free_bytes(bytes: *mut u8, length: usize) { mem::drop(Vec::from_raw_parts(bytes, length, length)); } #[no_mangle] -pub unsafe extern fn parattice_parattice_new<'a>(dict: *const *const *const *const c_char) -> *mut PaRattice<'a> { +pub unsafe extern "C" fn parattice_parattice_new<'a>( + dict: *const *const *const *const c_char, +) -> *mut PaRattice<'a> { let mut dict_vec = vec![]; let mut i = 0; - while *dict.offset(i) != ptr::null() { + while !(*dict.offset(i)).is_null() { /* NULL-terminated array: test the stored element, not the slot address */ let group: *const *const *const c_char = *dict.offset(i); let mut group_vec = vec![]; let mut j = 0; - while *group.offset(j) != ptr::null() { + while !(*group.offset(j)).is_null() { let phrase: *const *const c_char = *group.offset(j); let mut phrase_vec = vec![]; let mut k = 0; - while *phrase.offset(k) != ptr::null() { + while !(*phrase.offset(k)).is_null() { let word = *phrase.offset(k); let c_str = CStr::from_ptr(word); phrase_vec.push(str::from_utf8_unchecked(c_str.to_bytes())); @@ -50,28 +51,44 @@ pub unsafe extern fn parattice_parattice_new<'a>(dict: *const *const *const *con } #[no_mangle] -pub unsafe extern fn parattice_parattice_free(parattice: *mut PaRattice) { +pub unsafe extern "C" fn parattice_parattice_free(parattice: *mut PaRattice) { Box::from_raw(parattice); } #[no_mangle] -pub unsafe extern fn parattice_parattice_get_lattice(parattice: *const PaRattice, words: *const *const c_char, length: usize, shrink: bool, max_depth: usize) -> *mut Lattice { +pub unsafe extern "C" fn parattice_parattice_get_lattice( + parattice: *const PaRattice, + words: *const *const c_char, + length: usize, + shrink: bool, + max_depth: usize, +) -> *mut Lattice { let mut words_vec = Vec::with_capacity(length); for i in 0..length { - let word = *words.offset(i as isize); + let word = *words.add(i); let c_str = CStr::from_ptr(word); 
words_vec.push(str::from_utf8_unchecked(c_str.to_bytes())); } - Box::into_raw(Box::new((*parattice).get_lattice(&words_vec, shrink, max_depth))) + Box::into_raw(Box::new( + (*parattice).get_lattice(&words_vec, shrink, max_depth), + )) } #[no_mangle] -pub unsafe extern fn parattice_lattice_new_from_bytes<'a>(data: *const u8, length: usize) -> *mut Lattice<'a> { - Box::into_raw(Box::new(Lattice::new_from_bytes(slice::from_raw_parts(data, length)))) +pub unsafe extern "C" fn parattice_lattice_new_from_bytes<'a>( + data: *const u8, + length: usize, +) -> *mut Lattice<'a> { + Box::into_raw(Box::new(Lattice::new_from_bytes(slice::from_raw_parts( + data, length, + )))) } #[no_mangle] -pub unsafe extern fn parattice_lattice_to_bytes(lattice: *const Lattice, length: *mut usize) -> *mut u8 { +pub unsafe extern "C" fn parattice_lattice_to_bytes( + lattice: *const Lattice, + length: *mut usize, +) -> *mut u8 { let mut bytes = (*lattice).to_bytes(); *length = bytes.len(); let ptr = bytes.as_mut_ptr(); @@ -80,72 +97,101 @@ pub unsafe extern fn parattice_lattice_to_bytes(lattice: *const Lattice, length: } #[no_mangle] -pub unsafe extern fn parattice_lattice_free(lattice: *mut Lattice) { +pub unsafe extern "C" fn parattice_lattice_free(lattice: *mut Lattice) { Box::from_raw(lattice); } #[no_mangle] -pub unsafe extern fn parattice_lattice_get_size(lattice: *const Lattice) -> usize { +pub unsafe extern "C" fn parattice_lattice_get_size(lattice: *const Lattice) -> usize { (*lattice).lattice.len() } #[no_mangle] -pub unsafe extern fn parattice_lattice_get_required_capacity(lattice: *const Lattice) -> usize { +pub unsafe extern "C" fn parattice_lattice_get_required_capacity(lattice: *const Lattice) -> usize { (*lattice).capacity } #[no_mangle] -pub unsafe extern fn parattice_lattice_dump_dot(lattice: *const Lattice, is_numbered: bool) -> *mut c_char { +pub unsafe extern "C" fn parattice_lattice_dump_dot( + lattice: *const Lattice, + is_numbered: bool, +) -> *mut c_char { let s = 
(*lattice).dump_dot(is_numbered); let c_string = CString::new(s).unwrap(); c_string.into_raw() } #[no_mangle] -pub unsafe extern fn parattice_lattice_get_trunk_span<'a>(lattice: *const Lattice, edge_labels: *const *const c_char, node_ids: *const usize, length: usize, new_edge_labels: *mut *const u8, new_edge_label_length: *mut usize, new_node_ids: *mut usize) -> usize { +pub unsafe extern "C" fn parattice_lattice_get_trunk_span( + lattice: *const Lattice, + edge_labels: *const *const c_char, + node_ids: *const usize, + length: usize, + new_edge_labels: *mut *const u8, + new_edge_label_length: *mut usize, + new_node_ids: *mut usize, +) -> usize { let mut path = Vec::with_capacity(length); for i in 0..length { - let word = *edge_labels.offset(i as isize); + let word = *edge_labels.add(i); let c_str = CStr::from_ptr(word); - path.push((str::from_utf8_unchecked(c_str.to_bytes()), *node_ids.offset(i as isize))); + path.push(( + str::from_utf8_unchecked(c_str.to_bytes()), + *node_ids.add(i), + )); } let trunk_span = (*lattice).get_trunk_span(path); - for i in 0..trunk_span.len() { - *new_edge_labels.offset(i as isize) = trunk_span[i].0.as_ptr(); - *new_edge_label_length.offset(i as isize) = trunk_span[i].0.len(); - *new_node_ids.offset(i as isize) = trunk_span[i].1; + for (i, span) in trunk_span.iter().enumerate() { + *new_edge_labels.add(i) = span.0.as_ptr(); + *new_edge_label_length.add(i) = span.0.len(); + *new_node_ids.add(i) = span.1; } trunk_span.len() } #[no_mangle] -pub unsafe extern fn parattice_lattice_get_trunk_spans(lattice: *const Lattice, trunk_lefts: *mut usize, trunk_rights: *mut usize) { +pub unsafe extern "C" fn parattice_lattice_get_trunk_spans( + lattice: *const Lattice, + trunk_lefts: *mut usize, + trunk_rights: *mut usize, +) { let trunk_spans = (*lattice).get_trunk_spans(); for (i, (trunk_left, trunk_right)) in trunk_spans.into_iter().enumerate() { - *trunk_lefts.offset(i as isize) = trunk_left; - *trunk_rights.offset(i as isize) = trunk_right; + 
*trunk_lefts.add(i) = trunk_left; + *trunk_rights.add(i) = trunk_right; } } #[no_mangle] -pub unsafe extern fn parattice_lattice_dump_for_search_index(lattice: *const Lattice, texts: *mut *const u8, text_lengths: *mut usize, offset_starts: *mut usize, offset_ends: *mut usize, increments: *mut usize, lengths: *mut usize) -> usize { +pub unsafe extern "C" fn parattice_lattice_dump_for_search_index( + lattice: *const Lattice, + texts: *mut *const u8, + text_lengths: *mut usize, + offset_starts: *mut usize, + offset_ends: *mut usize, + increments: *mut usize, + lengths: *mut usize, +) -> usize { let search_index_nodes = (*lattice).dump_for_search_index(); - for i in 0..search_index_nodes.len() { - *texts.offset(i as isize) = search_index_nodes[i].text.as_ptr(); - *text_lengths.offset(i as isize) = search_index_nodes[i].text.len(); - *offset_starts.offset(i as isize) = search_index_nodes[i].offset.0; - *offset_ends.offset(i as isize) = search_index_nodes[i].offset.1; - *increments.offset(i as isize) = search_index_nodes[i].increment; - *lengths.offset(i as isize) = search_index_nodes[i].length; + for (i, node) in search_index_nodes.iter().enumerate() { + *texts.add(i) = node.text.as_ptr(); + *text_lengths.add(i) = node.text.len(); + *offset_starts.add(i) = node.offset.0; + *offset_ends.add(i) = node.offset.1; + *increments.add(i) = node.increment; + *lengths.add(i) = node.length; } search_index_nodes.len() } #[no_mangle] -pub unsafe extern fn parattice_lattice_kmp_new<'a>(pattern: *const *const c_char, length: usize) -> *mut LatticeKMP<'a> { +pub unsafe extern "C" fn parattice_lattice_kmp_new<'a>( + pattern: *const *const c_char, + length: usize, +) -> *mut LatticeKMP<'a> { let mut pattern_vec = Vec::with_capacity(length); for i in 0..length { - let word = *pattern.offset(i as isize); + let word = *pattern.add(i); let c_str = CStr::from_ptr(word); pattern_vec.push(str::from_utf8_unchecked(c_str.to_bytes())); } @@ -153,41 +199,60 @@ pub unsafe extern fn 
parattice_lattice_kmp_new<'a>(pattern: *const *const c_char } #[no_mangle] -pub unsafe extern fn parattice_lattice_kmp_free(latticekmp: *mut LatticeKMP) { +pub unsafe extern "C" fn parattice_lattice_kmp_free(latticekmp: *mut LatticeKMP) { Box::from_raw(latticekmp); } #[no_mangle] -pub unsafe extern fn parattice_lattice_kmp_search<'a>(latticekmp: *const LatticeKMP<'a>, lattice: *const Lattice<'a>) -> *mut Vec> { +pub unsafe extern "C" fn parattice_lattice_kmp_search<'a>( + latticekmp: *const LatticeKMP<'a>, + lattice: *const Lattice<'a>, +) -> *mut Vec> { Box::into_raw(Box::new((*latticekmp).search(&(*lattice)))) } #[no_mangle] -pub unsafe extern fn parattice_lattice_kmp_free_result<'a>(results: *mut Vec>) { +pub unsafe extern "C" fn parattice_lattice_kmp_free_result<'a>( + results: *mut Vec>, +) { Box::from_raw(results); } #[no_mangle] -pub unsafe extern fn parattice_lattice_kmp_results_size<'a>(results: *const Vec>) -> usize { +pub unsafe extern "C" fn parattice_lattice_kmp_results_size<'a>( + results: *const Vec>, +) -> usize { (*results).len() } #[no_mangle] -pub unsafe extern fn parattice_lattice_kmp_result_length<'a>(results: *const Vec>, index: usize) -> usize { +pub unsafe extern "C" fn parattice_lattice_kmp_result_length<'a>( + results: *const Vec>, + index: usize, +) -> usize { (*results)[index].len() } #[no_mangle] -pub unsafe extern fn parattice_lattice_kmp_result_nodes<'a>(results: *const Vec>, index: usize, nodes: *mut usize) { +pub unsafe extern "C" fn parattice_lattice_kmp_result_nodes<'a>( + results: *const Vec>, + index: usize, + nodes: *mut usize, +) { for i in 0..(*results)[index].len() { - *nodes.offset(i as isize) = (*results)[index][i].1; + *nodes.add(i) = (*results)[index][i].1; } } #[no_mangle] -pub unsafe extern fn parattice_lattice_kmp_result_edge_labels<'a>(results: *const Vec>, index: usize, edge_labels: *mut *const u8, edge_label_length: *mut usize) { +pub unsafe extern "C" fn parattice_lattice_kmp_result_edge_labels<'a>( + results: 
*const Vec>, + index: usize, + edge_labels: *mut *const u8, + edge_label_length: *mut usize, +) { for i in 0..(*results)[index].len() { - *edge_labels.offset(i as isize) = (*results)[index][i].0.as_ptr(); - *edge_label_length.offset(i as isize) = (*results)[index][i].0.len(); + *edge_labels.add(i) = (*results)[index][i].0.as_ptr(); + *edge_label_length.add(i) = (*results)[index][i].0.len(); } } diff --git a/src/lattice.rs b/src/lattice.rs index d164e9a..cbf4989 100644 --- a/src/lattice.rs +++ b/src/lattice.rs @@ -15,7 +15,10 @@ pub struct LatticeNode<'a> { impl<'a> LatticeNode<'a> { pub fn new>, T2: Into>>( - forward_main: T1, backward_main: T2, depth: usize) -> LatticeNode<'a> { + forward_main: T1, + backward_main: T2, + depth: usize, + ) -> LatticeNode<'a> { let mut forwards = BTreeSet::new(); let mut backwards = BTreeSet::new(); let forward_main = forward_main.into(); @@ -27,11 +30,11 @@ impl<'a> LatticeNode<'a> { backwards.insert(x); } LatticeNode { - forwards: forwards, - backwards: backwards, - forward_main: forward_main, - backward_main: backward_main, - depth: depth, + forwards, + backwards, + forward_main, + backward_main, + depth, } } @@ -72,14 +75,14 @@ fn usize_to_vec(x: usize) -> Vec { } fn vec_to_usize(x: &[u8]) -> usize { - x[0] as usize | - (x[1] as usize) << 8 | - (x[2] as usize) << 16 | - (x[3] as usize) << 24 | - (x[4] as usize) << 32 | - (x[5] as usize) << 40 | - (x[6] as usize) << 48 | - (x[7] as usize) << 56 + x[0] as usize + | (x[1] as usize) << 8 + | (x[2] as usize) << 16 + | (x[3] as usize) << 24 + | (x[4] as usize) << 32 + | (x[5] as usize) << 40 + | (x[6] as usize) << 48 + | (x[7] as usize) << 56 } impl<'a> Lattice<'a> { @@ -92,6 +95,32 @@ impl<'a> Lattice<'a> { /// # Example /// /// ``` + /// use parattice::PaRattice; + /// use parattice::Lattice; + /// + /// let paradict = vec![ + /// vec![ + /// vec!["blood", "stem", "cell"], + /// vec!["造血", "幹", "細胞"], + /// vec!["hematopoietic", "stem", "cell"], + /// ], + /// vec![ + /// vec!["造血", 
"幹", "細胞", "移植"], + /// vec!["hematopoietic", "stem", "cell", "transplantation"], + /// ], + /// vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + /// vec![ + /// vec!["幹", "細胞", "移植"], + /// vec!["rescue", "transplant"], + /// vec!["stem", "cell", "rescue"], + /// ], + /// vec![vec!["rescue"], vec!["救命"]], + /// vec![vec!["blood"], vec!["血液"]], + /// ]; + /// let parattice = PaRattice::new(paradict); + /// let words = vec!["造血", "幹", "細胞", "移植"]; + /// let lattice = parattice.get_lattice(&words, true, 2); + /// /// let bytes = lattice.to_bytes(); /// let new_lattice = Lattice::new_from_bytes(&bytes); /// ``` @@ -99,60 +128,65 @@ impl<'a> Lattice<'a> { let mut lattice = Vec::with_capacity(vec_to_usize(&data[0..8])); let mut offset = 8; while offset < data.len() { - let num_forwards = vec_to_usize(&data[offset..offset+8]); - let num_backwards = vec_to_usize(&data[offset+8..offset+16]); + let num_forwards = vec_to_usize(&data[offset..offset + 8]); + let num_backwards = vec_to_usize(&data[offset + 8..offset + 16]); offset += 16; let mut forwards = BTreeSet::new(); - let forward_main = - if num_forwards != 0 { - let forward_main_num_chars = vec_to_usize(&data[offset..offset+8]); - let forward_main_edge_string = str::from_utf8( - &data[offset+8..offset+8+forward_main_num_chars]).unwrap(); - let forward_main_edge_target = vec_to_usize( - &data[offset+8+forward_main_num_chars..offset+16+forward_main_num_chars]); - offset += 16 + forward_main_num_chars; - forwards.insert((forward_main_edge_string, forward_main_edge_target)); - for _ in 1..num_forwards { - let forward_num_chars = vec_to_usize(&data[offset..offset+8]); - let forward_edge_string = str::from_utf8( - &data[offset+8..offset+8+forward_num_chars]).unwrap(); - let forward_edge_target = vec_to_usize( - &data[offset+8+forward_num_chars..offset+16+forward_num_chars]); - offset += 16 + forward_num_chars; - forwards.insert((forward_edge_string, forward_edge_target)); - } - Some((forward_main_edge_string, 
forward_main_edge_target)) - } else { - None - }; + let forward_main = if num_forwards != 0 { + let forward_main_num_chars = vec_to_usize(&data[offset..offset + 8]); + let forward_main_edge_string = + str::from_utf8(&data[offset + 8..offset + 8 + forward_main_num_chars]).unwrap(); + let forward_main_edge_target = vec_to_usize( + &data + [offset + 8 + forward_main_num_chars..offset + 16 + forward_main_num_chars], + ); + offset += 16 + forward_main_num_chars; + forwards.insert((forward_main_edge_string, forward_main_edge_target)); + for _ in 1..num_forwards { + let forward_num_chars = vec_to_usize(&data[offset..offset + 8]); + let forward_edge_string = + str::from_utf8(&data[offset + 8..offset + 8 + forward_num_chars]).unwrap(); + let forward_edge_target = vec_to_usize( + &data[offset + 8 + forward_num_chars..offset + 16 + forward_num_chars], + ); + offset += 16 + forward_num_chars; + forwards.insert((forward_edge_string, forward_edge_target)); + } + Some((forward_main_edge_string, forward_main_edge_target)) + } else { + None + }; let mut backwards = BTreeSet::new(); - let backward_main = - if num_backwards != 0 { - let backward_main_num_chars = vec_to_usize(&data[offset..offset+8]); - let backward_main_edge_string = str::from_utf8( - &data[offset+8..offset+8+backward_main_num_chars]).unwrap(); - let backward_main_edge_target = vec_to_usize( - &data[offset+8+backward_main_num_chars..offset+16+backward_main_num_chars]); - offset += 16 + backward_main_num_chars; - backwards.insert((backward_main_edge_string, backward_main_edge_target)); - for _ in 1..num_backwards { - let backward_num_chars = vec_to_usize(&data[offset..offset+8]); - let backward_edge_string = str::from_utf8( - &data[offset+8..offset+8+backward_num_chars]).unwrap(); - let backward_edge_target = vec_to_usize( - &data[offset+8+backward_num_chars..offset+16+backward_num_chars]); - offset += 16 + backward_num_chars; - backwards.insert((backward_edge_string, backward_edge_target)); - } - 
Some((backward_main_edge_string, backward_main_edge_target)) - } else { - None - }; + let backward_main = if num_backwards != 0 { + let backward_main_num_chars = vec_to_usize(&data[offset..offset + 8]); + let backward_main_edge_string = + str::from_utf8(&data[offset + 8..offset + 8 + backward_main_num_chars]) + .unwrap(); + let backward_main_edge_target = vec_to_usize( + &data[offset + 8 + backward_main_num_chars + ..offset + 16 + backward_main_num_chars], + ); + offset += 16 + backward_main_num_chars; + backwards.insert((backward_main_edge_string, backward_main_edge_target)); + for _ in 1..num_backwards { + let backward_num_chars = vec_to_usize(&data[offset..offset + 8]); + let backward_edge_string = + str::from_utf8(&data[offset + 8..offset + 8 + backward_num_chars]).unwrap(); + let backward_edge_target = vec_to_usize( + &data[offset + 8 + backward_num_chars..offset + 16 + backward_num_chars], + ); + offset += 16 + backward_num_chars; + backwards.insert((backward_edge_string, backward_edge_target)); + } + Some((backward_main_edge_string, backward_main_edge_target)) + } else { + None + }; lattice.push(LatticeNode { - forwards: forwards, - backwards: backwards, - forward_main: forward_main, - backward_main: backward_main, + forwards, + backwards, + forward_main, + backward_main, depth: 0, }); } @@ -166,9 +200,9 @@ impl<'a> Lattice<'a> { trunk.insert(node_id, orig_node_id); } Lattice { - trunk: trunk, + trunk, capacity: lattice.iter().fold(0, |sum, x| sum + x.forwards.len()), - lattice: lattice, + lattice, } } @@ -177,6 +211,32 @@ impl<'a> Lattice<'a> { /// # Example /// /// ``` + /// use parattice::PaRattice; + /// use parattice::Lattice; + /// + /// let paradict = vec![ + /// vec![ + /// vec!["blood", "stem", "cell"], + /// vec!["造血", "幹", "細胞"], + /// vec!["hematopoietic", "stem", "cell"], + /// ], + /// vec![ + /// vec!["造血", "幹", "細胞", "移植"], + /// vec!["hematopoietic", "stem", "cell", "transplantation"], + /// ], + /// vec![vec!["stem", "cell"], vec!["幹", 
"細胞"]], + /// vec![ + /// vec!["幹", "細胞", "移植"], + /// vec!["rescue", "transplant"], + /// vec!["stem", "cell", "rescue"], + /// ], + /// vec![vec!["rescue"], vec!["救命"]], + /// vec![vec!["blood"], vec!["血液"]], + /// ]; + /// let parattice = PaRattice::new(paradict); + /// let words = vec!["造血", "幹", "細胞", "移植"]; + /// let lattice = parattice.get_lattice(&words, true, 2); + /// /// let bytes = lattice.to_bytes(); /// let new_lattice = Lattice::new_from_bytes(&bytes); /// ``` @@ -223,6 +283,32 @@ impl<'a> Lattice<'a> { /// # Example /// /// ``` + /// use parattice::PaRattice; + /// use parattice::Lattice; + /// + /// let paradict = vec![ + /// vec![ + /// vec!["blood", "stem", "cell"], + /// vec!["造血", "幹", "細胞"], + /// vec!["hematopoietic", "stem", "cell"], + /// ], + /// vec![ + /// vec!["造血", "幹", "細胞", "移植"], + /// vec!["hematopoietic", "stem", "cell", "transplantation"], + /// ], + /// vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + /// vec![ + /// vec!["幹", "細胞", "移植"], + /// vec!["rescue", "transplant"], + /// vec!["stem", "cell", "rescue"], + /// ], + /// vec![vec!["rescue"], vec!["救命"]], + /// vec![vec!["blood"], vec!["血液"]], + /// ]; + /// let parattice = PaRattice::new(paradict); + /// let words = vec!["造血", "幹", "細胞", "移植"]; + /// let lattice = parattice.get_lattice(&words, true, 2); + /// /// let dot = lattice.dump_dot(true); /// println!("{}", dot); /// ``` @@ -230,19 +316,39 @@ impl<'a> Lattice<'a> { let mut result = "digraph { graph [rankdir=LR];\n".to_string(); for (i, node) in self.lattice.iter().enumerate() { if is_numbered { - result = result + &format!("\"{}\" [label=\"{}\",shape=plaintext,width=\"0.1\"];\n", i, i); + result = result + + &format!( + "\"{}\" [label=\"{}\",shape=plaintext,width=\"0.1\"];\n", + i, i + ); } else { result = result + &format!("\"{}\" [label=\"\",shape=circle,width=\"0.1\"];\n", i); } for (j, &edge) in node.forwards.iter().enumerate() { - result = result + &format!("\"{}-{}-{}\" [label=\"{}\",shape=box];\n", i, j, 
edge.1, edge.0); + result = result + + &format!( + "\"{}-{}-{}\" [label=\"{}\",shape=box];\n", + i, j, edge.1, edge.0 + ); if edge == node.forward_main.unwrap() { - result = result + &format!("\"{}\" -> \"{}-{}-{}\" [arrowhead=none,color=\"#ff0000\"];\n", i, i, j, edge.1); + result = result + + &format!( + "\"{}\" -> \"{}-{}-{}\" [arrowhead=none,color=\"#ff0000\"];\n", + i, i, j, edge.1 + ); } else { - result = result + &format!("\"{}\" -> \"{}-{}-{}\" [arrowhead=none];\n", i, i, j, edge.1); + result = result + + &format!( + "\"{}\" -> \"{}-{}-{}\" [arrowhead=none];\n", + i, i, j, edge.1 + ); } if self.lattice[edge.1].backward_main == Some((edge.0, i)) { - result = result + &format!("\"{}-{}-{}\" -> \"{}\" [color=\"#0000ff\"];\n", i, j, edge.1, edge.1); + result = result + + &format!( + "\"{}-{}-{}\" -> \"{}\" [color=\"#0000ff\"];\n", + i, j, edge.1, edge.1 + ); } else { result = result + &format!("\"{}-{}-{}\" -> \"{}\";\n", i, j, edge.1, edge.1); } @@ -261,14 +367,14 @@ impl<'a> Lattice<'a> { let mut new_path: VecDeque<(&str, usize)> = path.into_iter().collect(); let mut edge_bw = new_path.pop_front().unwrap(); while !self.trunk.contains_key(&edge_bw.1) { - let next_edge = self.lattice[edge_bw.1].backward_main.unwrap().clone(); + let next_edge = self.lattice[edge_bw.1].backward_main.unwrap(); new_path.push_front((next_edge.0, edge_bw.1)); edge_bw = next_edge; } new_path.push_front(("", edge_bw.1)); - let mut edge_fw = new_path.back().unwrap().clone(); + let mut edge_fw = *new_path.back().unwrap(); while !self.trunk.contains_key(&edge_fw.1) { - edge_fw = self.lattice[edge_fw.1].forward_main.unwrap().clone(); + edge_fw = self.lattice[edge_fw.1].forward_main.unwrap(); new_path.push_back(edge_fw); } new_path.into_iter().collect() @@ -278,23 +384,25 @@ impl<'a> Lattice<'a> { pub fn get_trunk_spans(&self) -> Vec<(usize, usize)> { let mut left_trunks = vec![0; self.lattice.len()]; let mut right_trunks = vec![self.lattice.len() - 1; self.lattice.len()]; - for (&node_id, 
_) in &self.trunk { + for &node_id in self.trunk.keys() { left_trunks[node_id] = node_id; right_trunks[node_id] = node_id; } - for node_id in 1..self.lattice.len()-1 { + for node_id in 1..self.lattice.len() - 1 { for edge in &self.lattice[node_id].forwards { if left_trunks[edge.1] == 0 - && self.lattice[edge.1].backward_main.unwrap().1 == node_id { - left_trunks[edge.1] = left_trunks[node_id]; + && self.lattice[edge.1].backward_main.unwrap().1 == node_id + { + left_trunks[edge.1] = left_trunks[node_id]; } } } - for node_id in (1..self.lattice.len()-1).rev() { + for node_id in (1..self.lattice.len() - 1).rev() { for edge in &self.lattice[node_id].backwards { if right_trunks[edge.1] == self.lattice.len() - 1 - && self.lattice[edge.1].forward_main.unwrap().1 == node_id { - right_trunks[edge.1] = right_trunks[node_id]; + && self.lattice[edge.1].forward_main.unwrap().1 == node_id + { + right_trunks[edge.1] = right_trunks[node_id]; } } } @@ -309,15 +417,14 @@ impl<'a> Lattice<'a> { pub fn dump_for_search_index(&self) -> Vec { let trunk_spans = self.get_trunk_spans(); let mut result = Vec::with_capacity(self.capacity); - for i in 0..self.lattice.len()-1 { + for i in 0..self.lattice.len() - 1 { for (j, edge) in self.lattice[i].forwards.iter().enumerate() { - result.push( - SearchIndexNode { - text: edge.0, - offset: (trunk_spans[i].0, trunk_spans[edge.1].1), - increment: if j == 0 { 1 } else { 0 }, - length: edge.1 - i, - }); + result.push(SearchIndexNode { + text: edge.0, + offset: (trunk_spans[i].0, trunk_spans[edge.1].1), + increment: if j == 0 { 1 } else { 0 }, + length: edge.1 - i, + }); } } result diff --git a/src/lattice_kmp.rs b/src/lattice_kmp.rs index 6aea480..6fbf8ab 100644 --- a/src/lattice_kmp.rs +++ b/src/lattice_kmp.rs @@ -18,6 +18,8 @@ impl<'a> LatticeKMP<'a> { /// # Example /// /// ``` + /// use parattice::LatticeKMP; + /// /// let pattern = vec!["幹", "細胞"]; /// let kmp = LatticeKMP::new(pattern); /// ``` @@ -32,8 +34,8 @@ impl<'a> LatticeKMP<'a> { 
cpattern.push(j + if pattern[j] == pattern[i] { 1 } else { 0 }); } LatticeKMP { - pattern: pattern, - cpattern: cpattern, + pattern, + cpattern, } } @@ -46,6 +48,36 @@ impl<'a> LatticeKMP<'a> { /// # Example /// /// ``` + /// use parattice::PaRattice; + /// use parattice::Lattice; + /// use parattice::LatticeKMP; + /// + /// let pattern = vec!["幹", "細胞"]; + /// let kmp = LatticeKMP::new(pattern); + /// + /// let paradict = vec![ + /// vec![ + /// vec!["blood", "stem", "cell"], + /// vec!["造血", "幹", "細胞"], + /// vec!["hematopoietic", "stem", "cell"], + /// ], + /// vec![ + /// vec!["造血", "幹", "細胞", "移植"], + /// vec!["hematopoietic", "stem", "cell", "transplantation"], + /// ], + /// vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + /// vec![ + /// vec!["幹", "細胞", "移植"], + /// vec!["rescue", "transplant"], + /// vec!["stem", "cell", "rescue"], + /// ], + /// vec![vec!["rescue"], vec!["救命"]], + /// vec![vec!["blood"], vec!["血液"]], + /// ]; + /// let parattice = PaRattice::new(paradict); + /// let words = vec!["造血", "幹", "細胞", "移植"]; + /// let lattice = parattice.get_lattice(&words, true, 2); + /// /// let results = kmp.search(&lattice); /// ``` pub fn search(&self, lattice: &'a Lattice) -> Vec> { @@ -70,11 +102,11 @@ impl<'a> LatticeKMP<'a> { j += 1; } let mut new_candidate = VecDeque::new(); - new_candidate.push_back(edge.clone()); + new_candidate.push_back(*edge); let mut k = candidate.len(); while new_candidate.len() < j { k -= 1; - new_candidate.push_front(candidate[k].clone()); + new_candidate.push_front(candidate[k]); } new_candidate.push_front(("", candidate[k - 1].1)); if j == self.pattern.len() { diff --git a/src/lib.rs b/src/lib.rs index fddb203..6b4c6a0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,7 @@ pub mod utils; pub mod externs; -pub use crate::lattice_kmp::LatticeKMP; pub use crate::lattice::Lattice; pub use crate::lattice::SearchIndexNode; +pub use crate::lattice_kmp::LatticeKMP; pub use crate::parattice::PaRattice; diff --git a/src/parattice.rs 
b/src/parattice.rs index 12fb587..373f992 100644 --- a/src/parattice.rs +++ b/src/parattice.rs @@ -1,3 +1,4 @@ +use std::cmp; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::collections::HashMap; @@ -5,7 +6,6 @@ use std::collections::HashSet; use std::collections::VecDeque; use std::mem; use std::usize; -use std::cmp; use crate::lattice::Lattice; use crate::lattice::LatticeNode; @@ -34,16 +34,26 @@ impl<'a> PaRattice<'a> { /// /// ``` /// use parattice::PaRattice; - /// let paradict - /// = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], - /// vec!["hematopoietic", "stem", "cell"]], - /// vec![vec!["造血", "幹", "細胞", "移植"], - /// vec!["hematopoietic", "stem", "cell", "transplantation"]], - /// vec![vec!["stem", "cell"], vec!["幹", "細胞"]], - /// vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], - /// vec!["stem", "cell", "rescue"]], - /// vec![vec!["rescue"], vec!["救命"]], - /// vec![vec!["blood"], vec!["血液"]]]; + /// + /// let paradict = vec![ + /// vec![ + /// vec!["blood", "stem", "cell"], + /// vec!["造血", "幹", "細胞"], + /// vec!["hematopoietic", "stem", "cell"], + /// ], + /// vec![ + /// vec!["造血", "幹", "細胞", "移植"], + /// vec!["hematopoietic", "stem", "cell", "transplantation"], + /// ], + /// vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + /// vec![ + /// vec!["幹", "細胞", "移植"], + /// vec!["rescue", "transplant"], + /// vec!["stem", "cell", "rescue"], + /// ], + /// vec![vec!["rescue"], vec!["救命"]], + /// vec![vec!["blood"], vec!["血液"]], + /// ]; /// let parattice = PaRattice::new(paradict); /// ``` pub fn new(dict: Vec>>) -> PaRattice<'a> { @@ -100,13 +110,18 @@ impl<'a> PaRattice<'a> { } } PaRattice { - pma: pma, - phrases: phrases, + pma, + phrases, dict: ids, } } - fn backward_match(phrase: &Vec<&str>, lattice: &Vec, pos: usize, max_depth: usize) -> Vec<(usize, usize)> { + fn backward_match( + phrase: &[&str], + lattice: &[LatticeNode], + pos: usize, + max_depth: usize, + ) -> Vec<(usize, usize)> { let mut result = 
vec![]; let mut backward_queue = VecDeque::new(); if lattice[pos].depth < max_depth { @@ -117,8 +132,13 @@ impl<'a> PaRattice<'a> { result.push((lattice_node_id, depth)); } else { for &(edge_str, edge_target) in &lattice[lattice_node_id].backwards { - if edge_str == phrase[phrase_pos - 1] && lattice[edge_target].depth < max_depth { - backward_queue.push_back((phrase_pos - 1, edge_target, cmp::max(depth, lattice[edge_target].depth))); + if edge_str == phrase[phrase_pos - 1] && lattice[edge_target].depth < max_depth + { + backward_queue.push_back(( + phrase_pos - 1, + edge_target, + cmp::max(depth, lattice[edge_target].depth), + )); } } } @@ -126,7 +146,7 @@ impl<'a> PaRattice<'a> { result } - fn next_pma_state_id(pma: &Vec, state_id: usize, edge_str: &str) -> usize { + fn next_pma_state_id(pma: &[PMANode], state_id: usize, edge_str: &str) -> usize { let mut next_state_id = state_id; loop { if let Some(&x) = pma[next_state_id].edges.get(edge_str) { @@ -139,44 +159,79 @@ impl<'a> PaRattice<'a> { } } - fn insert_branch(lattice: &mut Vec>, state_id_cache: &mut Vec>, phrase: &Vec<&'a str>, start_node_id: usize, end_node_id: usize, depth: usize) -> usize { + fn insert_branch( + lattice: &mut Vec>, + state_id_cache: &mut Vec>, + phrase: &[&'a str], + start_node_id: usize, + end_node_id: usize, + depth: usize, + ) -> usize { let new_node_id = lattice.len(); - assert!(lattice.len() >= 1); + assert!(!lattice.is_empty()); match phrase.len() { 1 => { lattice[start_node_id].insert_forward(&phrase[0], end_node_id); lattice[end_node_id].insert_backward(&phrase[0], start_node_id); end_node_id - }, + } 2 => { lattice[start_node_id].insert_forward(&phrase[0], new_node_id); - lattice.push(LatticeNode::new((phrase[1].clone(), end_node_id), (phrase[0].clone(), start_node_id), depth)); + lattice.push(LatticeNode::new( + (phrase[1], end_node_id), + (phrase[0], start_node_id), + depth, + )); state_id_cache.push(BTreeSet::new()); lattice[end_node_id].insert_backward(&phrase[1], 
new_node_id); new_node_id - }, + } 3 => { lattice[start_node_id].insert_forward(&phrase[0], new_node_id); - lattice.push(LatticeNode::new((phrase[1].clone(), new_node_id + 1), (phrase[0].clone(), start_node_id), depth)); + lattice.push(LatticeNode::new( + (phrase[1], new_node_id + 1), + (phrase[0], start_node_id), + depth, + )); state_id_cache.push(BTreeSet::new()); - lattice.push(LatticeNode::new((phrase[2].clone(), end_node_id), (phrase[1].clone(), new_node_id), depth)); + lattice.push(LatticeNode::new( + (phrase[2], end_node_id), + (phrase[1], new_node_id), + depth, + )); state_id_cache.push(BTreeSet::new()); lattice[end_node_id].insert_backward(&phrase[2], new_node_id + 1); new_node_id - }, + } _ => { lattice[start_node_id].insert_forward(&phrase[0], new_node_id); - lattice.push(LatticeNode::new((phrase[1].clone(), new_node_id + 1), (phrase[0].clone(), start_node_id), depth)); + lattice.push(LatticeNode::new( + (phrase[1], new_node_id + 1), + (phrase[0], start_node_id), + depth, + )); state_id_cache.push(BTreeSet::new()); for i in 0..phrase.len() - 3 { - lattice.push(LatticeNode::new((phrase[i + 2].clone(), new_node_id + i + 2), (phrase[i + 1].clone(), new_node_id + i), depth)); + lattice.push(LatticeNode::new( + (phrase[i + 2], new_node_id + i + 2), + (phrase[i + 1], new_node_id + i), + depth, + )); state_id_cache.push(BTreeSet::new()); } - lattice.push(LatticeNode::new((phrase[phrase.len() - 1].clone(), end_node_id), (phrase[phrase.len() - 2].clone(), new_node_id + phrase.len() - 3), depth)); + lattice.push(LatticeNode::new( + (phrase[phrase.len() - 1], end_node_id), + ( + phrase[phrase.len() - 2], + new_node_id + phrase.len() - 3, + ), + depth, + )); state_id_cache.push(BTreeSet::new()); - lattice[end_node_id].insert_backward(&phrase[phrase.len() - 1], new_node_id + phrase.len() - 2); + lattice[end_node_id] + .insert_backward(&phrase[phrase.len() - 1], new_node_id + phrase.len() - 2); new_node_id - }, + } } } @@ -191,10 +246,33 @@ impl<'a> PaRattice<'a> { 
/// # Example /// /// ``` + /// use parattice::PaRattice; + /// + /// let paradict = vec![ + /// vec![ + /// vec!["blood", "stem", "cell"], + /// vec!["造血", "幹", "細胞"], + /// vec!["hematopoietic", "stem", "cell"], + /// ], + /// vec![ + /// vec!["造血", "幹", "細胞", "移植"], + /// vec!["hematopoietic", "stem", "cell", "transplantation"], + /// ], + /// vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + /// vec![ + /// vec!["幹", "細胞", "移植"], + /// vec!["rescue", "transplant"], + /// vec!["stem", "cell", "rescue"], + /// ], + /// vec![vec!["rescue"], vec!["救命"]], + /// vec![vec!["blood"], vec!["血液"]], + /// ]; + /// let parattice = PaRattice::new(paradict); + /// /// let words = vec!["造血", "幹", "細胞", "移植"]; /// let lattice = parattice.get_lattice(&words, true, 2); /// ``` - pub fn get_lattice(&self, words: &Vec<&'a str>, shrink: bool, max_depth: usize) -> Lattice<'a> { + pub fn get_lattice(&self, words: &[&'a str], shrink: bool, max_depth: usize) -> Lattice<'a> { let mut inserted_branches = HashSet::new(); // generate initial lattice let mut lattice = vec![]; @@ -203,13 +281,21 @@ impl<'a> PaRattice<'a> { lattice.push(LatticeNode::new(None, None, 0)); state_id_cache.push(BTreeSet::new()); } else { - lattice.push(LatticeNode::new((words[0].clone(), 1), None, 0)); + lattice.push(LatticeNode::new((words[0], 1), None, 0)); state_id_cache.push(BTreeSet::new()); for node_id in 1..words.len() { - lattice.push(LatticeNode::new((words[node_id].clone(), node_id + 1), (words[node_id - 1].clone(), node_id - 1), 0)); + lattice.push(LatticeNode::new( + (words[node_id], node_id + 1), + (words[node_id - 1], node_id - 1), + 0, + )); state_id_cache.push(BTreeSet::new()); } - lattice.push(LatticeNode::new(None, (words[words.len() - 1].clone(), words.len() - 1), 0)); + lattice.push(LatticeNode::new( + None, + (words[words.len() - 1], words.len() - 1), + 0, + )); state_id_cache.push(BTreeSet::new()); } // search phrases @@ -217,8 +303,10 @@ impl<'a> PaRattice<'a> { queue.push_back((0, 0)); 
state_id_cache[0].insert(0); while let Some((lattice_node_id, pma_state_id)) = queue.pop_front() { - for (lattice_edge_str, lattice_egde_target) in lattice[lattice_node_id].forwards.iter() { - let pma_state_id_new = Self::next_pma_state_id(&self.pma, pma_state_id, lattice_edge_str); + for (lattice_edge_str, lattice_egde_target) in lattice[lattice_node_id].forwards.iter() + { + let pma_state_id_new = + Self::next_pma_state_id(&self.pma, pma_state_id, lattice_edge_str); if !state_id_cache[*lattice_egde_target].contains(&pma_state_id_new) { // queue next node queue.push_back((*lattice_egde_target, pma_state_id_new)); @@ -228,7 +316,9 @@ impl<'a> PaRattice<'a> { for &phrase_id in &self.pma[pma_state_id].matched { let (phrase, group_id) = &self.phrases[phrase_id]; let trunk_end = Self::main_branch_fw(&lattice, lattice_node_id, words.len()); - for (branch_start, depth) in Self::backward_match(phrase, &lattice, lattice_node_id, max_depth) { + for (branch_start, depth) in + Self::backward_match(phrase, &lattice, lattice_node_id, max_depth) + { let trunk_start = Self::main_branch_bw(&lattice, branch_start, words.len()); if inserted_branches.contains(&(group_id, trunk_start, trunk_end)) { continue; @@ -239,10 +329,25 @@ impl<'a> PaRattice<'a> { continue; } let paraphrase = &self.phrases[paraphrase_id].0; - let inserted_first_node_id = Self::insert_branch(&mut lattice, &mut state_id_cache, paraphrase, branch_start, lattice_node_id, depth + 1); - let (state_id_cache_current, state_id_cache_next) = get_two_mut_elems(&mut state_id_cache, branch_start, inserted_first_node_id); + let inserted_first_node_id = Self::insert_branch( + &mut lattice, + &mut state_id_cache, + paraphrase, + branch_start, + lattice_node_id, + depth + 1, + ); + let (state_id_cache_current, state_id_cache_next) = get_two_mut_elems( + &mut state_id_cache, + branch_start, + inserted_first_node_id, + ); for pma_state_id_cached in state_id_cache_current.iter() { - let pma_state_id_new = 
Self::next_pma_state_id(&self.pma, *pma_state_id_cached, &paraphrase[0]); + let pma_state_id_new = Self::next_pma_state_id( + &self.pma, + *pma_state_id_cached, + &paraphrase[0], + ); if !state_id_cache_next.contains(&pma_state_id_new) { // queue added node queue.push_back((inserted_first_node_id, pma_state_id_new)); @@ -267,13 +372,13 @@ impl<'a> PaRattice<'a> { trunk.insert(node_id, orig_node_id); } Lattice { - trunk: trunk, + trunk, capacity: new_lattice.iter().fold(0, |sum, x| sum + x.forwards.len()), lattice: new_lattice, } } - fn main_branch_bw(g: &Vec<LatticeNode>, begin: usize, eos: usize) -> usize { + fn main_branch_bw(g: &[LatticeNode], begin: usize, eos: usize) -> usize { let mut b = begin; while b > eos { b = g[b].backward_main.unwrap().1; @@ -281,7 +386,7 @@ impl<'a> PaRattice<'a> { b } - fn main_branch_fw(g: &Vec<LatticeNode>, end: usize, eos: usize) -> usize { + fn main_branch_fw(g: &[LatticeNode], end: usize, eos: usize) -> usize { let mut e = end; while e > eos { e = g[e].forward_main.unwrap().1; @@ -296,23 +401,30 @@ impl<'a> PaRattice<'a> { let mut backward_map = BTreeMap::new(); for &i in &updated_node_bw { if !lattice[i].backwards.is_empty() { - backward_map.entry(lattice[i].backwards.clone()).or_insert(vec![]).push(i); + backward_map + .entry(lattice[i].backwards.clone()) + .or_insert(vec![]) + .push(i); } } updated_node_bw.clear(); for nodes in backward_map.values() { if nodes.len() >= 2 { for i in 1..nodes.len() { - let backward_tmp = mem::replace(&mut lattice[nodes[i]].backwards, BTreeSet::new()); + let backward_tmp = + mem::replace(&mut lattice[nodes[i]].backwards, BTreeSet::new()); for (edge_str, prev_node_id) in backward_tmp { lattice[prev_node_id].forwards.remove(&(edge_str, nodes[i])); if lattice[prev_node_id].forward_main == Some((edge_str, nodes[i])) { lattice[prev_node_id].forward_main = Some((edge_str, nodes[0])); } } - let forward_tmp = mem::replace(&mut lattice[nodes[i]].forwards, BTreeSet::new()); + let forward_tmp = + mem::replace(&mut 
lattice[nodes[i]].forwards, BTreeSet::new()); for (edge_str, next_node_id) in forward_tmp { - lattice[next_node_id].backwards.remove(&(edge_str, nodes[i])); + lattice[next_node_id] + .backwards + .remove(&(edge_str, nodes[i])); lattice[next_node_id].backwards.insert((edge_str, nodes[0])); if lattice[next_node_id].backward_main == Some((edge_str, nodes[i])) { lattice[next_node_id].backward_main = Some((edge_str, nodes[0])); @@ -329,21 +441,28 @@ impl<'a> PaRattice<'a> { let mut forward_map = BTreeMap::new(); for &i in &updated_node_fw { if !lattice[i].forwards.is_empty() { - forward_map.entry(lattice[i].forwards.clone()).or_insert(vec![]).push(i); + forward_map + .entry(lattice[i].forwards.clone()) + .or_insert(vec![]) + .push(i); } } updated_node_fw.clear(); for nodes in forward_map.values() { if nodes.len() >= 2 { for i in 1..nodes.len() { - let forward_tmp = mem::replace(&mut lattice[nodes[i]].forwards, BTreeSet::new()); + let forward_tmp = + mem::replace(&mut lattice[nodes[i]].forwards, BTreeSet::new()); for (edge_str, next_node_id) in forward_tmp { - lattice[next_node_id].backwards.remove(&(edge_str, nodes[i])); + lattice[next_node_id] + .backwards + .remove(&(edge_str, nodes[i])); if lattice[next_node_id].backward_main == Some((edge_str, nodes[i])) { lattice[next_node_id].backward_main = Some((edge_str, nodes[0])); } } - let backward_tmp = mem::replace(&mut lattice[nodes[i]].backwards, BTreeSet::new()); + let backward_tmp = + mem::replace(&mut lattice[nodes[i]].backwards, BTreeSet::new()); for (edge_str, prev_node_id) in backward_tmp { lattice[prev_node_id].forwards.remove(&(edge_str, nodes[i])); lattice[prev_node_id].forwards.insert((edge_str, nodes[0])); @@ -362,7 +481,7 @@ impl<'a> PaRattice<'a> { } } - fn index_left_to_right(lattice: &Vec<LatticeNode<'a>>) -> Vec<LatticeNode<'a>> { + fn index_left_to_right(lattice: &[LatticeNode<'a>]) -> Vec<LatticeNode<'a>> { let mut node_id_map = vec![0; lattice.len()]; let mut node_id_map_rev = Vec::with_capacity(lattice.len()); let mut queue = VecDeque::new(); @@ 
-388,16 +507,19 @@ impl<'a> PaRattice<'a> { for &(s, prev_node_id) in &lattice[node_id].backwards { new_backwards.insert((s, node_id_map[prev_node_id])); } - let forward_main = lattice[node_id].forward_main.map(|(x, i)| (x, node_id_map[i])); - let backward_main = lattice[node_id].backward_main.map(|(x, i)| (x, node_id_map[i])); - new_lattice.push( - LatticeNode { - forwards: new_forwards, - backwards: new_backwards, - forward_main: forward_main, - backward_main: backward_main, - depth: 0, - }); + let forward_main = lattice[node_id] + .forward_main + .map(|(x, i)| (x, node_id_map[i])); + let backward_main = lattice[node_id] + .backward_main + .map(|(x, i)| (x, node_id_map[i])); + new_lattice.push(LatticeNode { + forwards: new_forwards, + backwards: new_backwards, + forward_main, + backward_main, + depth: 0, + }); } new_lattice } diff --git a/src/utils.rs b/src/utils.rs index 4567cc0..cff4ffe 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -4,7 +4,5 @@ pub fn get_two_mut_elems<'a, T>(x: &'a mut Vec<T>, i: usize, j: usize) -> (&'a m assert!(i != len); assert!(j != len); let ptr = x.as_mut_ptr(); - unsafe { - (ptr.add(i).as_mut().unwrap(), ptr.add(j).as_mut().unwrap()) - } + unsafe { (ptr.add(i).as_mut().unwrap(), ptr.add(j).as_mut().unwrap()) } } diff --git a/tests/lattice_kmp.rs b/tests/lattice_kmp.rs index ba4df2f..8fd21de 100644 --- a/tests/lattice_kmp.rs +++ b/tests/lattice_kmp.rs @@ -5,27 +5,38 @@ use parattice::PaRattice; #[test] fn lattice_kmp_test() { - let paradict - = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], vec!["hematopoietic", "stem", "cell"]], - vec![vec!["造血", "幹", "細胞", "移植"], vec!["hematopoietic", "stem", "cell", "transplantation"]], - vec![vec!["stem", "cell"], vec!["幹", "細胞"]], - vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], vec!["stem", "cell", "rescue"]], - vec![vec!["rescue"], vec!["救命"]], - vec![vec!["blood"], vec!["血液"]]]; - let parattice = PaRattice::new(paradict); - let words = vec!["造血", "幹", "細胞", "移植"]; - let lattice = parattice.get_lattice(&words, true, 10); - let 
let lattice = parattice.get_lattice(&words, true, 10); - let pattern = vec!["幹", "細胞"]; - let kmp = LatticeKMP::new(pattern); - let mut results = kmp.search(&lattice); - results.sort(); - let expected = vec![ - vec![("", 1), ("幹", 10), ("細胞", 15)], - vec![("", 2), ("幹", 7), ("細胞", 13)], - vec![("", 3), ("幹", 9), ("細胞", 13)], - vec![("", 3), ("幹", 9), ("細胞", 14)], - vec![("", 3), ("幹", 10), ("細胞", 15)], - ]; - assert_eq!(expected, results); - + let paradict = vec![ + vec![ + vec!["blood", "stem", "cell"], + vec!["造血", "幹", "細胞"], + vec!["hematopoietic", "stem", "cell"], + ], + vec![ + vec!["造血", "幹", "細胞", "移植"], + vec!["hematopoietic", "stem", "cell", "transplantation"], + ], + vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + vec![ + vec!["幹", "細胞", "移植"], + vec!["rescue", "transplant"], + vec!["stem", "cell", "rescue"], + ], + vec![vec!["rescue"], vec!["救命"]], + vec![vec!["blood"], vec!["血液"]], + ]; + let parattice = PaRattice::new(paradict); + let words = vec!["造血", "幹", "細胞", "移植"]; + let lattice = parattice.get_lattice(&words, true, 10); + let pattern = vec!["幹", "細胞"]; + let kmp = LatticeKMP::new(pattern); + let mut results = kmp.search(&lattice); + results.sort(); + let expected = vec![ + vec![("", 1), ("幹", 10), ("細胞", 15)], + vec![("", 2), ("幹", 7), ("細胞", 13)], + vec![("", 3), ("幹", 9), ("細胞", 13)], + vec![("", 3), ("幹", 9), ("細胞", 14)], + vec![("", 3), ("幹", 10), ("細胞", 15)], + ]; + assert_eq!(expected, results); } diff --git a/tests/parattice.rs b/tests/parattice.rs index b688630..eac8eab 100644 --- a/tests/parattice.rs +++ b/tests/parattice.rs @@ -6,102 +6,204 @@ use parattice::SearchIndexNode; #[test] fn dump_for_search_index_test() { - let paradict - = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], vec!["hematopoietic", "stem", "cell"]], - vec![vec!["造血", "幹", "細胞", "移植"], vec!["hematopoietic", "stem", "cell", "transplantation"]], - vec![vec!["stem", "cell"], vec!["幹", "細胞"]], - vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], 
vec!["stem", "cell", "rescue"]], - vec![vec!["rescue"], vec!["救命"]], - vec![vec!["blood"], vec!["血液"]]]; - let parattice = PaRattice::new(paradict); - let words = vec!["造血", "幹", "細胞", "移植"]; - let lattice = parattice.get_lattice(&words, true, 10); - let index_data = lattice.dump_for_search_index(); - let mut expected = vec![ - ("造血", 0, 3, 0, 1), ("blood", 0, 2, 0, 3), ("血液", 0, 2, 0, 3), ("hematopoietic", 0, 1, 0, 3), - ("stem", 1, 4, 0, 3), ("stem", 1, 6, 0, 4), ("幹", 1, 10, 0, 4), - ("rescue", 2, 11, 0, 4), ("幹", 2, 7, 0, 3), ("stem", 2, 5, 0, 3), ("stem", 2, 6, 0, 4), - ("stem", 3, 8, 1, 3), ("幹", 3, 9, 1, 2), ("幹", 3, 10, 1, 4), ("救命", 3, 11, 1, 4), ("rescue", 3, 11, 1, 4), - ("cell", 4, 13, 0, 3), ("cell", 5, 12, 0, 4), ("cell", 5, 13, 0, 3), ("cell", 6, 15, 0, 4), ("細胞", 7, 13, 0, 3), - ("cell", 8, 13, 1, 3), ("cell", 8, 14, 1, 4), ("細胞", 9, 13, 2, 3), ("細胞", 9, 14, 2, 4), ("細胞", 10, 15, 1, 4), - ("transplant", 11, 16, 1, 4), ("rescue", 12, 16, 0, 4), ("救命", 12, 16, 0, 4), ("移植", 13, 16, 3, 4), - ("rescue", 14, 16, 1, 4), ("transplantation", 15, 16, 0, 4), - ]; - expected.sort(); - assert_eq!(expected, search_index_relative_to_absolute(&index_data)); + let paradict = vec![ + vec![ + vec!["blood", "stem", "cell"], + vec!["造血", "幹", "細胞"], + vec!["hematopoietic", "stem", "cell"], + ], + vec![ + vec!["造血", "幹", "細胞", "移植"], + vec!["hematopoietic", "stem", "cell", "transplantation"], + ], + vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + vec![ + vec!["幹", "細胞", "移植"], + vec!["rescue", "transplant"], + vec!["stem", "cell", "rescue"], + ], + vec![vec!["rescue"], vec!["救命"]], + vec![vec!["blood"], vec!["血液"]], + ]; + let parattice = PaRattice::new(paradict); + let words = vec!["造血", "幹", "細胞", "移植"]; + let lattice = parattice.get_lattice(&words, true, 10); + let index_data = lattice.dump_for_search_index(); + let mut expected = vec![ + ("造血", 0, 3, 0, 1), + ("blood", 0, 2, 0, 3), + ("血液", 0, 2, 0, 3), + ("hematopoietic", 0, 1, 0, 3), + ("stem", 1, 4, 0, 3), + 
("stem", 1, 6, 0, 4), + ("幹", 1, 10, 0, 4), + ("rescue", 2, 11, 0, 4), + ("幹", 2, 7, 0, 3), + ("stem", 2, 5, 0, 3), + ("stem", 2, 6, 0, 4), + ("stem", 3, 8, 1, 3), + ("幹", 3, 9, 1, 2), + ("幹", 3, 10, 1, 4), + ("救命", 3, 11, 1, 4), + ("rescue", 3, 11, 1, 4), + ("cell", 4, 13, 0, 3), + ("cell", 5, 12, 0, 4), + ("cell", 5, 13, 0, 3), + ("cell", 6, 15, 0, 4), + ("細胞", 7, 13, 0, 3), + ("cell", 8, 13, 1, 3), + ("cell", 8, 14, 1, 4), + ("細胞", 9, 13, 2, 3), + ("細胞", 9, 14, 2, 4), + ("細胞", 10, 15, 1, 4), + ("transplant", 11, 16, 1, 4), + ("rescue", 12, 16, 0, 4), + ("救命", 12, 16, 0, 4), + ("移植", 13, 16, 3, 4), + ("rescue", 14, 16, 1, 4), + ("transplantation", 15, 16, 0, 4), + ]; + expected.sort(); + assert_eq!(expected, search_index_relative_to_absolute(&index_data)); } #[test] fn serialize_test() { - let paradict - = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], vec!["hematopoietic", "stem", "cell"]], - vec![vec!["造血", "幹", "細胞", "移植"], vec!["hematopoietic", "stem", "cell", "transplantation"]], - vec![vec!["stem", "cell"], vec!["幹", "細胞"]], - vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], vec!["stem", "cell", "rescue"]], - vec![vec!["rescue"], vec!["救命"]], - vec![vec!["blood"], vec!["血液"]]]; - let parattice = PaRattice::new(paradict); - let words = vec!["造血", "幹", "細胞", "移植"]; - let lattice = parattice.get_lattice(&words, true, 10); - let bytes = lattice.to_bytes(); - let lattice_from_bytes = Lattice::new_from_bytes(&bytes); - assert_eq!(lattice, lattice_from_bytes); + let paradict = vec![ + vec![ + vec!["blood", "stem", "cell"], + vec!["造血", "幹", "細胞"], + vec!["hematopoietic", "stem", "cell"], + ], + vec![ + vec!["造血", "幹", "細胞", "移植"], + vec!["hematopoietic", "stem", "cell", "transplantation"], + ], + vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + vec![ + vec!["幹", "細胞", "移植"], + vec!["rescue", "transplant"], + vec!["stem", "cell", "rescue"], + ], + vec![vec!["rescue"], vec!["救命"]], + vec![vec!["blood"], vec!["血液"]], + ]; + let parattice = 
PaRattice::new(paradict); + let words = vec!["造血", "幹", "細胞", "移植"]; + let lattice = parattice.get_lattice(&words, true, 10); + let bytes = lattice.to_bytes(); + let lattice_from_bytes = Lattice::new_from_bytes(&bytes); + assert_eq!(lattice, lattice_from_bytes); } -fn search_index_relative_to_absolute<'a>(data: &'a Vec<SearchIndexNode>) -> Vec<(&'a str, usize, usize, usize, usize)> { - let mut new_data = vec![]; - let mut node_id = 0; - for node in data { - node_id += node.increment; - new_data.push((node.text, node_id - 1, node_id + node.length - 1, node.offset.0, node.offset.1)); - } - new_data.sort(); - new_data +fn search_index_relative_to_absolute<'a>( + data: &'a Vec<SearchIndexNode>, +) -> Vec<(&'a str, usize, usize, usize, usize)> { + let mut new_data = vec![]; + let mut node_id = 0; + for node in data { + node_id += node.increment; + new_data.push(( + node.text, + node_id - 1, + node_id + node.length - 1, + node.offset.0, + node.offset.1, + )); + } + new_data.sort(); + new_data } #[test] fn get_trunk_span_test() { - let paradict - = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], vec!["hematopoietic", "stem", "cell"]], - vec![vec!["造血", "幹", "細胞", "移植"], vec!["hematopoietic", "stem", "cell", "transplantation"]], - vec![vec!["stem", "cell"], vec!["幹", "細胞"]], - vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], vec!["stem", "cell", "rescue"]], - vec![vec!["rescue"], vec!["救命"]], - vec![vec!["blood"], vec!["血液"]]]; - let parattice = PaRattice::new(paradict); - let words = vec!["造血", "幹", "細胞", "移植"]; - let lattice = parattice.get_lattice(&words, true, 10); - assert_eq!( - vec![("", 0), ("hematopoietic", 1), ("stem", 4), ("cell", 13)], - lattice.get_trunk_span(vec![("", 1), ("stem", 4)])); - assert_eq!( - vec![("", 3), ("rescue", 11), ("transplant", 16)], - lattice.get_trunk_span(vec![("", 3), ("rescue", 11)])); - assert_eq!( - vec![("", 0), ("hematopoietic", 1), ("stem", 6), ("cell", 15), ("transplantation", 16)], - lattice.get_trunk_span(vec![("", 15), 
("transplantation", 16)])); + let paradict = vec![ + vec![ + vec!["blood", "stem", "cell"], + vec!["造血", "幹", "細胞"], + vec!["hematopoietic", "stem", "cell"], + ], + vec![ + vec!["造血", "幹", "細胞", "移植"], + vec!["hematopoietic", "stem", "cell", "transplantation"], + ], + vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + vec![ + vec!["幹", "細胞", "移植"], + vec!["rescue", "transplant"], + vec!["stem", "cell", "rescue"], + ], + vec![vec!["rescue"], vec!["救命"]], + vec![vec!["blood"], vec!["血液"]], + ]; + let parattice = PaRattice::new(paradict); + let words = vec!["造血", "幹", "細胞", "移植"]; + let lattice = parattice.get_lattice(&words, true, 10); + assert_eq!( + vec![("", 0), ("hematopoietic", 1), ("stem", 4), ("cell", 13)], + lattice.get_trunk_span(vec![("", 1), ("stem", 4)]) + ); + assert_eq!( + vec![("", 3), ("rescue", 11), ("transplant", 16)], + lattice.get_trunk_span(vec![("", 3), ("rescue", 11)]) + ); + assert_eq!( + vec![ + ("", 0), + ("hematopoietic", 1), + ("stem", 6), + ("cell", 15), + ("transplantation", 16) + ], + lattice.get_trunk_span(vec![("", 15), ("transplantation", 16)]) + ); } #[test] fn max_depth_test() { - let paradict - = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], vec!["hematopoietic", "stem", "cell"]], - vec![vec!["造血", "幹", "細胞", "移植"], vec!["hematopoietic", "stem", "cell", "transplantation"]], - vec![vec!["stem", "cell"], vec!["幹", "細胞"]], - vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], vec!["stem", "cell", "rescue"]], - vec![vec!["rescue"], vec!["救命"]], - vec![vec!["blood"], vec!["血液"]]]; - let parattice = PaRattice::new(paradict); - let words = vec!["造血", "幹", "細胞", "移植"]; - let lattice = parattice.get_lattice(&words, true, 1); - let index_data = lattice.dump_for_search_index(); - let mut expected = vec![ - ("造血", 0, 3, 0, 1), ("blood", 0, 1, 0, 3), ("hematopoietic", 0, 2, 0, 3), - ("rescue", 3, 6, 1, 4), ("stem", 3, 7, 1, 3), ("幹", 3, 8, 1, 2), ("stem", 1, 4, 0, 3), - ("stem", 2, 4, 0, 3), ("stem", 2, 5, 0, 4), ("cell", 
7, 10, 1, 4), ("cell", 7, 11, 1, 3), - ("細胞", 8, 11, 2, 3), ("cell", 4, 11, 0, 3), ("cell", 5, 9, 0, 4), ("transplant", 6, 12, 1, 4), - ("rescue", 10, 12, 1, 4), ("移植", 11, 12, 3, 4), ("transplantation", 9, 12, 0, 4), - ]; - expected.sort(); - assert_eq!(expected, search_index_relative_to_absolute(&index_data)); + let paradict = vec![ + vec![ + vec!["blood", "stem", "cell"], + vec!["造血", "幹", "細胞"], + vec!["hematopoietic", "stem", "cell"], + ], + vec![ + vec!["造血", "幹", "細胞", "移植"], + vec!["hematopoietic", "stem", "cell", "transplantation"], + ], + vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + vec![ + vec!["幹", "細胞", "移植"], + vec!["rescue", "transplant"], + vec!["stem", "cell", "rescue"], + ], + vec![vec!["rescue"], vec!["救命"]], + vec![vec!["blood"], vec!["血液"]], + ]; + let parattice = PaRattice::new(paradict); + let words = vec!["造血", "幹", "細胞", "移植"]; + let lattice = parattice.get_lattice(&words, true, 1); + let index_data = lattice.dump_for_search_index(); + let mut expected = vec![ + ("造血", 0, 3, 0, 1), + ("blood", 0, 1, 0, 3), + ("hematopoietic", 0, 2, 0, 3), + ("rescue", 3, 6, 1, 4), + ("stem", 3, 7, 1, 3), + ("幹", 3, 8, 1, 2), + ("stem", 1, 4, 0, 3), + ("stem", 2, 4, 0, 3), + ("stem", 2, 5, 0, 4), + ("cell", 7, 10, 1, 4), + ("cell", 7, 11, 1, 3), + ("細胞", 8, 11, 2, 3), + ("cell", 4, 11, 0, 3), + ("cell", 5, 9, 0, 4), + ("transplant", 6, 12, 1, 4), + ("rescue", 10, 12, 1, 4), + ("移植", 11, 12, 3, 4), + ("transplantation", 9, 12, 0, 4), + ]; + expected.sort(); + assert_eq!(expected, search_index_relative_to_absolute(&index_data)); }