From 2388fb056372714abc30d5344810b443acceaaaf Mon Sep 17 00:00:00 2001 From: Koichi Akabe Date: Fri, 19 Jun 2020 15:42:39 +0900 Subject: [PATCH] Initial commit --- .gitignore | 4 + Cargo.toml | 15 + LICENSE | 202 +++++++ README.md | 63 ++ bindings/cpp/CMakeLists.txt | 37 ++ bindings/cpp/include/CMakeLists.txt | 1 + bindings/cpp/include/parattice.hh | 319 ++++++++++ bindings/cpp/test/CMakeLists.txt | 42 ++ bindings/cpp/test/lattice_kmp_test.cc | 45 ++ bindings/cpp/test/parattice_test.cc | 103 ++++ bindings/jni/.gitignore | 7 + bindings/jni/build.gradle | 144 +++++ bindings/jni/settings.gradle | 1 + bindings/jni/src/main/cpp/Lattice.cc | 92 +++ bindings/jni/src/main/cpp/LatticeKMP.cc | 55 ++ bindings/jni/src/main/cpp/NativeLibrary.cc | 30 + bindings/jni/src/main/cpp/PaRattice.cc | 40 ++ bindings/jni/src/main/cpp/internal.hh | 302 ++++++++++ .../jni/src/main/java/parattice/Lattice.java | 164 +++++ .../src/main/java/parattice/LatticeKMP.java | 65 ++ .../src/main/java/parattice/PaRattice.java | 63 ++ .../jni/src/main/java/parattice/Pair.java | 46 ++ .../main/java/parattice/SearchIndexNode.java | 17 + .../java/parattice/internal/JNILoader.java | 44 ++ .../test/java/parattice/LatticeKMPTest.java | 57 ++ .../test/java/parattice/PaRatticeTest.java | 165 +++++ examples/README.md | 81 +++ examples/clojure/project.clj | 5 + .../clojure/src/parattice/example/main.clj | 35 ++ examples/cpp/CMakeLists.txt | 29 + examples/cpp/parattice_example.cc | 47 ++ examples/graph.svg | 569 ++++++++++++++++++ examples/rust/Cargo.toml | 8 + examples/rust/src/main.rs | 53 ++ src/externs.rs | 193 ++++++ src/lattice.rs | 325 ++++++++++ src/lattice_kmp.rs | 96 +++ src/lattice_searcher.rs | 130 ++++ src/lib.rs | 11 + src/parattice.rs | 404 +++++++++++++ src/utils.rs | 10 + tests/lattice_kmp.rs | 31 + tests/parattice.rs | 107 ++++ 43 files changed, 4257 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.toml create mode 100644 LICENSE create mode 100644 README.md create mode 100644 
bindings/cpp/CMakeLists.txt create mode 100644 bindings/cpp/include/CMakeLists.txt create mode 100644 bindings/cpp/include/parattice.hh create mode 100644 bindings/cpp/test/CMakeLists.txt create mode 100644 bindings/cpp/test/lattice_kmp_test.cc create mode 100644 bindings/cpp/test/parattice_test.cc create mode 100644 bindings/jni/.gitignore create mode 100644 bindings/jni/build.gradle create mode 100644 bindings/jni/settings.gradle create mode 100644 bindings/jni/src/main/cpp/Lattice.cc create mode 100644 bindings/jni/src/main/cpp/LatticeKMP.cc create mode 100644 bindings/jni/src/main/cpp/NativeLibrary.cc create mode 100644 bindings/jni/src/main/cpp/PaRattice.cc create mode 100644 bindings/jni/src/main/cpp/internal.hh create mode 100644 bindings/jni/src/main/java/parattice/Lattice.java create mode 100644 bindings/jni/src/main/java/parattice/LatticeKMP.java create mode 100644 bindings/jni/src/main/java/parattice/PaRattice.java create mode 100644 bindings/jni/src/main/java/parattice/Pair.java create mode 100644 bindings/jni/src/main/java/parattice/SearchIndexNode.java create mode 100644 bindings/jni/src/main/java/parattice/internal/JNILoader.java create mode 100644 bindings/jni/src/test/java/parattice/LatticeKMPTest.java create mode 100644 bindings/jni/src/test/java/parattice/PaRatticeTest.java create mode 100644 examples/README.md create mode 100644 examples/clojure/project.clj create mode 100644 examples/clojure/src/parattice/example/main.clj create mode 100644 examples/cpp/CMakeLists.txt create mode 100644 examples/cpp/parattice_example.cc create mode 100644 examples/graph.svg create mode 100644 examples/rust/Cargo.toml create mode 100644 examples/rust/src/main.rs create mode 100644 src/externs.rs create mode 100644 src/lattice.rs create mode 100644 src/lattice_kmp.rs create mode 100644 src/lattice_searcher.rs create mode 100644 src/lib.rs create mode 100644 src/parattice.rs create mode 100644 src/utils.rs create mode 100644 tests/lattice_kmp.rs create mode 100644 
tests/parattice.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fb1a1a4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +target/ +**/*.rs.bk +Cargo.lock +*.swp diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..b775d3d --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "parattice" +version = "0.2.1" +authors = ["Koichi Akabe "] +edition = "2018" + +[dependencies] +libc = "0.2.71" + +[lib] +name = "parattice" +crate-type = ["rlib", "cdylib", "staticlib"] + +[profile.release] +codegen-units = 1 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..b82d088 --- /dev/null +++ b/README.md @@ -0,0 +1,63 @@ +# parattice: Recursive paraphrase lattice generator 🔄 + +This library takes a sentence and a paraphrase corpus, recursively finds +paraphrases based on the corpus, expands the given sentence, and generates a +paraphrase lattice. + +This library also provides a method to search for a phrase in the paraphrase +lattice. + +parattice is implemented in Rust, and this repository also provides C++ and +Java bindings. + +## Build and Installation + +Prerequisites: +* Rust 2018 (>= 1.31) + +Run the following commands: +```shell +cargo build --release +cargo test --release +``` + +### C++ binding + +Prerequisites: +* C++11 compiler (GCC, Clang) +* Googletest + +```shell +mkdir cpp_build +cd cpp_build +cmake ../bindings/cpp -DPARATTICE_BUILD_TESTS=ON -DPARATTICE_GTEST_SOURCE_DIR=${PATH_TO_GOOGLETEST} +make VERBOSE=1 +make test ARGS="-V" +``` + +### Java binding + +Prerequisites: +* JDK (>= 8) and header files + +```shell +gradle -b bindings/jni/build.gradle build +gradle -b bindings/jni/build.gradle publishToMavenLocal +``` + +The above commands generate two JAR files: a normal JAR file and a native library. + +## Examples + +Examples are contained in the [examples](/examples) directory.
+ +## Patents + +* [JP2019153267](https://patentscope2.wipo.int/search/en/detail.jsf?docId=JP274788235) +* [特許第6435467号](https://www.j-platpat.inpit.go.jp/c1800/PU/JP-2019-153267/E7C117D77F8BF276A28A31DC60BF7E4CC5B53B3F230980164BD96541AA9DAA0F/11/ja) + +## License + +Copyright 2020 [Xcoo, Inc.](https://xcoo.jp/) + +Licensed under the [Apache License, Version 2.0](/LICENSE). diff --git a/bindings/cpp/CMakeLists.txt b/bindings/cpp/CMakeLists.txt new file mode 100644 index 0000000..e993617 --- /dev/null +++ b/bindings/cpp/CMakeLists.txt @@ -0,0 +1,37 @@ +cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR) +set(CMAKE_POLICY_DEFAULT_CMP0048 NEW) + +project(parattice VERSION 0.2.1 LANGUAGES CXX) + +option(PARATTICE_BUILD_TESTS "Builds test binaries." OFF) +option(PARATTICE_GTEST_SOURCE_DIR "Source directory of Google Test library." "") + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") + set(CMAKE_CXX_FLAGS "-O3 -Wall -Wextra -Werror -fPIC ${CMAKE_CXX_FLAGS}") + set(PARATTICE_LIBRARIES parattice dl) +elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_CXX_FLAGS "-O3 -Wall -Wextra -Werror -fPIC ${CMAKE_CXX_FLAGS}") + set(PARATTICE_LIBRARIES parattice dl) +else() + message(WARNING "parattice may not support the detected compiler: ${CMAKE_CXX_COMPILER_ID}") +endif() + +include_directories( + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_BINARY_DIR}) + +add_library(parattice STATIC IMPORTED) +set_property( + TARGET parattice + PROPERTY IMPORTED_LOCATION ${PROJECT_SOURCE_DIR}/../../target/release/libparattice.a + ) + +add_subdirectory(include) + +if(PARATTICE_BUILD_TESTS) + enable_testing() + add_subdirectory(test) +endif() diff --git a/bindings/cpp/include/CMakeLists.txt b/bindings/cpp/include/CMakeLists.txt new file mode 100644 index 0000000..517e8b8 --- /dev/null +++ b/bindings/cpp/include/CMakeLists.txt @@ -0,0 +1 @@ +install(FILES parattice.hh DESTINATION include) diff --git a/bindings/cpp/include/parattice.hh 
b/bindings/cpp/include/parattice.hh new file mode 100644 index 0000000..c1206f3 --- /dev/null +++ b/bindings/cpp/include/parattice.hh @@ -0,0 +1,319 @@ +#ifndef PARATTICE_CXX_H +#define PARATTICE_CXX_H + +#include +#include +#include +#include +#include +#include + +extern "C" { + char* parattice_free_string(char* s); + char* parattice_free_bytes(std::uint8_t* bytes, std::size_t length); + void* parattice_parattice_new(const char*** const* dict); + void parattice_parattice_free(void* parattice); + void* parattice_parattice_get_lattice(void const* parattice, const char* const* words, std::size_t length, bool shrink, std::size_t max_depth); + void parattice_lattice_free(void* parattice); + std::size_t parattice_lattice_get_size(void const* lattice); + std::size_t parattice_lattice_get_required_capacity(void const* lattice); + void* parattice_lattice_new_from_bytes(std::uint8_t const* data, const std::size_t length); + std::uint8_t* parattice_lattice_to_bytes(void const* lattice, std::size_t* length); + char* parattice_lattice_dump_dot(void const* lattice, bool is_numbered); + std::size_t parattice_lattice_get_trunk_span(void const* lattice, const char* const* edge_labels, std::size_t const* node_ids, std::size_t length, const char** new_edge_labels, std::size_t* new_edge_label_length, std::size_t* new_node_ids); + void parattice_lattice_get_trunk_spans(void const* lattice, std::size_t* trunk_lefts, std::size_t* trunk_rights); + std::size_t parattice_lattice_dump_for_search_index(void const* lattice, const char** texts, std::size_t* text_lengths, std::size_t* offset_starts, std::size_t* offset_ends, std::size_t* increments, std::size_t* lengths); + void* parattice_lattice_kmp_new(const char* const* pattern, std::size_t length); + void parattice_lattice_kmp_free(void* latticekmp); + void* parattice_lattice_kmp_search(void const* latticekmp, void const* lattice); + void parattice_lattice_kmp_free_result(void* results); + std::size_t 
parattice_lattice_kmp_results_size(void const* results); + std::size_t parattice_lattice_kmp_result_length(void const* results, std::size_t index); + void parattice_lattice_kmp_result_nodes(void const* results, std::size_t index, std::size_t* nodes); + void parattice_lattice_kmp_result_edge_labels(void const* results, std::size_t index, const char** edge_labels, std::size_t* edge_label_length); +} + +namespace parattice { + + struct search_index_node { + std::string text; + std::size_t offset_start; + std::size_t offset_end; + std::size_t increment; + std::size_t length; + }; + + class Lattice { + + Lattice(const Lattice&) = delete; + + public: + + Lattice(): ptr_(nullptr), words_({}), data_({}) {} + + Lattice(Lattice&&) = default; + + explicit Lattice(const std::vector& words) + : ptr_(nullptr), words_(words), data_({}) {} + + explicit Lattice(std::vector&& words) + : ptr_(nullptr), words_(std::forward>(words)), data_({}) {} + + explicit Lattice(const std::vector& data) + : ptr_(nullptr), words_({}), data_(data) {} + + explicit Lattice(std::vector&& data) + : ptr_(nullptr), words_({}), data_(std::forward>(data)) {} + + std::size_t get_size() const { + return parattice_lattice_get_size(ptr_.get()); + } + + std::size_t get_required_capacity() const { + return parattice_lattice_get_required_capacity(ptr_.get()); + } + + std::string dump_dot(bool is_numbered) const { + char* s = parattice_lattice_dump_dot(ptr_.get(), is_numbered); + std::string cpp_str(s); + parattice_free_string(s); + return cpp_str; + } + + std::vector> get_trunk_span(const std::vector>& path) const { + std::vector edge_labels; + std::vector node_ids; + edge_labels.reserve(path.size()); + node_ids.reserve(path.size()); + for (auto& edge : path) { + edge_labels.emplace_back(edge.first.c_str()); + node_ids.emplace_back(edge.second); + } + const std::size_t lattice_size = get_size(); + std::vector new_edge_labels(lattice_size); + std::vector new_edge_label_length(lattice_size); + std::vector 
new_node_ids(lattice_size); + const std::size_t s = parattice_lattice_get_trunk_span(ptr_.get(), edge_labels.data(), node_ids.data(), path.size(), new_edge_labels.data(), new_edge_label_length.data(), new_node_ids.data()); + std::vector> result; + result.reserve(s); + for (std::size_t i = 0; i < s; ++i) { + result.emplace_back(std::string(new_edge_labels[i], new_edge_label_length[i]), new_node_ids[i]); + } + return result; + } + + std::vector> get_trunk_spans() const { + const std::size_t lattice_size = get_size(); + std::vector> result; + result.reserve(lattice_size); + std::vector trunk_lefts(lattice_size); + std::vector trunk_rights(lattice_size); + parattice_lattice_get_trunk_spans(ptr_.get(), trunk_lefts.data(), trunk_rights.data()); + for (std::size_t i = 0; i < lattice_size; ++i) { + result.emplace_back(trunk_lefts[i], trunk_rights[i]); + } + return result; + } + + std::vector dump_for_search_index() const { + const std::size_t capacity = get_required_capacity(); + std::vector texts(capacity); + std::vector text_lengths(capacity); + std::vector offset_starts(capacity); + std::vector offset_ends(capacity); + std::vector increments(capacity); + std::vector lengths(capacity); + const std::size_t s = parattice_lattice_dump_for_search_index(ptr_.get(), texts.data(), text_lengths.data(), offset_starts.data(), offset_ends.data(), increments.data(), lengths.data()); + std::vector result; + result.reserve(s); + for (std::size_t i = 0; i < s; ++i) { + result.emplace_back(search_index_node { + std::string(texts[i], text_lengths[i]), + offset_starts[i], + offset_ends[i], + increments[i], + lengths[i], + }); + } + return result; + } + + std::vector to_bytes() const { + std::size_t length; + std::uint8_t* data = parattice_lattice_to_bytes(ptr_.get(), &length); + std::vector result(length); + std::memcpy(result.data(), data, sizeof(std::uint8_t) * length); + parattice_free_bytes(data, length); + return result; + } + + static Lattice from_bytes(const std::vector& data) { + 
Lattice lattice(data); + lattice.ptr_ = std::unique_ptr>(parattice_lattice_new_from_bytes(lattice.data_.data(), lattice.data_.size()), parattice_lattice_free); + return lattice; + } + + static Lattice from_bytes(std::vector&& data) { + Lattice lattice(std::forward>(data)); + lattice.ptr_ = std::unique_ptr>(parattice_lattice_new_from_bytes(lattice.data_.data(), lattice.data_.size()), parattice_lattice_free); + return lattice; + } + + static Lattice from_bytes(std::uint8_t const* data, std::size_t size) { + Lattice lattice; + lattice.ptr_ = std::unique_ptr>(parattice_lattice_new_from_bytes(data, size), parattice_lattice_free); + return lattice; + } + + public: + + std::unique_ptr> ptr_; + std::vector words_; + std::vector data_; + + }; + + class PaRattice { + + PaRattice() = delete; + PaRattice(const PaRattice&) = delete; + + private: + + void init() { + const char**** dict_c = new const char***[dict_.size() + 1]; + for (std::size_t i = 0; i < dict_.size(); ++i) { + dict_c[i] = new const char**[dict_[i].size() + 1]; + for (std::size_t j = 0; j < dict_[i].size(); ++j) { + dict_c[i][j] = new const char*[dict_[i][j].size() + 1]; + for (std::size_t k = 0; k < dict_[i][j].size(); ++k) { + dict_c[i][j][k] = dict_[i][j][k].c_str(); + } + dict_c[i][j][dict_[i][j].size()] = nullptr; + } + dict_c[i][dict_[i].size()] = nullptr; + } + dict_c[dict_.size()] = nullptr; + ptr_ = std::unique_ptr>(parattice_parattice_new(dict_c), parattice_parattice_free); + for (std::size_t i = 0; dict_c[i] != nullptr; ++i) { + for (std::size_t j = 0; dict_c[i][j] != nullptr; ++j) { + delete[] dict_c[i][j]; + } + delete[] dict_c[i]; + } + delete[] dict_c; + } + + public: + + explicit PaRattice(const std::vector>>& dict) + : ptr_(nullptr), dict_(dict) { + init(); + } + + explicit PaRattice(std::vector>>&& dict) + : ptr_(nullptr), dict_(std::forward>>>(dict)) { + init(); + } + + Lattice get_lattice(const std::vector& words, bool shrink, std::size_t max_depth) const { + Lattice lattice(words); + 
std::vector words_c; + words_c.reserve(lattice.words_.size()); + for (std::size_t i = 0; i < lattice.words_.size(); ++i) { + words_c.emplace_back(lattice.words_[i].c_str()); + } + lattice.ptr_ = std::unique_ptr>(parattice_parattice_get_lattice(ptr_.get(), words_c.data(), words_c.size(), shrink, max_depth), parattice_lattice_free); + return lattice; + } + + Lattice get_lattice(std::vector&& words, bool shrink, std::size_t max_depth) const { + Lattice lattice(std::forward>(words)); + std::vector words_c; + words_c.reserve(lattice.words_.size()); + for (std::size_t i = 0; i < lattice.words_.size(); ++i) { + words_c.emplace_back(lattice.words_[i].c_str()); + } + lattice.ptr_ = std::unique_ptr>(parattice_parattice_get_lattice(ptr_.get(), words_c.data(), words_c.size(), shrink, max_depth), parattice_lattice_free); + return lattice; + } + + Lattice get_lattice(const std::vector& words, bool shrink, std::size_t max_depth) const { + Lattice lattice; + lattice.ptr_ = std::unique_ptr>(parattice_parattice_get_lattice(ptr_.get(), words.data(), words.size(), shrink, max_depth), parattice_lattice_free); + return lattice; + } + + Lattice get_lattice(std::vector&& words, bool shrink, std::size_t max_depth) const { + Lattice lattice; + lattice.ptr_ = std::unique_ptr>(parattice_parattice_get_lattice(ptr_.get(), words.data(), words.size(), shrink, max_depth), parattice_lattice_free); + return lattice; + } + + private: + + std::unique_ptr> ptr_; + std::vector>> dict_; + + }; + + class LatticeKMP { + + LatticeKMP() = delete; + LatticeKMP(const LatticeKMP&) = delete; + + public: + + explicit LatticeKMP(const std::vector& pattern) + : ptr_(nullptr), pattern_(pattern) { + std::vector pattern_c; + pattern_c.reserve(pattern_.size()); + for (auto& s : pattern_) { + pattern_c.emplace_back(s.c_str()); + } + ptr_ = std::unique_ptr>(parattice_lattice_kmp_new(pattern_c.data(), pattern_c.size()), parattice_lattice_kmp_free); + } + + explicit LatticeKMP(std::vector&& pattern) + : ptr_(nullptr), 
pattern_(std::forward>(pattern)) { + std::vector pattern_c; + pattern_c.reserve(pattern_.size()); + for (auto& s : pattern_) { + pattern_c.emplace_back(s.c_str()); + } + ptr_ = std::unique_ptr>(parattice_lattice_kmp_new(pattern_c.data(), pattern_c.size()), parattice_lattice_kmp_free); + } + + std::vector>> search(const Lattice& lattice) const { + void* search_result = parattice_lattice_kmp_search(ptr_.get(), lattice.ptr_.get()); + std::vector>> results; + std::size_t s = parattice_lattice_kmp_results_size(search_result); + results.reserve(s); + for (std::size_t i = 0; i < s; ++i) { + std::size_t l = parattice_lattice_kmp_result_length(search_result, i); + std::vector str_vec(l); + std::vector str_len_vec(l); + std::vector node_vec(l); + parattice_lattice_kmp_result_edge_labels(search_result, i, str_vec.data(), str_len_vec.data()); + parattice_lattice_kmp_result_nodes(search_result, i, node_vec.data()); + std::vector> result; + result.reserve(l); + for (std::size_t j = 0; j < l; ++j) { + result.emplace_back(std::string(str_vec[j], str_len_vec[j]), node_vec[j]); + } + results.emplace_back(result); + } + parattice_lattice_kmp_free_result(search_result); + return results; + } + + private: + + std::unique_ptr> ptr_; + std::vector pattern_; + + }; + +} // namespace parattice + +#endif // PARATTICE_CXX_H diff --git a/bindings/cpp/test/CMakeLists.txt b/bindings/cpp/test/CMakeLists.txt new file mode 100644 index 0000000..454a6e9 --- /dev/null +++ b/bindings/cpp/test/CMakeLists.txt @@ -0,0 +1,42 @@ +if(PARATTICE_GTEST_SOURCE_DIR) + include(ExternalProject) + ExternalProject_Add( + GTest + SOURCE_DIR ${PARATTICE_GTEST_SOURCE_DIR} + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/googletest-build + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) + add_subdirectory( + ${PARATTICE_GTEST_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR}/googletest-build + EXCLUDE_FROM_ALL + ) + set(GTEST_BOTH_LIBRARIES gtest gtest_main) +else() + find_package(GTest REQUIRED) 
+endif() + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR} + ${GTEST_INCLUDE_DIRS} +) + +function(parattice_test name) + add_executable(${name}_test + ${name}_test.cc) + if(PARATTICE_GTEST_SOURCE_DIR) + add_dependencies(${name}_test GTest) + endif() + target_link_libraries(${name}_test ${PARATTICE_LIBRARIES} ${GTEST_BOTH_LIBRARIES} pthread) + add_test( + NAME ${name}_test + COMMAND ${name}_test + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) +endfunction() + +parattice_test(parattice) +parattice_test(lattice_kmp) diff --git a/bindings/cpp/test/lattice_kmp_test.cc b/bindings/cpp/test/lattice_kmp_test.cc new file mode 100644 index 0000000..82cc784 --- /dev/null +++ b/bindings/cpp/test/lattice_kmp_test.cc @@ -0,0 +1,45 @@ +#include + +#include +#include +#include + +#include + +namespace parattice { + + class LatticeKMPTest : public testing::Test { + protected: + std::vector>> paradict = {}; + + void SetUp() override { + paradict = { + {{"blood", "stem", "cell"}, {"造血", "幹", "細胞"}, {"hematopoietic", "stem", "cell"}}, + {{"造血", "幹", "細胞", "移植"}, {"hematopoietic", "stem", "cell", "transplantation"}}, + {{"stem", "cell"}, {"幹", "細胞"}}, + {{"幹", "細胞", "移植"}, {"rescue", "transplant"}, {"stem", "cell", "rescue"}}, + {{"rescue"}, {"救命"}}, + {{"blood"}, {"血液"}}, + }; + } + }; + + TEST_F(LatticeKMPTest, SearchTest) { + PaRattice parattice(paradict); + const std::vector words = {"造血", "幹", "細胞", "移植"}; + const Lattice lattice = parattice.get_lattice(words, true, 10); + const std::vector pattern = {"幹", "細胞"}; + const LatticeKMP kmp(pattern); + std::vector>> results = kmp.search(lattice); + std::sort(results.begin(), results.end()); + std::vector>> expected = { + {{"", 1}, {"幹", 10}, {"細胞", 15}}, + {{"", 2}, {"幹", 7}, {"細胞", 13}}, + {{"", 3}, {"幹", 9}, {"細胞", 13}}, + {{"", 3}, {"幹", 9}, {"細胞", 14}}, + {{"", 3}, {"幹", 10}, {"細胞", 15}}, + }; + EXPECT_EQ(expected, results); + } + +} // namespace parattice diff --git a/bindings/cpp/test/parattice_test.cc 
b/bindings/cpp/test/parattice_test.cc new file mode 100644 index 0000000..6c8ce2c --- /dev/null +++ b/bindings/cpp/test/parattice_test.cc @@ -0,0 +1,103 @@ +#include + +#include +#include +#include + +#include + +namespace parattice { + + std::vector> search_index_relative_to_absolute(const std::vector& data) { + std::vector> new_data; + std::size_t node_id = 0; + for (auto& node : data) { + node_id += node.increment; + new_data.emplace_back(node.text, node_id - 1, node_id + node.length - 1, node.offset_start, node.offset_end); + } + std::sort(new_data.begin(), new_data.end()); + return new_data; + } + + class PaRatticeTest : public testing::Test { + protected: + std::vector>> paradict = {}; + + void SetUp() override { + paradict = { + {{"blood", "stem", "cell"}, {"造血", "幹", "細胞"}, {"hematopoietic", "stem", "cell"}}, + {{"造血", "幹", "細胞", "移植"}, {"hematopoietic", "stem", "cell", "transplantation"}}, + {{"stem", "cell"}, {"幹", "細胞"}}, + {{"幹", "細胞", "移植"}, {"rescue", "transplant"}, {"stem", "cell", "rescue"}}, + {{"rescue"}, {"救命"}}, + {{"blood"}, {"血液"}}, + }; + } + }; + + TEST_F(PaRatticeTest, DumpForSearchIndex) { + PaRattice parattice(paradict); + const std::vector words = {"造血", "幹", "細胞", "移植"}; + const Lattice lattice = parattice.get_lattice(words, true, 10); + const std::vector index_data = lattice.dump_for_search_index(); + using idxitem = std::tuple; + std::vector> expected = { + idxitem {"造血", 0, 3, 0, 1}, idxitem {"blood", 0, 2, 0, 3}, idxitem {"血液", 0, 2, 0, 3}, idxitem {"hematopoietic", 0, 1, 0, 3}, + idxitem {"stem", 1, 4, 0, 3}, idxitem {"stem", 1, 6, 0, 4}, idxitem {"幹", 1, 10, 0, 4}, + idxitem {"rescue", 2, 11, 0, 4}, idxitem {"幹", 2, 7, 0, 3}, idxitem {"stem", 2, 5, 0, 3}, idxitem {"stem", 2, 6, 0, 4}, + idxitem {"stem", 3, 8, 1, 3}, idxitem {"幹", 3, 9, 1, 2}, idxitem {"幹", 3, 10, 1, 4}, idxitem {"救命", 3, 11, 1, 4}, idxitem {"rescue", 3, 11, 1, 4}, + idxitem {"cell", 4, 13, 0, 3}, idxitem {"cell", 5, 12, 0, 4}, idxitem {"cell", 5, 13, 0, 3}, 
idxitem {"cell", 6, 15, 0, 4}, idxitem {"細胞", 7, 13, 0, 3}, + idxitem {"cell", 8, 13, 1, 3}, idxitem {"cell", 8, 14, 1, 4}, idxitem {"細胞", 9, 13, 2, 3}, idxitem {"細胞", 9, 14, 2, 4}, idxitem {"細胞", 10, 15, 1, 4}, + idxitem {"transplant", 11, 16, 1, 4}, idxitem {"rescue", 12, 16, 0, 4}, idxitem {"救命", 12, 16, 0, 4}, idxitem {"移植", 13, 16, 3, 4}, + idxitem {"rescue", 14, 16, 1, 4}, idxitem {"transplantation", 15, 16, 0, 4}, + }; + std::sort(expected.begin(), expected.end()); + EXPECT_EQ(expected, search_index_relative_to_absolute(index_data)); + } + + TEST_F(PaRatticeTest, Serialize) { + PaRattice parattice(paradict); + const std::vector words = {"造血", "幹", "細胞", "移植"}; + const Lattice lattice = parattice.get_lattice(words, true, 10); + const std::vector bytes = lattice.to_bytes(); + const Lattice deserialized_lattice = Lattice::from_bytes(bytes); + EXPECT_EQ(lattice.dump_dot(true), deserialized_lattice.dump_dot(true)); + } + + TEST_F(PaRatticeTest, GetTrunkSpanTest) { + PaRattice parattice(paradict); + const std::vector words = {"造血", "幹", "細胞", "移植"}; + const Lattice lattice = parattice.get_lattice(words, true, 10); + const std::vector>> test_case = { + {{"", 1}, {"stem", 4}}, + {{"", 3}, {"rescue", 11}}, + {{"", 15}, {"transplantation", 16}}, + }; + const std::vector>> expected = { + {{"", 0}, {"hematopoietic", 1}, {"stem", 4}, {"cell", 13}}, + {{"", 3}, {"rescue", 11}, {"transplant", 16}}, + {{"", 0}, {"hematopoietic", 1}, {"stem", 6}, {"cell", 15}, {"transplantation", 16}}, + }; + for (std::size_t i = 0; i < test_case.size(); ++i) { + EXPECT_EQ(expected[i], lattice.get_trunk_span(test_case[i])); + } + } + + TEST_F(PaRatticeTest, MaxDepth) { + PaRattice parattice(paradict); + const std::vector words = {"造血", "幹", "細胞", "移植"}; + const Lattice lattice = parattice.get_lattice(words, true, 1); + const std::vector index_data = lattice.dump_for_search_index(); + using idxitem = std::tuple; + std::vector> expected = { + idxitem {"造血", 0, 3, 0, 1}, idxitem {"blood", 0, 1, 
0, 3}, idxitem {"hematopoietic", 0, 2, 0, 3}, + idxitem {"rescue", 3, 6, 1, 4}, idxitem {"stem", 3, 7, 1, 3}, idxitem {"幹", 3, 8, 1, 2}, idxitem {"stem", 1, 4, 0, 3}, + idxitem {"stem", 2, 4, 0, 3}, idxitem {"stem", 2, 5, 0, 4}, idxitem {"cell", 7, 10, 1, 4}, idxitem {"cell", 7, 11, 1, 3}, + idxitem {"細胞", 8, 11, 2, 3}, idxitem {"cell", 4, 11, 0, 3}, idxitem {"cell", 5, 9, 0, 4}, idxitem {"transplant", 6, 12, 1, 4}, + idxitem {"rescue", 10, 12, 1, 4}, idxitem {"移植", 11, 12, 3, 4}, idxitem {"transplantation", 9, 12, 0, 4}, + }; + std::sort(expected.begin(), expected.end()); + EXPECT_EQ(expected, search_index_relative_to_absolute(index_data)); + } + +} // namespace parattice diff --git a/bindings/jni/.gitignore b/bindings/jni/.gitignore new file mode 100644 index 0000000..76c2c17 --- /dev/null +++ b/bindings/jni/.gitignore @@ -0,0 +1,7 @@ +.classpath +.settings/ +.project +.gradle +build/ +bin/ +*.swp diff --git a/bindings/jni/build.gradle b/bindings/jni/build.gradle new file mode 100644 index 0000000..85817b3 --- /dev/null +++ b/bindings/jni/build.gradle @@ -0,0 +1,144 @@ +import org.gradle.internal.os.OperatingSystem; + +plugins { + id "cpp" + id "eclipse" + id "java-library" + id "maven-publish" + id "org.datlowe.maven-publish-auth" version "2.0.2" +} + +allprojects { + repositories { + jcenter() + } + group = "parattice" + version = "0.2.1-SNAPSHOT" + sourceCompatibility = 1.8 + targetCompatibility = 1.8 +} + +test { + useJUnitPlatform { + includeEngines "junit-jupiter" + } +} + +dependencies { + implementation "commons-io:commons-io:2.7" + implementation "org.apache.commons:commons-lang3:3.10" + testImplementation "org.junit.jupiter:junit-jupiter-api:5.6.2" + testImplementation "org.junit.jupiter:junit-jupiter-engine:5.6.2" +} + +def nativeHeadersDir = file("$buildDir/nativeHeaders") + +task nativeHeaders { + def sourceFiles = [ + "$projectDir/src/main/java/parattice/Lattice.java", + "$projectDir/src/main/java/parattice/LatticeKMP.java", + 
"$projectDir/src/main/java/parattice/PaRattice.java", + ] + inputs.files sourceSets.main.output + inputs.property("classes", classes) + doLast { + nativeHeadersDir.mkdirs() + exec { + executable org.gradle.internal.jvm.Jvm.current().getExecutable("javac") + args "-h", nativeHeadersDir + args "-d", "$projectDir/build/tmp" + args "-classpath", sourceSets.main.output.classesDirs.asPath + args "-Xlint:deprecation" + args sourceFiles + } + } +} + +model { + platforms { + if (OperatingSystem.current().isLinux()) { + linux_amd64 { + architecture "amd64" + operatingSystem "linux" + } + } else if (OperatingSystem.current().isMacOsX()) { + darwin_amd64 { + architecture "amd64" + operatingSystem "darwin" + } + } + } + components { + nativePlatform(NativeLibrarySpec) { + baseName "parattice" + $.platforms.each { p -> + targetPlatform p.name + } + sources { + cpp { + source.srcDirs = ["src/shared/cpp", "src/main/cpp"] + exportedHeaders.srcDirs = ["src/shared/headers"] + } + } + } + all { + binaries.all { + cppCompiler.args "-std=c++11" + cppCompiler.args "-O3", "-Wall", "-Wextra", "-Werror" + cppCompiler.args "-I", "${org.gradle.internal.jvm.Jvm.current().javaHome}/include" + if (OperatingSystem.current().isLinux()) { + cppCompiler.args "-I", "${org.gradle.internal.jvm.Jvm.current().javaHome}/include/linux" + } else if (OperatingSystem.current().isMacOsX()) { + cppCompiler.args "-I", "${org.gradle.internal.jvm.Jvm.current().javaHome}/include/darwin" + } + cppCompiler.args "-I$nativeHeadersDir" + cppCompiler.args "-I$projectDir/../cpp/include" + linker.args "$projectDir/../../target/release/libparattice.a" + } + binaries.withType(SharedLibraryBinarySpec) { binary -> + def variantName = targetPlatform.name.replace("_", "-") + def taskName = "jar-$variantName" + def nativeJar = project.tasks.findByName(taskName) + if (nativeJar == null) { + nativeJar = project.tasks.create(taskName, Jar) { + archiveBaseName = "parattice-$variantName" + } + artifacts { + runtimeOnly nativeJar + } + 
publishing { + publications { + main(MavenPublication) { + artifact(nativeJar) { + classifier = variantName + } + } + } + } + } + binary.tasks.withType(LinkSharedLibrary) { builderTask -> + nativeJar.into("parattice/platform/$variantName") { from builderTask.linkedFile } + } + project.tasks.test { + classpath.from nativeJar + } + } + } + } +} + +tasks.withType(JavaCompile) { + options.encoding = "UTF-8" +} + +tasks.withType(CppCompile) { task -> + task.dependsOn project.tasks.nativeHeaders +} + +publishing { + publications { + main(MavenPublication) { + from components.java + } + } +} diff --git a/bindings/jni/settings.gradle b/bindings/jni/settings.gradle new file mode 100644 index 0000000..5e11077 --- /dev/null +++ b/bindings/jni/settings.gradle @@ -0,0 +1 @@ +rootProject.name = 'parattice' diff --git a/bindings/jni/src/main/cpp/Lattice.cc b/bindings/jni/src/main/cpp/Lattice.cc new file mode 100644 index 0000000..cbb2edf --- /dev/null +++ b/bindings/jni/src/main/cpp/Lattice.cc @@ -0,0 +1,92 @@ +#include "parattice_Lattice.h" + +#include + +#include "internal.hh" + +namespace parattice { + + namespace jni { + + extern "C" { + + JNIEXPORT void JNICALL Java_parattice_Lattice_jniDelete(JNIEnv*, jobject, jlong handle) { + delete to_object_ptr(handle); + } + + JNIEXPORT jint JNICALL Java_parattice_Lattice_jniGetSize(JNIEnv*, jobject, jlong handle) { + return static_cast(to_object(handle).lattice_.get_size()); + } + + JNIEXPORT jint JNICALL Java_parattice_Lattice_jniGetRequiredCapacity(JNIEnv*, jobject, jlong handle) { + return static_cast(to_object(handle).lattice_.get_required_capacity()); + } + + JNIEXPORT jlong JNICALL Java_parattice_Lattice_jniNewFromBytes(JNIEnv* env, jclass, jbyteArray bytes) { + JNIByteArrayAccess bytes_access(env, bytes); + return to_jlong(new JNILatticeWrapper(Lattice::from_bytes(reinterpret_cast(bytes_access.data()), bytes_access.size()), std::move(bytes_access))); + } + + JNIEXPORT jbyteArray JNICALL Java_parattice_Lattice_jniToBytes(JNIEnv* 
env, jobject, jlong handle) { + const auto data = to_object(handle).lattice_.to_bytes(); + jbyteArray bytes = env->NewByteArray(static_cast(data.size())); + JNIByteArrayAccess bytes_access(env, bytes); + std::memcpy(bytes_access.data(), data.data(), sizeof(std::uint8_t) * data.size()); + return bytes; + } + + JNIEXPORT jstring JNICALL Java_parattice_Lattice_jniDumpDot(JNIEnv* env, jobject, jlong handle, jboolean is_numbered) { + return env->NewStringUTF(to_object(handle).lattice_.dump_dot(is_numbered).c_str()); + } + + JNIEXPORT jint JNICALL Java_parattice_Lattice_jniGetTrunkSpan(JNIEnv* env, jobject, jlong handle, jobjectArray path_string, jintArray path_node_id, jobjectArray result_string, jintArray result_node_id) { + const JNIObjectArrayAccess path_string_access(env, path_string); + const JNIIntArrayAccess path_node_id_access(env, path_node_id); + JNIObjectArrayAccess result_string_access(env, result_string); + JNIIntArrayAccess result_node_id_access(env, result_node_id); + std::vector> path_vector; + path_vector.reserve(path_string_access.size()); + for (std::size_t i = 0; i < path_string_access.size(); ++i) { + const JNIStringAccess str_access(env, static_cast(path_string_access.get(i))); + path_vector.emplace_back(str_access.get_string(), path_node_id_access.data()[i]); + } + const auto new_path = to_object(handle).lattice_.get_trunk_span(path_vector); + for (std::size_t i = 0; i < new_path.size(); ++i) { + result_string_access.set(i, env->NewStringUTF(new_path.at(i).first.c_str())); + result_node_id_access.data()[i] = static_cast(new_path.at(i).second); + } + return static_cast(new_path.size()); + } + + JNIEXPORT void JNICALL Java_parattice_Lattice_jniGetTrunkSpans(JNIEnv* env, jobject, jlong handle, jintArray left_trunks, jintArray right_trunks) { + const auto trunk_spans = to_object(handle).lattice_.get_trunk_spans(); + JNIIntArrayAccess left_trunks_access(env, left_trunks); + JNIIntArrayAccess right_trunks_access(env, right_trunks); + for (std::size_t i 
= 0; i < trunk_spans.size(); ++i) { + left_trunks_access.data()[i] = static_cast(trunk_spans[i].first); + right_trunks_access.data()[i] = static_cast(trunk_spans[i].second); + } + } + + JNIEXPORT jint JNICALL Java_parattice_Lattice_jniDumpForSearchIndex(JNIEnv* env, jobject, jlong handle, jobjectArray texts, jintArray offset_starts, jintArray offset_ends, jintArray increments, jintArray lengths) { + const auto search_index_nodes = to_object(handle).lattice_.dump_for_search_index(); + JNIObjectArrayAccess texts_access(env, texts); + JNIIntArrayAccess offset_starts_access(env, offset_starts); + JNIIntArrayAccess offset_ends_access(env, offset_ends); + JNIIntArrayAccess increments_access(env, increments); + JNIIntArrayAccess lengths_access(env, lengths); + for (std::size_t i = 0; i < search_index_nodes.size(); ++i) { + texts_access.set(i, env->NewStringUTF(search_index_nodes[i].text.c_str())); + offset_starts_access.data()[i] = static_cast(search_index_nodes[i].offset_start); + offset_ends_access.data()[i] = static_cast(search_index_nodes[i].offset_end); + increments_access.data()[i] = static_cast(search_index_nodes[i].increment); + lengths_access.data()[i] = static_cast(search_index_nodes[i].length); + } + return static_cast(search_index_nodes.size()); + } + + } + + } + +} diff --git a/bindings/jni/src/main/cpp/LatticeKMP.cc b/bindings/jni/src/main/cpp/LatticeKMP.cc new file mode 100644 index 0000000..31f6763 --- /dev/null +++ b/bindings/jni/src/main/cpp/LatticeKMP.cc @@ -0,0 +1,55 @@ +#include "parattice_LatticeKMP.h" + +#include + +#include "internal.hh" + +namespace parattice { + + namespace jni { + + extern "C" { + + JNIEXPORT jlong JNICALL Java_parattice_LatticeKMP_jniNew(JNIEnv* env, jobject, jobjectArray pattern) { + const JNIObjectArrayAccess pattern_access(env, pattern); + return to_jlong(new LatticeKMP(to_string_vector(env, pattern_access))); + } + + JNIEXPORT void JNICALL Java_parattice_LatticeKMP_jniDelete(JNIEnv*, jobject, jlong handle) { + delete 
to_object_ptr(handle); + } + + JNIEXPORT jlong JNICALL Java_parattice_LatticeKMP_jniSearch(JNIEnv*, jclass, jlong handle, jlong lattice_handle) { + return to_jlong(new std::vector>>(to_object(handle).search(to_object(lattice_handle)))); + } + + JNIEXPORT jintArray JNICALL Java_parattice_LatticeKMP_jniSearchLength(JNIEnv* env, jclass, jlong result_handle) { + const auto& result = to_object>>>(result_handle); + jintArray sizes = env->NewIntArray(static_cast(result.size())); + JNIIntArrayAccess sizes_access(env, sizes); + for(std::size_t i = 0; i < result.size(); ++i){ + sizes_access.data()[i] = static_cast(result[i].size()); + } + return sizes; + } + + JNIEXPORT void JNICALL Java_parattice_LatticeKMP_jniSearchGetDataAndFree(JNIEnv* env, jclass, jlong result_handle, jobjectArray result_string, jintArray result_node_id) { + JNIObjectArrayAccess result_string_access(env, result_string); + JNIIntArrayAccess result_node_id_access(env, result_node_id); + auto results = to_object_ptr>>>(result_handle); + std::size_t i = 0; + for (auto& result : *results) { + for (auto& node : result) { + result_string_access.set(i, env->NewStringUTF(node.first.c_str())); + result_node_id_access.data()[i] = static_cast(node.second); + ++i; + } + } + delete results; + } + + } // end extern "C" + + } // namespace jni + +} // namespace parattice diff --git a/bindings/jni/src/main/cpp/NativeLibrary.cc b/bindings/jni/src/main/cpp/NativeLibrary.cc new file mode 100644 index 0000000..6928f7b --- /dev/null +++ b/bindings/jni/src/main/cpp/NativeLibrary.cc @@ -0,0 +1,30 @@ +#include + +#include "internal.hh" + +namespace parattice { + + namespace jni { + + extern "C" { + + jint JNI_OnLoad(JavaVM* vm, void*) { + + JNIEnv* env; + if (vm->GetEnv(reinterpret_cast(&env), JNI_VERSION_1_8) != JNI_OK) { + return JNI_ERR; + } + + return JNI_VERSION_1_8; + } + + void JNI_OnUnload(JavaVM* vm, void*) { + JNIEnv* env; + vm->GetEnv(reinterpret_cast(&env), JNI_VERSION_1_8); + } + + } + + } + +} diff --git 
a/bindings/jni/src/main/cpp/PaRattice.cc b/bindings/jni/src/main/cpp/PaRattice.cc new file mode 100644 index 0000000..56cb424 --- /dev/null +++ b/bindings/jni/src/main/cpp/PaRattice.cc @@ -0,0 +1,40 @@ +#include "parattice_PaRattice.h" + +#include + +#include "internal.hh" + +namespace parattice { + + namespace jni { + + extern "C" { + + JNIEXPORT jlong JNICALL Java_parattice_PaRattice_jniNew(JNIEnv* env, jobject, jobjectArray arr_dict) { + const JNIObjectArrayAccess arr_dict_access(env, arr_dict); + return to_jlong(new PaRattice(to_string_vector_3d(env, arr_dict_access))); + } + + JNIEXPORT void JNICALL Java_parattice_PaRattice_jniDelete(JNIEnv*, jobject, jlong handle) { + delete to_object_ptr(handle); + } + + JNIEXPORT jlong JNICALL Java_parattice_PaRattice_jniGetLattice(JNIEnv* env, jobject, jlong handle, jobjectArray sentence, jboolean shrink, jint max_depth) { + const JNIObjectArrayAccess sentence_access(env, sentence); + std::vector str_vec; + std::vector str_access_vec; + str_vec.reserve(sentence_access.size()); + str_access_vec.reserve(sentence_access.size()); + for (std::size_t i = 0; i < sentence_access.size(); ++i) { + JNIStringAccess str_access(env, static_cast(sentence_access.get(i))); + str_vec.emplace_back(str_access.data()); + str_access_vec.emplace_back(std::move(str_access)); + } + return to_jlong(new JNILatticeWrapper(to_object(handle).get_lattice(str_vec, shrink, max_depth), std::move(str_access_vec))); + } + + } // end extern "C" + + } // namespace jni + +} // namespace parattice diff --git a/bindings/jni/src/main/cpp/internal.hh b/bindings/jni/src/main/cpp/internal.hh new file mode 100644 index 0000000..ca205ec --- /dev/null +++ b/bindings/jni/src/main/cpp/internal.hh @@ -0,0 +1,302 @@ +#ifndef PARATTICE_INTERNAL_H_ +#define PARATTICE_INTERNAL_H_ + +#include +#include + +namespace parattice { + + namespace jni { + + class JNIStringAccess { + JNIStringAccess() = delete; + JNIStringAccess(const JNIStringAccess&) = delete; + + public: + + 
JNIStringAccess(JNIEnv* env, jstring string) + : env_(env) + , string_(string) + , size_(env_->GetStringUTFLength(string_)) + , char_ptr_(env_->GetStringUTFChars(string_, 0)) {} + + JNIStringAccess(JNIStringAccess&& r) noexcept + : env_(r.env_) + , string_(r.string_) + , size_(r.size_) + , char_ptr_(r.char_ptr_) { + r.env_ = nullptr; + r.string_ = nullptr; + r.size_ = 0; + r.char_ptr_ = nullptr; + } + + JNIStringAccess& operator=(JNIStringAccess&& r) noexcept { + env_ = r.env_; + string_ = r.string_; + size_ = r.size_; + char_ptr_ = r.char_ptr_; + r.env_ = nullptr; + r.string_ = nullptr; + r.size_ = 0; + r.char_ptr_ = nullptr; + return *this; + } + + ~JNIStringAccess() { + if (env_) { + env_->ReleaseStringUTFChars(string_, char_ptr_); + env_->DeleteLocalRef(string_); + } + } + + inline const char* data() const { return char_ptr_; } + inline jsize size() const { return size_; } + inline std::string get_string() const { + return std::string(char_ptr_); + } + + private: + JNIEnv* env_; + jstring string_; + jsize size_; + const char* char_ptr_; + + }; // class JNIStringAccess + + class JNIByteArrayAccess { + JNIByteArrayAccess(const JNIByteArrayAccess&) = delete; + + public: + JNIByteArrayAccess() + : env_(nullptr) {} + + JNIByteArrayAccess(JNIEnv* env, jbyteArray array) + : env_(env) + , array_(array) + , size_(env_->GetArrayLength(array_)) + , data_(reinterpret_cast(env_->GetPrimitiveArrayCritical(array_, nullptr))) { + } + + JNIByteArrayAccess(JNIByteArrayAccess&& r) noexcept + : env_(r.env_) + , array_(r.array_) + , size_(r.size_) + , data_(r.data_) { + r.env_ = nullptr; + r.array_ = nullptr; + r.size_ = 0; + r.data_ = nullptr; + } + + JNIByteArrayAccess& operator=(JNIByteArrayAccess&& r) noexcept { + env_ = r.env_; + array_ = r.array_; + size_ = r.size_; + data_ = r.data_; + r.env_ = nullptr; + r.array_ = nullptr; + r.size_ = 0; + r.data_ = nullptr; + return *this; + } + + ~JNIByteArrayAccess() { + if (env_) { + env_->ReleasePrimitiveArrayCritical(array_, data_, 
0); + } + } + + inline std::size_t size() const { return static_cast(size_); } + inline jbyte* data() const { + return data_; + } + inline jbyte* data() { + return data_; + } + + private: + JNIEnv* env_; + jbyteArray array_; + jsize size_; + jbyte* data_; + + }; // class JNIByteArrayAccess + + class JNIIntArrayAccess { + JNIIntArrayAccess() = delete; + JNIIntArrayAccess(const JNIIntArrayAccess&) = delete; + + public: + JNIIntArrayAccess(JNIEnv* env, jintArray array) + : env_(env) + , array_(array) + , size_(env_->GetArrayLength(array_)) + , data_(reinterpret_cast(env_->GetPrimitiveArrayCritical(array_, nullptr))) { + } + + JNIIntArrayAccess(JNIIntArrayAccess&& r) noexcept + : env_(r.env_) + , array_(r.array_) + , size_(r.size_) + , data_(r.data_) { + r.env_ = nullptr; + r.array_ = nullptr; + r.size_ = 0; + r.data_ = nullptr; + } + + JNIIntArrayAccess& operator=(JNIIntArrayAccess&& r) noexcept { + env_ = r.env_; + array_ = r.array_; + size_ = r.size_; + data_ = r.data_; + r.env_ = nullptr; + r.array_ = nullptr; + r.size_ = 0; + r.data_ = nullptr; + return *this; + } + + ~JNIIntArrayAccess() { + if (env_) { + env_->ReleasePrimitiveArrayCritical(array_, data_, 0); + } + } + inline std::size_t size() const { return static_cast(size_); } + inline jint* data() const { + return data_; + } + inline jint* data() { + return data_; + } + + private: + JNIEnv* env_; + jintArray array_; + jsize size_; + jint* data_; + + }; // class JNIIntArrayAccess + + class JNIObjectArrayAccess { + JNIObjectArrayAccess() = delete; + JNIObjectArrayAccess(const JNIObjectArrayAccess&) = delete; + + public: + JNIObjectArrayAccess(JNIEnv* env, jobjectArray array) + : env_(env) + , array_(array) + , size_(env_->GetArrayLength(array_)) { + } + + JNIObjectArrayAccess(JNIObjectArrayAccess&& r) noexcept + : env_(r.env_) + , array_(r.array_) + , size_(r.size_) { + r.env_ = nullptr; + r.array_ = nullptr; + r.size_ = 0; + } + + JNIObjectArrayAccess& operator=(JNIObjectArrayAccess&& r) noexcept { + env_ = 
r.env_; + array_ = r.array_; + size_ = r.size_; + r.env_ = nullptr; + r.array_ = nullptr; + r.size_ = 0; + return *this; + } + + ~JNIObjectArrayAccess() { + if (env_) { + env_->DeleteLocalRef(array_); + } + } + inline std::size_t size() const { return static_cast(size_); } + inline jobject get(std::size_t index) const { + return env_->GetObjectArrayElement(array_, static_cast(index)); + } + inline void set(std::size_t index, jobject value) { + return env_->SetObjectArrayElement(array_, static_cast(index), value); + } + + private: + JNIEnv* env_; + jobjectArray array_; + jsize size_; + + }; // class JNIObjectArrayAccess + + class JNILatticeWrapper { + JNILatticeWrapper() = delete; + JNILatticeWrapper(const JNILatticeWrapper&) = delete; + + public: + JNILatticeWrapper(Lattice&& lattice, std::vector&& sentence) + : lattice_(std::forward(lattice)) + , sentence_(std::forward>(sentence)) {} + + JNILatticeWrapper(Lattice&& lattice, JNIByteArrayAccess&& bytes) + : lattice_(std::forward(lattice)) + , bytes_(std::forward(bytes)) {} + + public: + Lattice lattice_; + + private: + std::vector sentence_; + JNIByteArrayAccess bytes_; + + }; // class JNILatticeWrapper + + inline jlong to_jlong(void* instance) { + return reinterpret_cast(instance); + } + + template + inline T *to_object_ptr(jlong pointer) { + return reinterpret_cast(pointer); + } + + template + inline T &to_object(jlong pointer) { + return *reinterpret_cast(pointer); + } + + inline std::vector to_string_vector(JNIEnv *env, const JNIObjectArrayAccess& array) { + std::vector str_vec; + str_vec.reserve(array.size()); + for (std::size_t i = 0; i < array.size(); ++i) { + const JNIStringAccess str_access(env, static_cast(array.get(i))); + str_vec.emplace_back(str_access.get_string()); + } + return str_vec; + } + + inline std::vector>> to_string_vector_3d(JNIEnv *env, const JNIObjectArrayAccess& array) { + std::vector>> vec; + vec.reserve(array.size()); + for (std::size_t i = 0; i < array.size(); ++i) { + const 
JNIObjectArrayAccess array2(env, static_cast(array.get(i))); + vec.emplace_back(); + vec.back().reserve(array2.size()); + for (std::size_t j = 0; j < array2.size(); ++j) { + const JNIObjectArrayAccess array3(env, static_cast(array2.get(j))); + vec.back().emplace_back(); + vec.back().back().reserve(array3.size()); + for (std::size_t k = 0; k < array3.size(); ++k) { + const JNIStringAccess str_access(env, static_cast(array3.get(k))); + vec.back().back().emplace_back(str_access.get_string()); + } + } + } + return vec; + } + + } // namespace jni + +} // namespace parattice + +#endif // PARATTICE_INTERNAL_H_ diff --git a/bindings/jni/src/main/java/parattice/Lattice.java b/bindings/jni/src/main/java/parattice/Lattice.java new file mode 100644 index 0000000..6f6ec0a --- /dev/null +++ b/bindings/jni/src/main/java/parattice/Lattice.java @@ -0,0 +1,164 @@ +package parattice; + +import java.io.Externalizable; +import java.io.IOException; +import java.io.ObjectInput; +import java.io.ObjectOutput; +import java.util.Arrays; +import java.util.ArrayList; +import java.util.List; + +import parattice.internal.JNILoader; + +public class Lattice implements Externalizable, AutoCloseable { + + public Lattice(long handle) { + this.handle = handle; + } + + public Lattice() {} + + protected long handle = 0; + protected String[] sentence; + + public void dispose() { + if (this.handle == 0) { + return; + } + jniDelete(this.handle); + this.handle = 0; + } + + @Override + public void close() { + dispose(); + } + + public long getHandle() { + return this.handle; + } + + public int getSize() { + if (this.handle == 0) { + throw new IllegalStateException(); + } + return jniGetSize(this.handle); + } + + public byte[] toBytes() { + if (this.handle == 0) { + throw new IllegalStateException(); + } + return jniToBytes(this.handle); + } + + public static Lattice fromBytes(byte[] data) { + return new Lattice(jniNewFromBytes(data)); + } + + public void writeExternal(ObjectOutput out) throws IOException { + 
if (this.handle == 0) { + throw new IllegalStateException(); + } + byte[] data = jniToBytes(this.handle); + out.writeInt(data.length); + out.write(data); + } + + public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException { + if (this.handle == 0) { + int cap = in.readInt(); + byte[] data = new byte[cap]; + int s = 0; + int sizeSum = 0; + do { + if ((s = in.read(data, sizeSum, cap - sizeSum)) == -1) { + break; + } + sizeSum += s; + } while (sizeSum != cap); + if (sizeSum != cap) { + throw new IOException("size mismatch: read " + sizeSum + " bytes, expected " + cap + " bytes"); + } + this.handle = jniNewFromBytes(data); + } else { + throw new IllegalStateException(); + } + } + + public String dumpDot(boolean isNumbered) { + if (this.handle == 0) { + throw new IllegalStateException(); + } + return jniDumpDot(this.handle, isNumbered); + } + + public List> getTrunkSpan(List> path) { + if (this.handle == 0) { + throw new IllegalStateException(); + } + String[] pathString = new String[path.size()]; + int[] pathNodeId = new int[path.size()]; + for (int i = 0; i < path.size(); ++i) { + pathString[i] = path.get(i).first; + pathNodeId[i] = path.get(i).second; + } + int latticeSize = getSize(); + String[] resultString = new String[latticeSize]; + int[] resultNodeId = new int[latticeSize]; + int s = jniGetTrunkSpan(this.handle, pathString, pathNodeId, resultString, resultNodeId); + List> result = new ArrayList<>(); + for (int i = 0; i < s; ++i) { + result.add(new Pair(resultString[i], resultNodeId[i])); + } + return result; + } + + public List> getTrunkSpans() { + if (this.handle == 0) { + throw new IllegalStateException(); + } + final int latticeSize = this.getSize(); + int[] leftTrunks = new int[latticeSize]; + int[] rightTrunks = new int[latticeSize]; + List> result = new ArrayList<>(); + jniGetTrunkSpans(this.handle, leftTrunks, rightTrunks); + for (int i = 0; i < latticeSize; ++i) { + result.add(new Pair(leftTrunks[i], rightTrunks[i])); + } + 
return result; + } + + public List dumpForSearchIndex() { + if (this.handle == 0) { + throw new IllegalStateException(); + } + int capacity = jniGetRequiredCapacity(this.handle); + String[] texts = new String[capacity]; + int[] offsetStarts = new int[capacity]; + int[] offsetEnds = new int[capacity]; + int[] increments = new int[capacity]; + int[] lengths = new int[capacity]; + int s = jniDumpForSearchIndex(this.handle, texts, offsetStarts, offsetEnds, increments, lengths); + List result = new ArrayList<>(); + for (int i = 0; i < s; ++i) { + result.add(new SearchIndexNode(texts[i], offsetStarts[i], offsetEnds[i], increments[i], lengths[i])); + } + return result; + } + + private native void jniDelete(long handle); + private native int jniGetSize(long handle); + private native int jniGetRequiredCapacity(long handle); + private static native long jniNewFromBytes(byte[] data); + private native byte[] jniToBytes(long handle); + private native String jniDumpDot(long handle, boolean isNumbered); + private native int jniGetTrunkSpan(long handle, String[] pathString, int[] pathNodeId, String[] resultString, int[] resultNodeId); + private native void jniGetTrunkSpans(long handle, int[] leftTrunks, int[] rightTrunks); + private native int jniDumpForSearchIndex(long handle, String[] texts, int[] offsetStarts, int[] offsetEnds, int[] increments, int[] lengths); + + static { + JNILoader.loadLibrary(); + } + +} diff --git a/bindings/jni/src/main/java/parattice/LatticeKMP.java b/bindings/jni/src/main/java/parattice/LatticeKMP.java new file mode 100644 index 0000000..47ba16b --- /dev/null +++ b/bindings/jni/src/main/java/parattice/LatticeKMP.java @@ -0,0 +1,65 @@ +package parattice; + +import java.util.ArrayList; +import java.util.List; + +import parattice.internal.JNILoader; + +public class LatticeKMP implements AutoCloseable { + + public LatticeKMP(String[] pattern) { + this.handle = jniNew(pattern); + } + + public LatticeKMP(List pattern) { + this.handle = 
jniNew(pattern.toArray(new String[pattern.size()])); + } + + protected long handle; + + public void dispose() { + if (this.handle == 0) { + return; + } + jniDelete(this.handle); + this.handle = 0; + } + + @Override + public void close() { + dispose(); + } + + public List>> search(Lattice lattice) { + long resultHandle = jniSearch(this.handle, lattice.getHandle()); + int[] resultSizes = jniSearchLength(resultHandle); + int sizeTotal = 0; + for (int resultSize : resultSizes) { + sizeTotal += resultSize; + } + String[] resultString = new String[sizeTotal]; + int[] resultNodeId = new int[sizeTotal]; + jniSearchGetDataAndFree(resultHandle, resultString, resultNodeId); + List>> result = new ArrayList<>(); + int k = 0; + for (int i = 0; i < resultSizes.length; ++i) { + List> resultArr = new ArrayList<>(); + for (int j = 0; j < resultSizes[i]; ++j) { + resultArr.add(new Pair(resultString[k], resultNodeId[k])); + ++k; + } + result.add(resultArr); + } + return result; + } + + private native long jniNew(String[] pattern); + private native void jniDelete(long handle); + private static native long jniSearch(long handle, long latticeHandle); + private static native int[] jniSearchLength(long resultHandle); + private static native void jniSearchGetDataAndFree(long resultHandle, String[] resultString, int[] resultNodeId); + + static { + JNILoader.loadLibrary(); + } +} diff --git a/bindings/jni/src/main/java/parattice/PaRattice.java b/bindings/jni/src/main/java/parattice/PaRattice.java new file mode 100644 index 0000000..13e3bf8 --- /dev/null +++ b/bindings/jni/src/main/java/parattice/PaRattice.java @@ -0,0 +1,63 @@ +package parattice; + +import java.util.List; + +import parattice.internal.JNILoader; + +public class PaRattice implements AutoCloseable { + + protected long handle; + + public PaRattice(String[][][] dict) { + this.handle = jniNew(dict); + } + + public PaRattice(List>> dict) { + String[][][] dictArr = new String[dict.size()][][]; + int i = 0; + for (List> group : dict) 
{ + dictArr[i] = new String[group.size()][]; + int j = 0; + for (List phrase : group) { + dictArr[i][j] = phrase.toArray(new String[phrase.size()]); + ++j; + } + ++i; + } + this.handle = jniNew(dictArr); + } + + public void dispose() { + if (this.handle == 0) { + return; + } + jniDelete(this.handle); + this.handle = 0; + } + + @Override + public void close() { + dispose(); + } + + public Lattice getLattice(String[] sentence, boolean shrink, int max_depth) { + Lattice lattice = new Lattice(jniGetLattice(this.handle, sentence, shrink, max_depth)); + lattice.sentence = sentence; + return lattice; + } + + public Lattice getLattice(List sentence, boolean shrink, int max_depth) { + String[] sentencearr = sentence.toArray(new String[sentence.size()]); + Lattice lattice = new Lattice(jniGetLattice(this.handle, sentencearr, shrink, max_depth)); + lattice.sentence = sentencearr; + return lattice; + } + + private native long jniNew(String[][][] dict); + private native void jniDelete(long handle); + private native long jniGetLattice(long handle, String[] sentence, boolean shrink, int max_depth); + + static { + JNILoader.loadLibrary(); + } +} diff --git a/bindings/jni/src/main/java/parattice/Pair.java b/bindings/jni/src/main/java/parattice/Pair.java new file mode 100644 index 0000000..bdc9b43 --- /dev/null +++ b/bindings/jni/src/main/java/parattice/Pair.java @@ -0,0 +1,46 @@ +package parattice; + +public class Pair, B extends Comparable> implements Comparable> { + public A first; + public B second; + + public Pair(A first, B second) { + this.first = first; + this.second = second; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof Pair)) { + return false; + } + @SuppressWarnings("unchecked") + Pair other = (Pair)obj; + return + (this.first == null ? other.first == null : this.first.equals(other.first)) && + (this.second == null ? 
other.second == null : this.second.equals(other.second)); + } + + @Override + public int compareTo(Pair other){ + int result = this.first.compareTo(other.first); + if (result != 0) { + return result; + } else { + return this.second.compareTo(other.second); + } + } + + @Override + public int hashCode() { + int f = this.first != null ? this.first.hashCode() : 0; + int s = this.second != null ? this.second.hashCode() : 0; + return f ^ s; + } + + public String toString() { + String f = this.first != null ? this.first.toString() : "(null)"; + String s = this.second != null ? this.second.toString() : "(null)"; + return "(" + f + ", " + s + ")"; + } +} diff --git a/bindings/jni/src/main/java/parattice/SearchIndexNode.java b/bindings/jni/src/main/java/parattice/SearchIndexNode.java new file mode 100644 index 0000000..689d124 --- /dev/null +++ b/bindings/jni/src/main/java/parattice/SearchIndexNode.java @@ -0,0 +1,17 @@ +package parattice; + +public class SearchIndexNode { + public String text; + public int offsetStart; + public int offsetEnd; + public int increment; + public int length; + + public SearchIndexNode(String text, int offsetStart, int offsetEnd, int increment, int length) { + this.text = text; + this.offsetStart = offsetStart; + this.offsetEnd = offsetEnd; + this.increment = increment; + this.length = length; + } +} diff --git a/bindings/jni/src/main/java/parattice/internal/JNILoader.java b/bindings/jni/src/main/java/parattice/internal/JNILoader.java new file mode 100644 index 0000000..2159070 --- /dev/null +++ b/bindings/jni/src/main/java/parattice/internal/JNILoader.java @@ -0,0 +1,44 @@ +package parattice.internal; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.security.AccessController; +import java.security.PrivilegedAction; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.SystemUtils; + +import parattice.PaRattice; 
+ +public class JNILoader { + public static void loadLibrary() { + try { + InputStream in; + File tempFile; + if (SystemUtils.IS_OS_LINUX) { + in = PaRattice.class.getResourceAsStream("platform/linux-amd64/libparattice.so"); + tempFile = File.createTempFile("libparattice", ".so"); + } else if (SystemUtils.IS_OS_MAC_OSX) { + in = PaRattice.class.getResourceAsStream("platform/darwin-amd64/libparattice.dylib"); + tempFile = File.createTempFile("libparattice", ".dylib"); + } else { + throw new IOException("Unsupported system"); + } + OutputStream out = FileUtils.openOutputStream(tempFile); + IOUtils.copy(in, out); + in.close(); + out.close(); + AccessController.doPrivileged(new PrivilegedAction() { + public Void run() { + System.load(tempFile.toString()); + return null; + } + }); + } catch (IOException e) { + System.err.println(e); + } + } +} diff --git a/bindings/jni/src/test/java/parattice/LatticeKMPTest.java b/bindings/jni/src/test/java/parattice/LatticeKMPTest.java new file mode 100644 index 0000000..f7243ee --- /dev/null +++ b/bindings/jni/src/test/java/parattice/LatticeKMPTest.java @@ -0,0 +1,57 @@ +package parattice; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +import org.junit.jupiter.api.Test; + +public class LatticeKMPTest { + + static String[][][] paradict = { + {{"blood", "stem", "cell"}, {"造血", "幹", "細胞"}, {"hematopoietic", "stem", "cell"}}, + {{"造血", "幹", "細胞", "移植"}, {"hematopoietic", "stem", "cell", "transplantation"}}, + {{"stem", "cell"}, {"幹", "細胞"}}, + {{"幹", "細胞", "移植"}, {"rescue", "transplant"}, {"stem", "cell", "rescue"}}, + {{"rescue"}, {"救命"}}, + {{"blood"}, {"血液"}}, + }; + + @Test + public void search() { + Comparator>> comparator = new Comparator>>() { + @Override + public int compare(List> o1, List> o2) { + int s = Math.min(o1.size(), o2.size()); + for (int i = 0; i < s; ++i) { + int x = 
o1.get(i).compareTo(o2.get(i)); + if (x != 0) { + return x; + } + } + return o1.size() - o2.size(); + } + }; + try (PaRattice parattice = new PaRattice(paradict)) { + String[] words = {"造血", "幹", "細胞", "移植"}; + try (Lattice lattice = parattice.getLattice(words, true, 10)) { + String[] pattern = {"幹", "細胞"}; + try (LatticeKMP kmp = new LatticeKMP(pattern)) {; + List>> result = kmp.search(lattice); + Collections.sort(result, comparator); + List>> expected = Arrays.asList( + Arrays.asList(new Pair("", 1), new Pair("幹", 10), new Pair("細胞", 15)), + Arrays.asList(new Pair("", 2), new Pair("幹", 7), new Pair("細胞", 13)), + Arrays.asList(new Pair("", 3), new Pair("幹", 9), new Pair("細胞", 13)), + Arrays.asList(new Pair("", 3), new Pair("幹", 9), new Pair("細胞", 14)), + Arrays.asList(new Pair("", 3), new Pair("幹", 10), new Pair("細胞", 15)) + ); + assertEquals(expected, result); + } + } + } + } +} diff --git a/bindings/jni/src/test/java/parattice/PaRatticeTest.java b/bindings/jni/src/test/java/parattice/PaRatticeTest.java new file mode 100644 index 0000000..2287aa6 --- /dev/null +++ b/bindings/jni/src/test/java/parattice/PaRatticeTest.java @@ -0,0 +1,165 @@ +package parattice; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.junit.jupiter.api.Test; + +public class PaRatticeTest { + + static String[][][] paradict = { + {{"blood", "stem", "cell"}, {"造血", "幹", "細胞"}, {"hematopoietic", "stem", "cell"}}, + {{"造血", "幹", "細胞", "移植"}, {"hematopoietic", "stem", "cell", "transplantation"}}, + {{"stem", "cell"}, {"幹", "細胞"}}, + {{"幹", "細胞", "移植"}, {"rescue", "transplant"}, {"stem", "cell", "rescue"}}, + {{"rescue"}, {"救命"}}, + {{"blood"}, {"血液"}}, + }; + + static class EdgeInfo 
implements Comparable { + String text; + int nodeId; + int targetId; + int offsetBegin; + int offsetEnd; + public EdgeInfo(String text, int nodeId, int targetId, int offsetBegin, int offsetEnd) { + this.text = text; + this.nodeId = nodeId; + this.targetId = targetId; + this.offsetBegin = offsetBegin; + this.offsetEnd = offsetEnd; + } + @Override + public int compareTo(EdgeInfo other){ + int result; + result = this.text.compareTo(other.text); + if (result != 0) return result; + result = this.nodeId - other.nodeId; + if (result != 0) return result; + result = this.targetId - other.targetId; + if (result != 0) return result; + result = this.offsetBegin - other.offsetBegin; + if (result != 0) return result; + result = this.offsetEnd - other.offsetEnd; + return result; + } + @Override + public String toString() { + return "(" + text + ", " + nodeId + ", " + targetId + ", " + offsetBegin + ", " + offsetEnd + ")"; + } + @Override + public boolean equals(Object obj) { + if (!(obj instanceof EdgeInfo)) { + return false; + } + EdgeInfo other = (EdgeInfo) obj; + return + this.text.equals(other.text) && + this.nodeId == other.nodeId && + this.targetId == other.targetId && + this.offsetBegin == other.offsetBegin && + this.offsetEnd == other.offsetEnd; + } + } + + static List searchIndexRelativeToAbsolute(List data) { + List new_data = new ArrayList<>(); + int nodeId = 0; + for (SearchIndexNode node : data) { + nodeId += node.increment; + new_data.add(new EdgeInfo(node.text, nodeId - 1, nodeId + node.length - 1, node.offsetStart, node.offsetEnd)); + } + Collections.sort(new_data); + return new_data; + } + + @Test + public void dumpForSearchIndex() { + try (PaRattice parattice = new PaRattice(paradict)) { + String[] words = {"造血", "幹", "細胞", "移植"}; + try (Lattice lattice = parattice.getLattice(words, true, 10)) { + List indexData = lattice.dumpForSearchIndex(); + List expected = Arrays.asList( + new EdgeInfo("造血", 0, 3, 0, 1), new EdgeInfo("blood", 0, 2, 0, 3), new EdgeInfo("血液", 
0, 2, 0, 3), new EdgeInfo("hematopoietic", 0, 1, 0, 3), + new EdgeInfo("stem", 1, 4, 0, 3), new EdgeInfo("stem", 1, 6, 0, 4), new EdgeInfo("幹", 1, 10, 0, 4), + new EdgeInfo("rescue", 2, 11, 0, 4), new EdgeInfo("幹", 2, 7, 0, 3), new EdgeInfo("stem", 2, 5, 0, 3), new EdgeInfo("stem", 2, 6, 0, 4), + new EdgeInfo("stem", 3, 8, 1, 3), new EdgeInfo("幹", 3, 9, 1, 2), new EdgeInfo("幹", 3, 10, 1, 4), new EdgeInfo("救命", 3, 11, 1, 4), new EdgeInfo("rescue", 3, 11, 1, 4), + new EdgeInfo("cell", 4, 13, 0, 3), new EdgeInfo("cell", 5, 12, 0, 4), new EdgeInfo("cell", 5, 13, 0, 3), new EdgeInfo("cell", 6, 15, 0, 4), new EdgeInfo("細胞", 7, 13, 0, 3), + new EdgeInfo("cell", 8, 13, 1, 3), new EdgeInfo("cell", 8, 14, 1, 4), new EdgeInfo("細胞", 9, 13, 2, 3), new EdgeInfo("細胞", 9, 14, 2, 4), new EdgeInfo("細胞", 10, 15, 1, 4), + new EdgeInfo("transplant", 11, 16, 1, 4), new EdgeInfo("rescue", 12, 16, 0, 4), new EdgeInfo("救命", 12, 16, 0, 4), new EdgeInfo("移植", 13, 16, 3, 4), + new EdgeInfo("rescue", 14, 16, 1, 4), new EdgeInfo("transplantation", 15, 16, 0, 4) + ); + Collections.sort(expected); + assertEquals(expected, searchIndexRelativeToAbsolute(indexData)); + } + } + } + + @Test + public void serialize() throws IOException, ClassNotFoundException { + try (PaRattice parattice = new PaRattice(paradict)) { + String[] words = {"造血", "幹", "細胞", "移植"}; + try (Lattice lattice = parattice.getLattice(words, true, 10)) { + ByteArrayOutputStream outs = new ByteArrayOutputStream(2000); + ObjectOutputStream oos = new ObjectOutputStream(outs); + oos.writeObject(lattice); + byte[] bytes = outs.toByteArray(); + ByteArrayInputStream ins = new ByteArrayInputStream(bytes); + ObjectInputStream ois = new ObjectInputStream(ins); + try (Lattice latticeDeserialized = (Lattice) ois.readObject()) { + assertEquals(lattice.dumpDot(true), latticeDeserialized.dumpDot(true)); + } + } + } + } + + @Test + public void getTrunkSpan() { + try (PaRattice parattice = new PaRattice(paradict)) { + String[] words = {"造血", "幹", 
"細胞", "移植"}; + try (Lattice lattice = parattice.getLattice(words, true, 10)) { + List>> testCase = Arrays.asList( + Arrays.asList(new Pair("", 1), new Pair("stem", 4)), + Arrays.asList(new Pair("", 3), new Pair("rescue", 11)), + Arrays.asList(new Pair("", 15), new Pair("transplantation", 16)) + ); + List>> expected = Arrays.asList( + Arrays.asList(new Pair("", 0), new Pair("hematopoietic", 1), new Pair("stem", 4), new Pair("cell", 13)), + Arrays.asList(new Pair("", 3), new Pair("rescue", 11), new Pair("transplant", 16)), + Arrays.asList(new Pair("", 0), new Pair("hematopoietic", 1), new Pair("stem", 6), new Pair("cell", 15), new Pair("transplantation", 16)) + ); + for (int i = 0; i < testCase.size(); ++i) { + assertEquals(expected.get(i), lattice.getTrunkSpan(testCase.get(i))); + } + } + } + } + + @Test + public void maxDepth() { + try (PaRattice parattice = new PaRattice(paradict)) { + String[] words = {"造血", "幹", "細胞", "移植"}; + try (Lattice lattice = parattice.getLattice(words, true, 1)) { + List indexData = lattice.dumpForSearchIndex(); + List expected = Arrays.asList( + new EdgeInfo("造血", 0, 3, 0, 1), new EdgeInfo("blood", 0, 1, 0, 3), new EdgeInfo("hematopoietic", 0, 2, 0, 3), + new EdgeInfo("rescue", 3, 6, 1, 4), new EdgeInfo("stem", 3, 7, 1, 3), new EdgeInfo("幹", 3, 8, 1, 2), new EdgeInfo("stem", 1, 4, 0, 3), + new EdgeInfo("stem", 2, 4, 0, 3), new EdgeInfo("stem", 2, 5, 0, 4), new EdgeInfo("cell", 7, 10, 1, 4), new EdgeInfo("cell", 7, 11, 1, 3), + new EdgeInfo("細胞", 8, 11, 2, 3), new EdgeInfo("cell", 4, 11, 0, 3), new EdgeInfo("cell", 5, 9, 0, 4), new EdgeInfo("transplant", 6, 12, 1, 4), + new EdgeInfo("rescue", 10, 12, 1, 4), new EdgeInfo("移植", 11, 12, 3, 4), new EdgeInfo("transplantation", 9, 12, 0, 4) + ); + Collections.sort(expected); + assertEquals(expected, searchIndexRelativeToAbsolute(indexData)); + } + } + } +} diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..9b74de5 --- /dev/null +++ b/examples/README.md @@ 
-0,0 +1,81 @@ +# Examples + +This directory contains example code that generates a paraphrase lattice and +searches for a phrase in the generated lattice. + +All bindings require `cargo` and `rustc` for building the parattice library. +Install the latest Rust by referring to [this documentation](https://www.rust-lang.org/tools/install). +To convert the generated lattice to an image file, install *Graphviz* beforehand: + +Debian/Ubuntu: +```shell +sudo apt install graphviz +``` + +Mac OS X (Homebrew): +```shell +brew install graphviz +``` + +## Rust + +Just run `cargo run` in the [rust](/examples/rust) directory. + +To convert the generated lattice to an image file, run `dot -Tpng -O ./paraphrase-lattice.dot`. + +## C++ + +1. Install *cmake* as follows: + + Debian/Ubuntu: + ```shell + sudo apt install cmake + ``` + + Mac OS X (Homebrew): + ```shell + brew install cmake + ``` + +2. Run `cargo build --release` in the [top directory](/) of this repository to build the parattice library. +3. Run `cmake . && make` in the [cpp](/examples/cpp) directory to build the example code. +4. Run `./parattice_example`. +5. To convert the generated lattice to an image file, run `dot -Tpng -O ./paraphrase-lattice.dot`. + +## Clojure + +1. Install *Leiningen* by referring to [this documentation](https://leiningen.org/). +2. Install *gradle* as follows: + + Debian/Ubuntu: + ```shell + sudo apt install gradle + ``` + + Mac OS X (Homebrew): + ```shell + brew install gradle + ``` + +2. Run `cargo build --release` in the [top directory](/) of this repository to build the parattice library. +3. Run the following commands in the [top directory](/) of this repository to build and to install JAR files of parattice: + + ```shell + gradle -b bindings/jni/build.gradle build + gradle -b bindings/jni/build.gradle publishToMavenLocal + ``` + +4. 
If you use Mac OS X, please edit [project.clj](/examples/clojure/project.clj) as follows: + + ```clojure + [parattice "0.2.1-SNAPSHOT" :classifier "linux-amd64"] + ``` + ↓↓↓ + ```clojure + [parattice "0.2.1-SNAPSHOT" :classifier "darwin-amd64"] + ``` + +5. Run `lein run` in [clojure](/examples/clojure) directory. +6. To convert the generated lattice to an image file, run `dot -Tpng -O ./paraphrase-lattice.dot`. + +![Example graph](/examples/graph.svg) diff --git a/examples/clojure/project.clj b/examples/clojure/project.clj new file mode 100644 index 0000000..b28c466 --- /dev/null +++ b/examples/clojure/project.clj @@ -0,0 +1,5 @@ +(defproject parattice-example "0.1.0-SNAPSHOT" + :dependencies [[org.clojure/clojure "1.10.0"] + [parattice "0.2.1-SNAPSHOT"] + [parattice "0.2.1-SNAPSHOT" :classifier "linux-amd64"]] + :main parattice.example.main) diff --git a/examples/clojure/src/parattice/example/main.clj b/examples/clojure/src/parattice/example/main.clj new file mode 100644 index 0000000..4d9e014 --- /dev/null +++ b/examples/clojure/src/parattice/example/main.clj @@ -0,0 +1,35 @@ +(ns parattice.example.main + (:import parattice.PaRattice + parattice.Lattice + parattice.LatticeKMP)) + +(def paradict [[["blood" "stem" "cell"] ["造血" "幹" "細胞"] ["hematopoietic" "stem" "cell"]] + [["造血" "幹" "細胞" "移植"] ["hematopoietic" "stem" "cell" "transplantation"]] + [["stem" "cell"] ["幹" "細胞"]] + [["幹" "細胞" "移植"] ["rescue" "transplant"] ["stem" "cell" "rescue"]] + [["rescue"] ["救命"]] + [["blood"] ["血液"]]]) +(def words ["造血" "幹" "細胞" "移植"]) + +(defn -main + [& args] + (with-open [;; initialization + parattice (PaRattice. paradict) + ;; lattice generation + lattice (.getLattice parattice words true 2)] + ;; dump a generated lattice + (spit "paraphrase-lattice.dot" (.dumpDot lattice true)) + ;; serialization + (let [b (.toBytes lattice) + results (with-open [;; deserialization + new-lattice (Lattice/fromBytes b) + ;; search + kmp (LatticeKMP. 
["幹" "細胞"])] + (.search kmp new-lattice))] + (doseq [result results] + (doseq [edge result] + (print (str "(" (.first edge) ", " (.second edge) ") "))) + (println) + (doseq [edge (.getTrunkSpan lattice result)] + (print (str "(" (.first edge) ", " (.second edge) ") "))) + (println "\n==========="))))) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt new file mode 100644 index 0000000..1ade204 --- /dev/null +++ b/examples/cpp/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR) +set(CMAKE_POLICY_DEFAULT_CMP0048 NEW) + +project(parattice-example VERSION 0.1.0 LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") + set(CMAKE_CXX_FLAGS "-O3 -Wall -Wextra -Werror -fPIC ${CMAKE_CXX_FLAGS}") + set(PARATTICE_LIBRARIES parattice dl) +elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_CXX_FLAGS "-O3 -Wall -Wextra -Werror -fPIC ${CMAKE_CXX_FLAGS}") + set(PARATTICE_LIBRARIES parattice dl) +else() + message(WARNING "parattice may not support the detected compiler: ${CMAKE_CXX_COMPILER_ID}") +endif() + +include_directories( + ${PROJECT_SOURCE_DIR}/../../bindings/cpp/include + ${PROJECT_BINARY_DIR}) + +add_library(parattice SHARED IMPORTED) +set_property( + TARGET parattice + PROPERTY IMPORTED_LOCATION ${PROJECT_SOURCE_DIR}/../../target/release/libparattice.a + ) +add_executable(parattice_example parattice_example.cc) +target_link_libraries(parattice_example parattice dl pthread) diff --git a/examples/cpp/parattice_example.cc b/examples/cpp/parattice_example.cc new file mode 100644 index 0000000..58b3c13 --- /dev/null +++ b/examples/cpp/parattice_example.cc @@ -0,0 +1,47 @@ +#include "parattice.hh" +#include +#include + +int main() { + // initialization + const std::vector>> paradict + = {{{"blood", "stem", "cell"}, {"造血", "幹", "細胞"}, {"hematopoietic", "stem", "cell"}}, + {{"造血", "幹", "細胞", "移植"}, {"hematopoietic", "stem", "cell", "transplantation"}}, 
+ {{"stem", "cell"}, {"幹", "細胞"}}, + {{"幹", "細胞", "移植"}, {"rescue", "transplant"}, {"stem", "cell", "rescue"}}, + {{"rescue"}, {"救命"}}, + {{"blood"}, {"血液"}}}; + const parattice::PaRattice parattice(paradict); + + // lattice generation + const std::vector words = {"造血", "幹", "細胞", "移植"}; + const auto lattice = parattice.get_lattice(words, true, 2); + + // dump a generated lattice + std::ofstream file("paraphrase-lattice.dot"); + if (!file.is_open()) { + return -1; + } + file << lattice.dump_dot(true) << std::endl; + + // serialization & deserialization + const auto bytes = lattice.to_bytes(); + const auto new_lattice = parattice::Lattice::from_bytes(bytes); + + // search + const parattice::LatticeKMP kmp({"幹", "細胞"}); + const auto results = kmp.search(new_lattice); + for (const auto& result : results) { + for (const auto& edge : result) { + std::cout << "(" << edge.first << ", " << edge.second << ") "; + } + std::cout << std::endl; + const auto s = lattice.get_trunk_span(result); + for (const auto& edge : s) { + std::cout << "(" << edge.first << ", " << edge.second << ") "; + } + std::cout << std::endl << "===========" << std::endl; + } + + return 0; +} diff --git a/examples/graph.svg b/examples/graph.svg new file mode 100644 index 0000000..c86097b --- /dev/null +++ b/examples/graph.svg @@ -0,0 +1,569 @@ + + + + + + +%3 + + + +0 +0 + + + +0-0-1 + +blood + + + +0->0-0-1 + + + + +0-1-3 + +blood + + + +0->0-1-3 + + + + +0-2-2 + +hematopoietic + + + +0->0-2-2 + + + + +0-3-3 + +血液 + + + +0->0-3-3 + + + + +0-4-4 + +造血 + + + +0->0-4-4 + + + + +1 +1 + + + +0-0-1->1 + + + + + +1-0-5 + +stem + + + +1->1-0-5 + + + + +3 +3 + + + +0-1-3->3 + + + + + +3-0-6 + +stem + + + +3->3-0-6 + + + + +3-1-7 + + + + + +3->3-1-7 + + + + +2 +2 + + + +0-2-2->2 + + + + + +2-0-5 + +stem + + + +2->2-0-5 + + + + +2-1-6 + +stem + + + +2->2-1-6 + + + + +2-2-10 + + + + + +2->2-2-10 + + + + +0-3-3->3 + + + + + +4 +4 + + + +0-4-4->4 + + + + + +4-0-11 + +rescue + + + +4->4-0-11 + + + + +4-1-8 + +stem + + 
+ +4->4-1-8 + + + + +4-2-9 + + + + + +4->4-2-9 + + + + +4-3-10 + + + + + +4->4-3-10 + + + + +4-4-11 + +救命 + + + +4->4-4-11 + + + + +5 +5 + + + +1-0-5->5 + + + + + +5-0-14 + +cell + + + +5->5-0-14 + + + + +2-0-5->5 + + + + + +6 +6 + + + +2-1-6->6 + + + + + +6-0-12 + +cell + + + +6->6-0-12 + + + + +10 +10 + + + +2-2-10->10 + + + + + +10-0-14 + +細胞 + + + +10->10-0-14 + + + + +3-0-6->6 + + + + + +7 +7 + + + +3-1-7->7 + + + + + +7-0-12 + +細胞 + + + +7->7-0-12 + + + + +11 +11 + + + +4-0-11->11 + + + + + +11-0-15 + +transplant + + + +11->11-0-15 + + + + +8 +8 + + + +4-1-8->8 + + + + + +8-0-12 + +cell + + + +8->8-0-12 + + + + +8-1-13 + +cell + + + +8->8-1-13 + + + + +9 +9 + + + +4-2-9->9 + + + + + +9-0-12 + +細胞 + + + +9->9-0-12 + + + + +9-1-13 + +細胞 + + + +9->9-1-13 + + + + +4-3-10->10 + + + + + +4-4-11->11 + + + + + +14 +14 + + + +5-0-14->14 + + + + + +14-0-15 + +transplantation + + + +14->14-0-15 + + + + +12 +12 + + + +6-0-12->12 + + + + + +12-0-15 + +移植 + + + +12->12-0-15 + + + + +7-0-12->12 + + + + + +8-0-12->12 + + + + + +13 +13 + + + +8-1-13->13 + + + + + +13-0-15 + +rescue + + + +13->13-0-15 + + + + +9-0-12->12 + + + + + +9-1-13->13 + + + + + +10-0-14->14 + + + + + +15 +15 + + + +11-0-15->15 + + + + + +12-0-15->15 + + + + + +13-0-15->15 + + + + + +14-0-15->15 + + + + + diff --git a/examples/rust/Cargo.toml b/examples/rust/Cargo.toml new file mode 100644 index 0000000..0f302dd --- /dev/null +++ b/examples/rust/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "parattice-example" +version = "0.1.0" +authors = ["Koichi Akabe "] +edition = "2018" + +[dependencies] +parattice = { path = "../.." 
} diff --git a/examples/rust/src/main.rs b/examples/rust/src/main.rs new file mode 100644 index 0000000..38551b9 --- /dev/null +++ b/examples/rust/src/main.rs @@ -0,0 +1,53 @@ +use parattice::PaRattice; +use parattice::LatticeKMP; +use parattice::Lattice; + +use std::fs::File; +use std::io::prelude::*; +use std::io; +use std::path::Path; + +fn main() -> Result<(), io::Error> { + // initialization + let paradict + = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], + vec!["hematopoietic", "stem", "cell"]], + vec![vec!["造血", "幹", "細胞", "移植"], + vec!["hematopoietic", "stem", "cell", "transplantation"]], + vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], + vec!["stem", "cell", "rescue"]], + vec![vec!["rescue"], vec!["救命"]], + vec![vec!["blood"], vec!["血液"]]]; + let parattice = PaRattice::new(paradict); + + // lattice generation + let words = vec!["造血", "幹", "細胞", "移植"]; + let lattice = parattice.get_lattice(&words, true, 2); + + // dump a generated lattice + let path = Path::new("paraphrase-lattice.dot"); + let mut file = File::create(&path)?; + file.write_all(lattice.dump_dot(true).as_bytes())?; + + // serialization & deserialization + let bytes = lattice.to_bytes(); + let new_lattice = Lattice::new_from_bytes(&bytes); + + // search + let kmp = LatticeKMP::new(vec!["幹", "細胞"]); + let results = kmp.search(&new_lattice); + for result in &results { + for edge in result { + print!("({}, {}) ", edge.0, edge.1); + } + println!(); + let s = lattice.get_trunk_span(result.clone()); + for edge in &s { + print!("({}, {}) ", edge.0, edge.1); + } + println!("\n==========="); + } + + Ok(()) +} diff --git a/src/externs.rs b/src/externs.rs new file mode 100644 index 0000000..d3f123b --- /dev/null +++ b/src/externs.rs @@ -0,0 +1,193 @@ +use crate::parattice::PaRattice; +use crate::lattice::Lattice; +use crate::lattice_kmp::LatticeKMP; + +use libc::c_char; +use std::ffi::CStr; +use std::ffi::CString; +use std::mem; 
+use std::ptr; +use std::slice; +use std::str; + +#[no_mangle] +pub unsafe extern fn parattice_free_string(s: *mut c_char) { + if !s.is_null() { + CString::from_raw(s); + } +} + +#[no_mangle] +pub unsafe extern fn parattice_free_bytes(bytes: *mut u8, length: usize) { + mem::drop(Vec::from_raw_parts(bytes, length, length)); +} + +#[no_mangle] +pub unsafe extern fn parattice_parattice_new<'a>(dict: *const *const *const *const c_char) -> *mut PaRattice<'a> { + let mut dict_vec = vec![]; + let mut i = 0; + while *dict.offset(i) != ptr::null() { + let group: *const *const *const c_char = *dict.offset(i); + let mut group_vec = vec![]; + let mut j = 0; + while *group.offset(j) != ptr::null() { + let phrase: *const *const c_char = *group.offset(j); + let mut phrase_vec = vec![]; + let mut k = 0; + while *phrase.offset(k) != ptr::null() { + let word = *phrase.offset(k); + let c_str = CStr::from_ptr(word); + phrase_vec.push(str::from_utf8_unchecked(c_str.to_bytes())); + k += 1; + } + group_vec.push(phrase_vec); + j += 1 + } + dict_vec.push(group_vec); + i += 1; + } + Box::into_raw(Box::new(PaRattice::new(dict_vec))) +} + +#[no_mangle] +pub unsafe extern fn parattice_parattice_free(parattice: *mut PaRattice) { + Box::from_raw(parattice); +} + +#[no_mangle] +pub unsafe extern fn parattice_parattice_get_lattice(parattice: *const PaRattice, words: *const *const c_char, length: usize, shrink: bool, max_depth: usize) -> *mut Lattice { + let mut words_vec = Vec::with_capacity(length); + for i in 0..length { + let word = *words.offset(i as isize); + let c_str = CStr::from_ptr(word); + words_vec.push(str::from_utf8_unchecked(c_str.to_bytes())); + } + Box::into_raw(Box::new((*parattice).get_lattice(&words_vec, shrink, max_depth))) +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_new_from_bytes<'a>(data: *const u8, length: usize) -> *mut Lattice<'a> { + Box::into_raw(Box::new(Lattice::new_from_bytes(slice::from_raw_parts(data, length)))) +} + +#[no_mangle] +pub unsafe extern 
fn parattice_lattice_to_bytes(lattice: *const Lattice, length: *mut usize) -> *mut u8 { + let mut bytes = (*lattice).to_bytes(); + *length = bytes.len(); + let ptr = bytes.as_mut_ptr(); + mem::forget(bytes); + ptr +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_free(lattice: *mut Lattice) { + Box::from_raw(lattice); +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_get_size(lattice: *const Lattice) -> usize { + (*lattice).lattice.len() +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_get_required_capacity(lattice: *const Lattice) -> usize { + (*lattice).capacity +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_dump_dot(lattice: *const Lattice, is_numbered: bool) -> *mut c_char { + let s = (*lattice).dump_dot(is_numbered); + let c_string = CString::new(s).unwrap(); + c_string.into_raw() +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_get_trunk_span<'a>(lattice: *const Lattice, edge_labels: *const *const c_char, node_ids: *const usize, length: usize, new_edge_labels: *mut *const u8, new_edge_label_length: *mut usize, new_node_ids: *mut usize) -> usize { + let mut path = Vec::with_capacity(length); + for i in 0..length { + let word = *edge_labels.offset(i as isize); + let c_str = CStr::from_ptr(word); + path.push((str::from_utf8_unchecked(c_str.to_bytes()), *node_ids.offset(i as isize))); + } + let trunk_span = (*lattice).get_trunk_span(path); + for i in 0..trunk_span.len() { + *new_edge_labels.offset(i as isize) = trunk_span[i].0.as_ptr(); + *new_edge_label_length.offset(i as isize) = trunk_span[i].0.len(); + *new_node_ids.offset(i as isize) = trunk_span[i].1; + } + trunk_span.len() +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_get_trunk_spans(lattice: *const Lattice, trunk_lefts: *mut usize, trunk_rights: *mut usize) { + let trunk_spans = (*lattice).get_trunk_spans(); + for (i, (trunk_left, trunk_right)) in trunk_spans.into_iter().enumerate() { + *trunk_lefts.offset(i as isize) = trunk_left; + 
*trunk_rights.offset(i as isize) = trunk_right; + } +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_dump_for_search_index(lattice: *const Lattice, texts: *mut *const u8, text_lengths: *mut usize, offset_starts: *mut usize, offset_ends: *mut usize, increments: *mut usize, lengths: *mut usize) -> usize { + let search_index_nodes = (*lattice).dump_for_search_index(); + for i in 0..search_index_nodes.len() { + *texts.offset(i as isize) = search_index_nodes[i].text.as_ptr(); + *text_lengths.offset(i as isize) = search_index_nodes[i].text.len(); + *offset_starts.offset(i as isize) = search_index_nodes[i].offset.0; + *offset_ends.offset(i as isize) = search_index_nodes[i].offset.1; + *increments.offset(i as isize) = search_index_nodes[i].increment; + *lengths.offset(i as isize) = search_index_nodes[i].length; + } + search_index_nodes.len() +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_kmp_new<'a>(pattern: *const *const c_char, length: usize) -> *mut LatticeKMP<'a> { + let mut pattern_vec = Vec::with_capacity(length); + for i in 0..length { + let word = *pattern.offset(i as isize); + let c_str = CStr::from_ptr(word); + pattern_vec.push(str::from_utf8_unchecked(c_str.to_bytes())); + } + Box::into_raw(Box::new(LatticeKMP::new(pattern_vec))) +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_kmp_free(latticekmp: *mut LatticeKMP) { + Box::from_raw(latticekmp); +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_kmp_search<'a>(latticekmp: *const LatticeKMP<'a>, lattice: *const Lattice<'a>) -> *mut Vec> { + Box::into_raw(Box::new((*latticekmp).search(&(*lattice)))) +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_kmp_free_result<'a>(results: *mut Vec>) { + Box::from_raw(results); +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_kmp_results_size<'a>(results: *const Vec>) -> usize { + (*results).len() +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_kmp_result_length<'a>(results: *const Vec>, index: usize) -> 
usize { + (*results)[index].len() +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_kmp_result_nodes<'a>(results: *const Vec>, index: usize, nodes: *mut usize) { + for i in 0..(*results)[index].len() { + *nodes.offset(i as isize) = (*results)[index][i].1; + } +} + +#[no_mangle] +pub unsafe extern fn parattice_lattice_kmp_result_edge_labels<'a>(results: *const Vec>, index: usize, edge_labels: *mut *const u8, edge_label_length: *mut usize) { + for i in 0..(*results)[index].len() { + *edge_labels.offset(i as isize) = (*results)[index][i].0.as_ptr(); + *edge_label_length.offset(i as isize) = (*results)[index][i].0.len(); + } +} diff --git a/src/lattice.rs b/src/lattice.rs new file mode 100644 index 0000000..d164e9a --- /dev/null +++ b/src/lattice.rs @@ -0,0 +1,325 @@ +use std::collections::BTreeSet; +use std::collections::HashMap; +use std::collections::VecDeque; +use std::str; +use std::usize; + +#[derive(Debug, Eq, PartialEq)] +pub struct LatticeNode<'a> { + pub forwards: BTreeSet<(&'a str, usize)>, + pub backwards: BTreeSet<(&'a str, usize)>, + pub forward_main: Option<(&'a str, usize)>, + pub backward_main: Option<(&'a str, usize)>, + pub depth: usize, +} + +impl<'a> LatticeNode<'a> { + pub fn new>, T2: Into>>( + forward_main: T1, backward_main: T2, depth: usize) -> LatticeNode<'a> { + let mut forwards = BTreeSet::new(); + let mut backwards = BTreeSet::new(); + let forward_main = forward_main.into(); + let backward_main = backward_main.into(); + if let Some(x) = forward_main { + forwards.insert(x); + } + if let Some(x) = backward_main { + backwards.insert(x); + } + LatticeNode { + forwards: forwards, + backwards: backwards, + forward_main: forward_main, + backward_main: backward_main, + depth: depth, + } + } + + pub fn insert_forward(&mut self, edge_str: &'a str, edge_target: usize) { + self.forwards.insert((edge_str, edge_target)); + } + + pub fn insert_backward(&mut self, edge_str: &'a str, edge_target: usize) { + self.backwards.insert((edge_str, 
edge_target)); + } +} + +pub struct SearchIndexNode<'a> { + pub text: &'a str, + pub offset: (usize, usize), + pub increment: usize, + pub length: usize, +} + +#[derive(Debug, Eq, PartialEq)] +pub struct Lattice<'a> { + pub lattice: Vec>, + pub trunk: HashMap, + pub capacity: usize, +} + +fn usize_to_vec(x: usize) -> Vec { + vec![ + x as u8, + (x >> 8) as u8, + (x >> 16) as u8, + (x >> 24) as u8, + (x >> 32) as u8, + (x >> 40) as u8, + (x >> 48) as u8, + (x >> 56) as u8, + ] +} + +fn vec_to_usize(x: &[u8]) -> usize { + x[0] as usize | + (x[1] as usize) << 8 | + (x[2] as usize) << 16 | + (x[3] as usize) << 24 | + (x[4] as usize) << 32 | + (x[5] as usize) << 40 | + (x[6] as usize) << 48 | + (x[7] as usize) << 56 +} + +impl<'a> Lattice<'a> { + /// Returns a lattice from the given binary data. + /// + /// # Arguments + /// + /// * `data` - A byte array + /// + /// # Example + /// + /// ``` + /// let bytes = lattice.to_bytes(); + /// let new_lattice = Lattice::new_from_bytes(&bytes); + /// ``` + pub fn new_from_bytes(data: &'a [u8]) -> Lattice<'a> { + let mut lattice = Vec::with_capacity(vec_to_usize(&data[0..8])); + let mut offset = 8; + while offset < data.len() { + let num_forwards = vec_to_usize(&data[offset..offset+8]); + let num_backwards = vec_to_usize(&data[offset+8..offset+16]); + offset += 16; + let mut forwards = BTreeSet::new(); + let forward_main = + if num_forwards != 0 { + let forward_main_num_chars = vec_to_usize(&data[offset..offset+8]); + let forward_main_edge_string = str::from_utf8( + &data[offset+8..offset+8+forward_main_num_chars]).unwrap(); + let forward_main_edge_target = vec_to_usize( + &data[offset+8+forward_main_num_chars..offset+16+forward_main_num_chars]); + offset += 16 + forward_main_num_chars; + forwards.insert((forward_main_edge_string, forward_main_edge_target)); + for _ in 1..num_forwards { + let forward_num_chars = vec_to_usize(&data[offset..offset+8]); + let forward_edge_string = str::from_utf8( + 
&data[offset+8..offset+8+forward_num_chars]).unwrap(); + let forward_edge_target = vec_to_usize( + &data[offset+8+forward_num_chars..offset+16+forward_num_chars]); + offset += 16 + forward_num_chars; + forwards.insert((forward_edge_string, forward_edge_target)); + } + Some((forward_main_edge_string, forward_main_edge_target)) + } else { + None + }; + let mut backwards = BTreeSet::new(); + let backward_main = + if num_backwards != 0 { + let backward_main_num_chars = vec_to_usize(&data[offset..offset+8]); + let backward_main_edge_string = str::from_utf8( + &data[offset+8..offset+8+backward_main_num_chars]).unwrap(); + let backward_main_edge_target = vec_to_usize( + &data[offset+8+backward_main_num_chars..offset+16+backward_main_num_chars]); + offset += 16 + backward_main_num_chars; + backwards.insert((backward_main_edge_string, backward_main_edge_target)); + for _ in 1..num_backwards { + let backward_num_chars = vec_to_usize(&data[offset..offset+8]); + let backward_edge_string = str::from_utf8( + &data[offset+8..offset+8+backward_num_chars]).unwrap(); + let backward_edge_target = vec_to_usize( + &data[offset+8+backward_num_chars..offset+16+backward_num_chars]); + offset += 16 + backward_num_chars; + backwards.insert((backward_edge_string, backward_edge_target)); + } + Some((backward_main_edge_string, backward_main_edge_target)) + } else { + None + }; + lattice.push(LatticeNode { + forwards: forwards, + backwards: backwards, + forward_main: forward_main, + backward_main: backward_main, + depth: 0, + }); + } + let mut trunk = HashMap::new(); + let mut node_id = 0; + let mut orig_node_id = 0; + trunk.insert(0, 0); + while let Some(x) = lattice[node_id].forward_main { + node_id = x.1; + orig_node_id += 1; + trunk.insert(node_id, orig_node_id); + } + Lattice { + trunk: trunk, + capacity: lattice.iter().fold(0, |sum, x| sum + x.forwards.len()), + lattice: lattice, + } + } + + /// Returns binary data of the lattice. 
+    ///
+    /// The output starts with the number of nodes, followed by one record per
+    /// node: the number of forward edges, the number of backward edges, the
+    /// forward edges and then the backward edges. Each edge is written as the
+    /// label length in bytes, the label bytes and the target node ID. The main
+    /// edge of a direction is always written first; the edges of a direction
+    /// are only written when the node has a main edge for that direction.
+    /// Every integer is encoded with `usize_to_vec`.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// let bytes = lattice.to_bytes();
+    /// let new_lattice = Lattice::new_from_bytes(&bytes);
+    /// ```
+    pub fn to_bytes(&self) -> Vec<u8> {
+        // Appends one edge: label length in bytes, label bytes, target node ID.
+        fn push_edge(buf: &mut Vec<u8>, edge: (&str, usize)) {
+            buf.extend_from_slice(&usize_to_vec(edge.0.len()));
+            buf.extend_from_slice(edge.0.as_bytes());
+            buf.extend_from_slice(&usize_to_vec(edge.1));
+        }
+
+        // Appends the main edge first, then every other edge in set order.
+        // Nothing is written when there is no main edge.
+        fn push_edges(buf: &mut Vec<u8>, main: Option<(&str, usize)>, edges: &BTreeSet<(&str, usize)>) {
+            if let Some(main) = main {
+                push_edge(buf, main);
+                for &edge in edges.iter().filter(|&&edge| edge != main) {
+                    push_edge(buf, edge);
+                }
+            }
+        }
+
+        let mut result = usize_to_vec(self.lattice.len());
+        for node in &self.lattice {
+            result.extend_from_slice(&usize_to_vec(node.forwards.len()));
+            result.extend_from_slice(&usize_to_vec(node.backwards.len()));
+            push_edges(&mut result, node.forward_main, &node.forwards);
+            push_edges(&mut result, node.backward_main, &node.backwards);
+        }
+        result
+    }
+
+    /// Returns graphviz of the lattice.
+    ///
+    /// # Arguments
+    ///
+    /// * `is_numbered` - If true, node numbers are printed.
+    ///
+    /// Every lattice node becomes a graph node named by its index, and every
+    /// forward edge becomes a box labelled with the edge string. The link into
+    /// the box is drawn red when the edge is the node's main forward edge; the
+    /// link out of the box is drawn blue when the edge is the target's main
+    /// backward edge. Double quotes and backslashes in labels are escaped.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// let dot = lattice.dump_dot(true);
+    /// println!("{}", dot);
+    /// ```
+    pub fn dump_dot(&self, is_numbered: bool) -> String {
+        // Escapes backslashes and double quotes so that a label cannot end
+        // the quoted DOT string it is embedded in.
+        fn escape_label(label: &str) -> String {
+            label.replace('\\', "\\\\").replace('"', "\\\"")
+        }
+
+        let mut result = String::from("digraph { graph [rankdir=LR];\n");
+        for (i, node) in self.lattice.iter().enumerate() {
+            if is_numbered {
+                result += &format!("\"{}\" [label=\"{}\",shape=plaintext,width=\"0.1\"];\n", i, i);
+            } else {
+                result += &format!("\"{}\" [label=\"\",shape=circle,width=\"0.1\"];\n", i);
+            }
+            for (j, &edge) in node.forwards.iter().enumerate() {
+                // Every edge gets its own box node named "<source>-<index>-<target>".
+                let edge_node = format!("{}-{}-{}", i, j, edge.1);
+                result += &format!("\"{}\" [label=\"{}\",shape=box];\n", edge_node, escape_label(edge.0));
+                // Comparing against the `Option` avoids a panic when a node
+                // has forward edges but no main forward edge.
+                if node.forward_main == Some(edge) {
+                    result += &format!("\"{}\" -> \"{}\" [arrowhead=none,color=\"#ff0000\"];\n", i, edge_node);
+                } else {
+                    result += &format!("\"{}\" -> \"{}\" [arrowhead=none];\n", i, edge_node);
+                }
+                if self.lattice[edge.1].backward_main == Some((edge.0, i)) {
+                    result += &format!("\"{}\" -> \"{}\" [color=\"#0000ff\"];\n", edge_node, edge.1);
+                } else {
+                    result += &format!("\"{}\" -> \"{}\";\n", edge_node, edge.1);
+                }
+            }
+        }
+        result += "}";
+        result
+    }
+
+    /// Returns a trunk path of the given path.
+    ///
+    /// # Arguments
+    ///
+    /// * `path` - A path of the lattice. (e.g.
a result of LatticeKMP)
+    ///
+    /// The first element of `path` carries the start node ID; every following
+    /// element is an edge label with its target node ID. The path is extended
+    /// backwards along `backward_main` edges and forwards along `forward_main`
+    /// edges until both ends are nodes listed in `self.trunk`. An empty `path`
+    /// yields an empty result.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a node ID is out of range, or if a node outside the trunk has
+    /// no main edge in the direction being followed.
+    pub fn get_trunk_span(&self, path: Vec<(&'a str, usize)>) -> Vec<(&'a str, usize)> {
+        let mut new_path: VecDeque<(&str, usize)> = path.into_iter().collect();
+        // An empty path has no start node to extend from.
+        let mut edge_bw = match new_path.pop_front() {
+            Some(edge) => edge,
+            None => return vec![],
+        };
+        // Walk backwards until the start of the path is a trunk node.
+        while !self.trunk.contains_key(&edge_bw.1) {
+            let next_edge = self.lattice[edge_bw.1]
+                .backward_main
+                .expect("a node outside the trunk must have a main backward edge");
+            new_path.push_front((next_edge.0, edge_bw.1));
+            edge_bw = next_edge;
+        }
+        new_path.push_front(("", edge_bw.1));
+        // Walk forwards until the end of the path is a trunk node.
+        let mut edge_fw = *new_path.back().expect("the start node was just pushed");
+        while !self.trunk.contains_key(&edge_fw.1) {
+            edge_fw = self.lattice[edge_fw.1]
+                .forward_main
+                .expect("a node outside the trunk must have a main forward edge");
+            new_path.push_back(edge_fw);
+        }
+        new_path.into_iter().collect()
+    }
+
+    /// Returns trunk node IDs for each node ID.
+    ///
+    /// The result is indexed by node ID. Each entry holds the `self.trunk`
+    /// values of the trunk nodes recorded to the left and to the right of that
+    /// node; for a trunk node both sides are the node itself. An empty lattice
+    /// yields an empty result.
+    pub fn get_trunk_spans(&self) -> Vec<(usize, usize)> {
+        let len = self.lattice.len();
+        if len == 0 {
+            return vec![];
+        }
+        let last = len - 1;
+        // `0` and `last` double as the "not assigned yet" markers below.
+        let mut left_trunks = vec![0; len];
+        let mut right_trunks = vec![last; len];
+        for &node_id in self.trunk.keys() {
+            left_trunks[node_id] = node_id;
+            right_trunks[node_id] = node_id;
+        }
+        // Propagate the left trunk node along main backward edges.
+        for node_id in 1..last {
+            for edge in &self.lattice[node_id].forwards {
+                if left_trunks[edge.1] == 0
+                    && self.lattice[edge.1].backward_main.map_or(false, |main| main.1 == node_id) {
+                    left_trunks[edge.1] = left_trunks[node_id];
+                }
+            }
+        }
+        // Propagate the right trunk node along main forward edges.
+        for node_id in (1..last).rev() {
+            for edge in &self.lattice[node_id].backwards {
+                if right_trunks[edge.1] == last
+                    && self.lattice[edge.1].forward_main.map_or(false, |main| main.1 == node_id) {
+                    right_trunks[edge.1] = right_trunks[node_id];
+                }
+            }
+        }
+        left_trunks
+            .into_iter()
+            .zip(right_trunks)
+            .map(|(left, right)| (self.trunk[&left], self.trunk[&right]))
+            .collect()
+    }
+
+    /// Returns a vector of SearchIndexNode for the search index such as Elasticsearch.
+    ///
+    /// One entry is produced per forward edge of every node except the last
+    /// one. `text` is the edge label, `offset` combines the left trunk span of
+    /// the source node with the right trunk span of the target node,
+    /// `increment` is 1 for the first edge of a node and 0 for the others, and
+    /// `length` is the difference between the target and source node IDs.
+    pub fn dump_for_search_index(&self) -> Vec<SearchIndexNode> {
+        let trunk_spans = self.get_trunk_spans();
+        let mut result = Vec::with_capacity(self.capacity);
+        // `saturating_sub` keeps an empty lattice from underflowing.
+        let num_sources = self.lattice.len().saturating_sub(1);
+        for (i, node) in self.lattice.iter().enumerate().take(num_sources) {
+            for (j, edge) in node.forwards.iter().enumerate() {
+                result.push(SearchIndexNode {
+                    text: edge.0,
+                    offset: (trunk_spans[i].0, trunk_spans[edge.1].1),
+                    increment: if j == 0 { 1 } else { 0 },
+                    length: edge.1 - i,
+                });
+            }
+        }
+        result
+    }
+}
diff --git a/src/lattice_kmp.rs b/src/lattice_kmp.rs
new file mode 100644
index 0000000..6aea480
--- /dev/null
+++ b/src/lattice_kmp.rs
@@ -0,0 +1,96 @@
+use crate::lattice::Lattice;
+
+use std::collections::HashSet;
+use std::collections::VecDeque;
+
+pub struct LatticeKMP<'a> {
+    pattern: Vec<&'a str>,
+    cpattern: Vec<usize>,
+}
+
+impl<'a> LatticeKMP<'a> {
+    /// Returns LatticeKMP with the given pattern.
+    ///
+    /// The failure table `cpattern` is computed here: `cpattern[i]` is the
+    /// length of the longest proper prefix of `pattern[..=i]` that is also a
+    /// suffix of it.
+    ///
+    /// # Arguments
+    ///
+    /// * `pattern` - A word array
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// let pattern = vec!["幹", "細胞"];
+    /// let kmp = LatticeKMP::new(pattern);
+    /// ```
+    pub fn new(pattern: Vec<&'a str>) -> LatticeKMP<'a> {
+        let mut cpattern = vec![0];
+        for i in 1..pattern.len() {
+            // Fall back through shorter borders until one can be extended.
+            let mut j = cpattern[i - 1];
+            while j > 0 && pattern[j] != pattern[i] {
+                j = cpattern[j - 1];
+            }
+            if pattern[j] == pattern[i] {
+                j += 1;
+            }
+            cpattern.push(j);
+        }
+        LatticeKMP {
+            pattern: pattern,
+            cpattern: cpattern,
+        }
+    }
+
+    /// Returns paths of found patterns.
+    ///
+    /// Each returned path starts with `("", start_node_id)` and continues with
+    /// one `(edge_label, target_node_id)` entry per pattern word. An empty
+    /// pattern yields no paths.
+    ///
+    /// # Arguments
+    ///
+    /// * `lattice` - A lattice
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// let results = kmp.search(&lattice);
+    /// ```
+    pub fn search(&self, lattice: &'a Lattice) -> Vec<Vec<(&'a str, usize)>> {
+        let mut results = vec![];
+        // An empty pattern has no first word to compare against; indexing
+        // `self.pattern[0]` below would panic, so report no matches instead.
+        if self.pattern.is_empty() {
+            return results;
+        }
+        let mut added_candidates = HashSet::new();
+        // `queue` holds (node ID, matched prefix length); `candidates` holds
+        // the partial path of the same entry. Both are pushed and popped in
+        // lockstep.
+        let mut queue = VecDeque::new();
+        let mut candidates = VecDeque::new();
+        queue.push_back((0, 0));
+        let mut initial_candidate = VecDeque::new();
+        initial_candidate.push_back(("", 0));
+        candidates.push_back(initial_candidate);
+        while let Some(item) = queue.pop_front() {
+            let candidate = candidates
+                .pop_front()
+                .expect("queue and candidates are pushed in lockstep");
+            if lattice.lattice[item.0].forward_main.is_none() {
+                continue;
+            }
+            for edge in &lattice.lattice[item.0].forwards {
+                // KMP step: shrink the matched prefix until it can be
+                // extended by this edge, or nothing is left.
+                let mut j = item.1;
+                while j > 0 && edge.0 != self.pattern[j] {
+                    j = self.cpattern[j - 1];
+                }
+                if edge.0 == self.pattern[j] {
+                    j += 1;
+                }
+                // Keep only the last `j` edges of the path, prefixed with the
+                // node at which they start.
+                let mut new_candidate = VecDeque::new();
+                new_candidate.push_back(*edge);
+                let mut k = candidate.len();
+                while new_candidate.len() < j {
+                    k -= 1;
+                    new_candidate.push_front(candidate[k]);
+                }
+                new_candidate.push_front(("", candidate[k - 1].1));
+                if j == self.pattern.len() {
+                    results.push(new_candidate.iter().cloned().collect());
+                    // Continue from the longest border so that overlapping
+                    // occurrences are still found.
+                    j = self.cpattern[j - 1];
+                    while new_candidate.len() > j + 1 {
+                        new_candidate.pop_front();
+                    }
+                }
+                if !added_candidates.contains(&new_candidate) {
+                    added_candidates.insert(new_candidate.clone());
+                    queue.push_back((edge.1, j));
+                    candidates.push_back(new_candidate);
+                }
+            }
+        }
+        results
+    }
+}
diff --git a/src/lattice_searcher.rs b/src/lattice_searcher.rs
new file mode 100644
index 0000000..b3df335
--- /dev/null
+++ b/src/lattice_searcher.rs
@@ -0,0 +1,130 @@
+use crate::lattice::Lattice;
+use crate::lattice::LatticeNode;
+
+use std::collections::HashMap;
+use std::collections::HashSet;
+use std::collections::VecDeque;
+
+pub struct PMANode<'a> {
+    edges: HashMap<&'a str, usize>,
+    fail: usize,
+    matched: Vec<usize>,
+}
+
+pub struct LatticeSearcher<'a> {
+    pma: Vec<PMANode<'a>>,
+    patterns: Vec<Vec<&'a str>>,
+}
+
+impl<'a>
LatticeSearcher<'a> { + pub fn new(patterns: Vec>) -> LatticeSearcher<'a> { + let mut pma = vec![PMANode { + edges: HashMap::new(), + fail: 0, + matched: vec![], + }]; + for (i, phrase) in patterns.iter().enumerate() { + let mut node_id = 0; + for word in phrase { + node_id = if let Some(&node_id_next) = pma[node_id].edges.get(word) { + node_id_next + } else { + let node_id_next = pma.len(); + pma[node_id].edges.insert(word, node_id_next); + pma.push(PMANode { + edges: HashMap::new(), + fail: 0, + matched: vec![], + }); + node_id_next + } + } + pma[node_id].matched.push(i); + } + let mut queue = VecDeque::new(); + for &node_id in pma[0].edges.values() { + queue.push_back(node_id); + } + while let Some(node_id) = queue.pop_front() { + for (c, node_id_next) in pma[node_id].edges.clone().iter() { + queue.push_back(*node_id_next); + let mut node_id_fail = node_id; + while node_id_fail != 0 { + node_id_fail = pma[node_id_fail].fail; + if let Some(&edge_target) = pma[node_id_fail].edges.get(c) { + node_id_fail = edge_target; + break; + } + } + let matched = pma[node_id_fail].matched.clone(); + let mut node = &mut pma[*node_id_next]; + node.fail = node_id_fail; + node.matched.extend(matched); + } + } + LatticeSearcher { + pma: pma, + patterns: patterns, + } + } + + fn next_pma_state_id(pma: &Vec, state_id: usize, edge_str: &str) -> usize { + let mut next_state_id = state_id; + loop { + if let Some(&x) = pma[next_state_id].edges.get(edge_str) { + return x; + } + if next_state_id == 0 { + return 0; + } + next_state_id = pma[next_state_id].fail; + } + } + + fn backward_match(phrase: &Vec<&'a str>, lattice: &'a Vec, pos: usize) -> Vec> { + let mut result = vec![]; + let mut backward_queue = VecDeque::new(); + let mut init_path = VecDeque::new(); + init_path.push_front(("", pos)); + backward_queue.push_back((phrase.len(), pos, init_path)); + while let Some((phrase_pos, lattice_node_id, path)) = backward_queue.pop_front() { + if phrase_pos == 0 { + result.push(path); + } else { 
+ for &(edge_str, edge_target) in lattice[lattice_node_id].backwards.keys() { + if edge_str == phrase[phrase_pos - 1] { + let mut new_path = path.clone(); + new_path.front_mut().unwrap().0 = edge_str; + new_path.push_front(("", edge_target)); + backward_queue.push_back((phrase_pos - 1, edge_target, new_path)); + } + } + } + } + result + } + + pub fn search(&self, lattice: &'a Lattice) -> Vec> { + let lattice = &lattice.lattice; + let mut queue = VecDeque::new(); + queue.push_back((0, 0)); + let mut state_id_cache = HashSet::new(); + let mut result = vec![]; + while let Some((lattice_node_id, pma_state_id)) = queue.pop_front() { + for (lattice_edge_str, lattice_egde_target) in lattice[lattice_node_id].forwards.keys() { + let pma_state_id_new = Self::next_pma_state_id(&self.pma, pma_state_id, lattice_edge_str); + if !state_id_cache.contains(&(*lattice_egde_target, pma_state_id_new)) { + queue.push_back((*lattice_egde_target, pma_state_id_new)); + state_id_cache.insert((*lattice_egde_target, pma_state_id_new)); + } + } + for &phrase_id in &self.pma[pma_state_id].matched { + let phrase = &self.patterns[phrase_id]; + for m in Self::backward_match(phrase, &lattice, lattice_node_id) { + result.push(m.into_iter().collect()); + } + } + } + result + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..fddb203 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,11 @@ +pub mod lattice; +pub mod lattice_kmp; +pub mod parattice; +pub mod utils; + +pub mod externs; + +pub use crate::lattice_kmp::LatticeKMP; +pub use crate::lattice::Lattice; +pub use crate::lattice::SearchIndexNode; +pub use crate::parattice::PaRattice; diff --git a/src/parattice.rs b/src/parattice.rs new file mode 100644 index 0000000..12fb587 --- /dev/null +++ b/src/parattice.rs @@ -0,0 +1,404 @@ +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::collections::HashMap; +use std::collections::HashSet; +use std::collections::VecDeque; +use std::mem; +use std::usize; +use 
std::cmp; + +use crate::lattice::Lattice; +use crate::lattice::LatticeNode; +use crate::utils::get_two_mut_elems; + +pub struct PMANode<'a> { + edges: HashMap<&'a str, usize>, + fail: usize, + matched: Vec, +} + +pub struct PaRattice<'a> { + pma: Vec>, + phrases: Vec<(Vec<&'a str>, usize)>, + dict: Vec>, +} + +impl<'a> PaRattice<'a> { + /// Returns PaRattice with the given paraphrase corpus. + /// + /// # Arguments + /// + /// * `dict` - A paraphrase corpus + /// + /// # Example + /// + /// ``` + /// use parattice::PaRattice; + /// let paradict + /// = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], + /// vec!["hematopoietic", "stem", "cell"]], + /// vec![vec!["造血", "幹", "細胞", "移植"], + /// vec!["hematopoietic", "stem", "cell", "transplantation"]], + /// vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + /// vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], + /// vec!["stem", "cell", "rescue"]], + /// vec![vec!["rescue"], vec!["救命"]], + /// vec![vec!["blood"], vec!["血液"]]]; + /// let parattice = PaRattice::new(paradict); + /// ``` + pub fn new(dict: Vec>>) -> PaRattice<'a> { + let mut pma = vec![PMANode { + edges: HashMap::new(), + fail: 0, + matched: vec![], + }]; + let mut phrases = vec![]; + let mut ids = vec![]; + for (i, group) in dict.iter().enumerate() { + let mut id_group = vec![]; + for phrase in group { + let mut node_id = 0; + for word in phrase { + node_id = if let Some(&node_id_next) = pma[node_id].edges.get(word) { + node_id_next + } else { + let node_id_next = pma.len(); + pma[node_id].edges.insert(word, node_id_next); + pma.push(PMANode { + edges: HashMap::new(), + fail: 0, + matched: vec![], + }); + node_id_next + } + } + pma[node_id].matched.push(phrases.len()); + id_group.push(phrases.len()); + phrases.push((phrase.clone(), i)); + } + ids.push(id_group); + } + let mut queue = VecDeque::new(); + for &node_id in pma[0].edges.values() { + queue.push_back(node_id); + } + while let Some(node_id) = queue.pop_front() { + for (c, 
node_id_next) in pma[node_id].edges.clone().iter() { + queue.push_back(*node_id_next); + let mut node_id_fail = node_id; + while node_id_fail != 0 { + node_id_fail = pma[node_id_fail].fail; + if let Some(&edge_target) = pma[node_id_fail].edges.get(c) { + node_id_fail = edge_target; + break; + } + } + let matched = pma[node_id_fail].matched.clone(); + let mut node = &mut pma[*node_id_next]; + node.fail = node_id_fail; + node.matched.extend(matched); + } + } + PaRattice { + pma: pma, + phrases: phrases, + dict: ids, + } + } + + fn backward_match(phrase: &Vec<&str>, lattice: &Vec, pos: usize, max_depth: usize) -> Vec<(usize, usize)> { + let mut result = vec![]; + let mut backward_queue = VecDeque::new(); + if lattice[pos].depth < max_depth { + backward_queue.push_back((phrase.len(), pos, lattice[pos].depth)); + } + while let Some((phrase_pos, lattice_node_id, depth)) = backward_queue.pop_front() { + if phrase_pos == 0 { + result.push((lattice_node_id, depth)); + } else { + for &(edge_str, edge_target) in &lattice[lattice_node_id].backwards { + if edge_str == phrase[phrase_pos - 1] && lattice[edge_target].depth < max_depth { + backward_queue.push_back((phrase_pos - 1, edge_target, cmp::max(depth, lattice[edge_target].depth))); + } + } + } + } + result + } + + fn next_pma_state_id(pma: &Vec, state_id: usize, edge_str: &str) -> usize { + let mut next_state_id = state_id; + loop { + if let Some(&x) = pma[next_state_id].edges.get(edge_str) { + return x; + } + if next_state_id == 0 { + return 0; + } + next_state_id = pma[next_state_id].fail; + } + } + + fn insert_branch(lattice: &mut Vec>, state_id_cache: &mut Vec>, phrase: &Vec<&'a str>, start_node_id: usize, end_node_id: usize, depth: usize) -> usize { + let new_node_id = lattice.len(); + assert!(lattice.len() >= 1); + match phrase.len() { + 1 => { + lattice[start_node_id].insert_forward(&phrase[0], end_node_id); + lattice[end_node_id].insert_backward(&phrase[0], start_node_id); + end_node_id + }, + 2 => { + 
lattice[start_node_id].insert_forward(&phrase[0], new_node_id); + lattice.push(LatticeNode::new((phrase[1].clone(), end_node_id), (phrase[0].clone(), start_node_id), depth)); + state_id_cache.push(BTreeSet::new()); + lattice[end_node_id].insert_backward(&phrase[1], new_node_id); + new_node_id + }, + 3 => { + lattice[start_node_id].insert_forward(&phrase[0], new_node_id); + lattice.push(LatticeNode::new((phrase[1].clone(), new_node_id + 1), (phrase[0].clone(), start_node_id), depth)); + state_id_cache.push(BTreeSet::new()); + lattice.push(LatticeNode::new((phrase[2].clone(), end_node_id), (phrase[1].clone(), new_node_id), depth)); + state_id_cache.push(BTreeSet::new()); + lattice[end_node_id].insert_backward(&phrase[2], new_node_id + 1); + new_node_id + }, + _ => { + lattice[start_node_id].insert_forward(&phrase[0], new_node_id); + lattice.push(LatticeNode::new((phrase[1].clone(), new_node_id + 1), (phrase[0].clone(), start_node_id), depth)); + state_id_cache.push(BTreeSet::new()); + for i in 0..phrase.len() - 3 { + lattice.push(LatticeNode::new((phrase[i + 2].clone(), new_node_id + i + 2), (phrase[i + 1].clone(), new_node_id + i), depth)); + state_id_cache.push(BTreeSet::new()); + } + lattice.push(LatticeNode::new((phrase[phrase.len() - 1].clone(), end_node_id), (phrase[phrase.len() - 2].clone(), new_node_id + phrase.len() - 3), depth)); + state_id_cache.push(BTreeSet::new()); + lattice[end_node_id].insert_backward(&phrase[phrase.len() - 1], new_node_id + phrase.len() - 2); + new_node_id + }, + } + } + + /// Returns a recursive paraphrase lattice of the given sentence. 
+ /// + /// # Arguments + /// + /// * `words` - A sentence + /// * `shrink` - If true, duplicated edges are shrinked + /// * `max_depth` - A number of recursion + /// + /// # Example + /// + /// ``` + /// let words = vec!["造血", "幹", "細胞", "移植"]; + /// let lattice = parattice.get_lattice(&words, true, 2); + /// ``` + pub fn get_lattice(&self, words: &Vec<&'a str>, shrink: bool, max_depth: usize) -> Lattice<'a> { + let mut inserted_branches = HashSet::new(); + // generate initial lattice + let mut lattice = vec![]; + let mut state_id_cache = vec![]; + if words.is_empty() { + lattice.push(LatticeNode::new(None, None, 0)); + state_id_cache.push(BTreeSet::new()); + } else { + lattice.push(LatticeNode::new((words[0].clone(), 1), None, 0)); + state_id_cache.push(BTreeSet::new()); + for node_id in 1..words.len() { + lattice.push(LatticeNode::new((words[node_id].clone(), node_id + 1), (words[node_id - 1].clone(), node_id - 1), 0)); + state_id_cache.push(BTreeSet::new()); + } + lattice.push(LatticeNode::new(None, (words[words.len() - 1].clone(), words.len() - 1), 0)); + state_id_cache.push(BTreeSet::new()); + } + // search phrases + let mut queue = VecDeque::new(); + queue.push_back((0, 0)); + state_id_cache[0].insert(0); + while let Some((lattice_node_id, pma_state_id)) = queue.pop_front() { + for (lattice_edge_str, lattice_egde_target) in lattice[lattice_node_id].forwards.iter() { + let pma_state_id_new = Self::next_pma_state_id(&self.pma, pma_state_id, lattice_edge_str); + if !state_id_cache[*lattice_egde_target].contains(&pma_state_id_new) { + // queue next node + queue.push_back((*lattice_egde_target, pma_state_id_new)); + state_id_cache[*lattice_egde_target].insert(pma_state_id_new); + } + } + for &phrase_id in &self.pma[pma_state_id].matched { + let (phrase, group_id) = &self.phrases[phrase_id]; + let trunk_end = Self::main_branch_fw(&lattice, lattice_node_id, words.len()); + for (branch_start, depth) in Self::backward_match(phrase, &lattice, lattice_node_id, 
max_depth) { + let trunk_start = Self::main_branch_bw(&lattice, branch_start, words.len()); + if inserted_branches.contains(&(group_id, trunk_start, trunk_end)) { + continue; + } + inserted_branches.insert((group_id, trunk_start, trunk_end)); + for ¶phrase_id in &self.dict[*group_id] { + if paraphrase_id == phrase_id { + continue; + } + let paraphrase = &self.phrases[paraphrase_id].0; + let inserted_first_node_id = Self::insert_branch(&mut lattice, &mut state_id_cache, paraphrase, branch_start, lattice_node_id, depth + 1); + let (state_id_cache_current, state_id_cache_next) = get_two_mut_elems(&mut state_id_cache, branch_start, inserted_first_node_id); + for pma_state_id_cached in state_id_cache_current.iter() { + let pma_state_id_new = Self::next_pma_state_id(&self.pma, *pma_state_id_cached, ¶phrase[0]); + if !state_id_cache_next.contains(&pma_state_id_new) { + // queue added node + queue.push_back((inserted_first_node_id, pma_state_id_new)); + state_id_cache_next.insert(pma_state_id_new); + } + } + } + } + } + } + if shrink { + PaRattice::shrink_lattice(&mut lattice); + } + let new_lattice = PaRattice::index_left_to_right(&lattice); + let mut trunk = HashMap::new(); + let mut node_id = 0; + let mut orig_node_id = 0; + trunk.insert(0, 0); + while let Some((_, x)) = &new_lattice[node_id].forward_main { + node_id = *x; + orig_node_id += 1; + trunk.insert(node_id, orig_node_id); + } + Lattice { + trunk: trunk, + capacity: new_lattice.iter().fold(0, |sum, x| sum + x.forwards.len()), + lattice: new_lattice, + } + } + + fn main_branch_bw(g: &Vec, begin: usize, eos: usize) -> usize { + let mut b = begin; + while b > eos { + b = g[b].backward_main.unwrap().1; + } + b + } + + fn main_branch_fw(g: &Vec, end: usize, eos: usize) -> usize { + let mut e = end; + while e > eos { + e = g[e].forward_main.unwrap().1; + } + e + } + + fn shrink_lattice(lattice: &mut Vec) { + let mut updated_node_bw: BTreeSet = (0..lattice.len()).collect(); + let mut updated_node_fw: BTreeSet = 
(0..lattice.len()).collect(); + while !updated_node_bw.is_empty() || !updated_node_fw.is_empty() { + let mut backward_map = BTreeMap::new(); + for &i in &updated_node_bw { + if !lattice[i].backwards.is_empty() { + backward_map.entry(lattice[i].backwards.clone()).or_insert(vec![]).push(i); + } + } + updated_node_bw.clear(); + for nodes in backward_map.values() { + if nodes.len() >= 2 { + for i in 1..nodes.len() { + let backward_tmp = mem::replace(&mut lattice[nodes[i]].backwards, BTreeSet::new()); + for (edge_str, prev_node_id) in backward_tmp { + lattice[prev_node_id].forwards.remove(&(edge_str, nodes[i])); + if lattice[prev_node_id].forward_main == Some((edge_str, nodes[i])) { + lattice[prev_node_id].forward_main = Some((edge_str, nodes[0])); + } + } + let forward_tmp = mem::replace(&mut lattice[nodes[i]].forwards, BTreeSet::new()); + for (edge_str, next_node_id) in forward_tmp { + lattice[next_node_id].backwards.remove(&(edge_str, nodes[i])); + lattice[next_node_id].backwards.insert((edge_str, nodes[0])); + if lattice[next_node_id].backward_main == Some((edge_str, nodes[i])) { + lattice[next_node_id].backward_main = Some((edge_str, nodes[0])); + } + lattice[nodes[0]].forwards.insert((edge_str, next_node_id)); + } + } + updated_node_bw.insert(nodes[0]); + for &(_, i) in &lattice[nodes[0]].forwards { + updated_node_bw.insert(i); + } + } + } + let mut forward_map = BTreeMap::new(); + for &i in &updated_node_fw { + if !lattice[i].forwards.is_empty() { + forward_map.entry(lattice[i].forwards.clone()).or_insert(vec![]).push(i); + } + } + updated_node_fw.clear(); + for nodes in forward_map.values() { + if nodes.len() >= 2 { + for i in 1..nodes.len() { + let forward_tmp = mem::replace(&mut lattice[nodes[i]].forwards, BTreeSet::new()); + for (edge_str, next_node_id) in forward_tmp { + lattice[next_node_id].backwards.remove(&(edge_str, nodes[i])); + if lattice[next_node_id].backward_main == Some((edge_str, nodes[i])) { + lattice[next_node_id].backward_main = 
Some((edge_str, nodes[0])); + } + } + let backward_tmp = mem::replace(&mut lattice[nodes[i]].backwards, BTreeSet::new()); + for (edge_str, prev_node_id) in backward_tmp { + lattice[prev_node_id].forwards.remove(&(edge_str, nodes[i])); + lattice[prev_node_id].forwards.insert((edge_str, nodes[0])); + if lattice[prev_node_id].forward_main == Some((edge_str, nodes[i])) { + lattice[prev_node_id].forward_main = Some((edge_str, nodes[0])); + } + lattice[nodes[0]].backwards.insert((edge_str, prev_node_id)); + } + } + updated_node_fw.insert(nodes[0]); + for &(_, i) in &lattice[nodes[0]].backwards { + updated_node_fw.insert(i); + } + } + } + } + } + + fn index_left_to_right(lattice: &Vec>) -> Vec> { + let mut node_id_map = vec![0; lattice.len()]; + let mut node_id_map_rev = Vec::with_capacity(lattice.len()); + let mut queue = VecDeque::new(); + let mut backward_counter = vec![0; lattice.len()]; + queue.push_back(0); + while let Some(node_id) = queue.pop_front() { + node_id_map[node_id] = node_id_map_rev.len(); + node_id_map_rev.push(node_id); + for &(_, edge_target) in &lattice[node_id].forwards { + backward_counter[edge_target] += 1; + if backward_counter[edge_target] == lattice[edge_target].backwards.len() { + queue.push_back(edge_target); + } + } + } + let mut new_lattice = Vec::with_capacity(node_id_map_rev.len()); + for &node_id in &node_id_map_rev { + let mut new_forwards = BTreeSet::new(); + let mut new_backwards = BTreeSet::new(); + for &(s, next_node_id) in &lattice[node_id].forwards { + new_forwards.insert((s, node_id_map[next_node_id])); + } + for &(s, prev_node_id) in &lattice[node_id].backwards { + new_backwards.insert((s, node_id_map[prev_node_id])); + } + let forward_main = lattice[node_id].forward_main.map(|(x, i)| (x, node_id_map[i])); + let backward_main = lattice[node_id].backward_main.map(|(x, i)| (x, node_id_map[i])); + new_lattice.push( + LatticeNode { + forwards: new_forwards, + backwards: new_backwards, + forward_main: forward_main, + backward_main: 
backward_main,
+                    depth: 0,
+                });
+        }
+        new_lattice
+    }
+}
diff --git a/src/utils.rs b/src/utils.rs
new file mode 100644
index 0000000..4567cc0
--- /dev/null
+++ b/src/utils.rs
@@ -0,0 +1,19 @@
+/// Returns mutable references to two distinct elements of `x`.
+///
+/// # Panics
+///
+/// Panics if `i == j`, or if `i` or `j` is out of bounds.
+pub fn get_two_mut_elems<'a, T>(x: &'a mut Vec<T>, i: usize, j: usize) -> (&'a mut T, &'a mut T) {
+    assert!(i != j);
+    let len = x.len();
+    assert!(i < len);
+    assert!(j < len);
+    // Split at the larger index so each reference comes from a disjoint half.
+    if i < j {
+        let (head, tail) = x.split_at_mut(j);
+        (&mut head[i], &mut tail[0])
+    } else {
+        let (head, tail) = x.split_at_mut(i);
+        (&mut tail[0], &mut head[j])
+    }
+}
diff --git a/tests/lattice_kmp.rs b/tests/lattice_kmp.rs
new file mode 100644
index 0000000..ba4df2f
--- /dev/null
+++ b/tests/lattice_kmp.rs
@@ -0,0 +1,31 @@
+extern crate parattice;
+
+use parattice::LatticeKMP;
+use parattice::PaRattice;
+
+#[test]
+fn lattice_kmp_test() {
+    let paradict
+        = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], vec!["hematopoietic", "stem", "cell"]],
+               vec![vec!["造血", "幹", "細胞", "移植"], vec!["hematopoietic", "stem", "cell", "transplantation"]],
+               vec![vec!["stem", "cell"], vec!["幹", "細胞"]],
+               vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], vec!["stem", "cell", "rescue"]],
+               vec![vec!["rescue"], vec!["救命"]],
+               vec![vec!["blood"], vec!["血液"]]];
+    let parattice = PaRattice::new(paradict);
+    let words = vec!["造血", "幹", "細胞", "移植"];
+    let lattice = parattice.get_lattice(&words, true, 10);
+    let pattern = vec!["幹", "細胞"];
+    let kmp = LatticeKMP::new(pattern);
+    let mut results = kmp.search(&lattice);
+    results.sort();
+    let expected = vec![
+        vec![("", 1), ("幹", 10), ("細胞", 15)],
+        vec![("", 2), ("幹", 7), ("細胞", 13)],
+        vec![("", 3), ("幹", 9), ("細胞", 13)],
+        vec![("", 3), ("幹", 9), ("細胞", 14)],
+        vec![("", 3), ("幹", 10), ("細胞", 15)],
+    ];
+    assert_eq!(expected, results);
+
+}
diff --git a/tests/parattice.rs b/tests/parattice.rs
new file mode 100644
index 0000000..b688630
--- /dev/null
+++ b/tests/parattice.rs
@@ -0,0 +1,107 @@
+extern crate parattice;
+
+use parattice::Lattice;
+use parattice::PaRattice;
+use parattice::SearchIndexNode;
+
+#[test] +fn dump_for_search_index_test() { + let paradict + = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], vec!["hematopoietic", "stem", "cell"]], + vec![vec!["造血", "幹", "細胞", "移植"], vec!["hematopoietic", "stem", "cell", "transplantation"]], + vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], vec!["stem", "cell", "rescue"]], + vec![vec!["rescue"], vec!["救命"]], + vec![vec!["blood"], vec!["血液"]]]; + let parattice = PaRattice::new(paradict); + let words = vec!["造血", "幹", "細胞", "移植"]; + let lattice = parattice.get_lattice(&words, true, 10); + let index_data = lattice.dump_for_search_index(); + let mut expected = vec![ + ("造血", 0, 3, 0, 1), ("blood", 0, 2, 0, 3), ("血液", 0, 2, 0, 3), ("hematopoietic", 0, 1, 0, 3), + ("stem", 1, 4, 0, 3), ("stem", 1, 6, 0, 4), ("幹", 1, 10, 0, 4), + ("rescue", 2, 11, 0, 4), ("幹", 2, 7, 0, 3), ("stem", 2, 5, 0, 3), ("stem", 2, 6, 0, 4), + ("stem", 3, 8, 1, 3), ("幹", 3, 9, 1, 2), ("幹", 3, 10, 1, 4), ("救命", 3, 11, 1, 4), ("rescue", 3, 11, 1, 4), + ("cell", 4, 13, 0, 3), ("cell", 5, 12, 0, 4), ("cell", 5, 13, 0, 3), ("cell", 6, 15, 0, 4), ("細胞", 7, 13, 0, 3), + ("cell", 8, 13, 1, 3), ("cell", 8, 14, 1, 4), ("細胞", 9, 13, 2, 3), ("細胞", 9, 14, 2, 4), ("細胞", 10, 15, 1, 4), + ("transplant", 11, 16, 1, 4), ("rescue", 12, 16, 0, 4), ("救命", 12, 16, 0, 4), ("移植", 13, 16, 3, 4), + ("rescue", 14, 16, 1, 4), ("transplantation", 15, 16, 0, 4), + ]; + expected.sort(); + assert_eq!(expected, search_index_relative_to_absolute(&index_data)); +} + +#[test] +fn serialize_test() { + let paradict + = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], vec!["hematopoietic", "stem", "cell"]], + vec![vec!["造血", "幹", "細胞", "移植"], vec!["hematopoietic", "stem", "cell", "transplantation"]], + vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], vec!["stem", "cell", "rescue"]], + vec![vec!["rescue"], vec!["救命"]], + vec![vec!["blood"], 
vec!["血液"]]]; + let parattice = PaRattice::new(paradict); + let words = vec!["造血", "幹", "細胞", "移植"]; + let lattice = parattice.get_lattice(&words, true, 10); + let bytes = lattice.to_bytes(); + let lattice_from_bytes = Lattice::new_from_bytes(&bytes); + assert_eq!(lattice, lattice_from_bytes); +} + +fn search_index_relative_to_absolute<'a>(data: &'a Vec) -> Vec<(&'a str, usize, usize, usize, usize)> { + let mut new_data = vec![]; + let mut node_id = 0; + for node in data { + node_id += node.increment; + new_data.push((node.text, node_id - 1, node_id + node.length - 1, node.offset.0, node.offset.1)); + } + new_data.sort(); + new_data +} + +#[test] +fn get_trunk_span_test() { + let paradict + = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], vec!["hematopoietic", "stem", "cell"]], + vec![vec!["造血", "幹", "細胞", "移植"], vec!["hematopoietic", "stem", "cell", "transplantation"]], + vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], vec!["stem", "cell", "rescue"]], + vec![vec!["rescue"], vec!["救命"]], + vec![vec!["blood"], vec!["血液"]]]; + let parattice = PaRattice::new(paradict); + let words = vec!["造血", "幹", "細胞", "移植"]; + let lattice = parattice.get_lattice(&words, true, 10); + assert_eq!( + vec![("", 0), ("hematopoietic", 1), ("stem", 4), ("cell", 13)], + lattice.get_trunk_span(vec![("", 1), ("stem", 4)])); + assert_eq!( + vec![("", 3), ("rescue", 11), ("transplant", 16)], + lattice.get_trunk_span(vec![("", 3), ("rescue", 11)])); + assert_eq!( + vec![("", 0), ("hematopoietic", 1), ("stem", 6), ("cell", 15), ("transplantation", 16)], + lattice.get_trunk_span(vec![("", 15), ("transplantation", 16)])); +} + +#[test] +fn max_depth_test() { + let paradict + = vec![vec![vec!["blood", "stem", "cell"], vec!["造血", "幹", "細胞"], vec!["hematopoietic", "stem", "cell"]], + vec![vec!["造血", "幹", "細胞", "移植"], vec!["hematopoietic", "stem", "cell", "transplantation"]], + vec![vec!["stem", "cell"], vec!["幹", "細胞"]], + 
vec![vec!["幹", "細胞", "移植"], vec!["rescue", "transplant"], vec!["stem", "cell", "rescue"]], + vec![vec!["rescue"], vec!["救命"]], + vec![vec!["blood"], vec!["血液"]]]; + let parattice = PaRattice::new(paradict); + let words = vec!["造血", "幹", "細胞", "移植"]; + let lattice = parattice.get_lattice(&words, true, 1); + let index_data = lattice.dump_for_search_index(); + let mut expected = vec![ + ("造血", 0, 3, 0, 1), ("blood", 0, 1, 0, 3), ("hematopoietic", 0, 2, 0, 3), + ("rescue", 3, 6, 1, 4), ("stem", 3, 7, 1, 3), ("幹", 3, 8, 1, 2), ("stem", 1, 4, 0, 3), + ("stem", 2, 4, 0, 3), ("stem", 2, 5, 0, 4), ("cell", 7, 10, 1, 4), ("cell", 7, 11, 1, 3), + ("細胞", 8, 11, 2, 3), ("cell", 4, 11, 0, 3), ("cell", 5, 9, 0, 4), ("transplant", 6, 12, 1, 4), + ("rescue", 10, 12, 1, 4), ("移植", 11, 12, 3, 4), ("transplantation", 9, 12, 0, 4), + ]; + expected.sort(); + assert_eq!(expected, search_index_relative_to_absolute(&index_data)); +}