Skip to content

Commit 5908414

Browse files
PSeitz and fulmicoton authored
use optional index in multivalued index (#2439)
* use optional index in multivalued index. For mostly empty multivalued indices, there was a large overhead during creation when iterating over all docids. This is alleviated by placing an optional index in the multivalued index to mark documents that have values. There's some performance overhead when accessing values in a multivalued index. The accessing cost is now optional index + multivalue index. The sparse codec performs relatively poorly with the binary_search when accessing data. This is reflected in the benchmarks below. This changes the format of columnar to v2, but code is added to handle the v1 format. ``` Running benches/bench_access.rs (/home/pascal/Development/tantivy/optional_multivalues/target/release/deps/bench_access-ea323c028db88db4) multi sparse 1/13 access_values_for_doc Avg: 42.8946ms (+241.80%) Median: 42.8869ms (+244.10%) [42.7484ms .. 43.1074ms] access_first_vals Avg: 42.8022ms (+421.93%) Median: 42.7553ms (+439.84%) [42.6794ms .. 43.7404ms] multi 2x access_values_for_doc Avg: 31.1244ms (+24.17%) Median: 30.8339ms (+23.46%) [30.7192ms .. 33.6059ms] access_first_vals Avg: 24.3070ms (+70.92%) Median: 24.0966ms (+70.18%) [23.9328ms .. 26.4851ms] sparse 1/13 access_values_for_doc Avg: 42.2490ms (+0.61%) Median: 42.2346ms (+2.28%) [41.8988ms .. 43.7821ms] access_first_vals Avg: 43.6272ms (+0.23%) Median: 43.6197ms (+1.78%) [43.4920ms .. 43.9009ms] dense 1/12 access_values_for_doc Avg: 8.6184ms (+23.18%) Median: 8.6126ms (+23.78%) [8.5843ms .. 8.7527ms] access_first_vals Avg: 6.8112ms (+4.47%) Median: 6.8002ms (+4.55%) [6.7887ms .. 6.8991ms] full access_values_for_doc Avg: 9.4073ms (-5.09%) Median: 9.4023ms (-2.23%) [9.3694ms .. 9.4568ms] access_first_vals Avg: 4.9531ms (+6.24%) Median: 4.9502ms (+7.85%) [4.9423ms .. 
4.9718ms] ``` ``` Running benches/bench_merge.rs (/home/pascal/Development/tantivy/optional_multivalues/target/release/deps/bench_merge-475697dfceb3639f) merge_multi 2x_and_multi 2x Avg: 20.2280ms (+34.33%) Median: 20.1829ms (+35.33%) [19.9933ms .. 20.8806ms] merge_multi sparse 1/13_and_multi sparse 1/13 Avg: 0.8961ms (-78.04%) Median: 0.8943ms (-77.61%) [0.8899ms .. 0.9272ms] merge_dense 1/12_and_dense 1/12 Avg: 0.6619ms (-1.26%) Median: 0.6616ms (+2.20%) [0.6473ms .. 0.6837ms] merge_sparse 1/13_and_sparse 1/13 Avg: 0.5508ms (-0.85%) Median: 0.5508ms (+2.80%) [0.5420ms .. 0.5634ms] merge_sparse 1/13_and_dense 1/12 Avg: 0.6046ms (-4.64%) Median: 0.6038ms (+2.80%) [0.5939ms .. 0.6296ms] merge_multi sparse 1/13_and_dense 1/12 Avg: 0.9111ms (-83.48%) Median: 0.9063ms (-83.50%) [0.9047ms .. 0.9663ms] merge_multi sparse 1/13_and_sparse 1/13 Avg: 0.8451ms (-89.49%) Median: 0.8428ms (-89.43%) [0.8411ms .. 0.8563ms] merge_multi 2x_and_dense 1/12 Avg: 10.6624ms (-4.82%) Median: 10.6568ms (-4.49%) [10.5738ms .. 10.8353ms] merge_multi 2x_and_sparse 1/13 Avg: 10.6336ms (-22.95%) Median: 10.5925ms (-22.33%) [10.5149ms .. 11.5657ms] ``` * Update columnar/src/columnar/format_version.rs Co-authored-by: Paul Masurel <[email protected]> * Update columnar/src/column_index/mod.rs Co-authored-by: Paul Masurel <[email protected]> --------- Co-authored-by: Paul Masurel <[email protected]>
1 parent 511b027 commit 5908414

28 files changed

+1007
-366
lines changed

columnar/benches/bench_access.rs

+6-67
Original file line number | Diff line number | Diff line change
@@ -1,73 +1,13 @@
1-
use core::fmt;
2-
use std::fmt::{Display, Formatter};
3-
41
use binggan::{black_box, InputGroup};
5-
use tantivy_columnar::*;
2+
use common::*;
3+
use tantivy_columnar::Column;
64

7-
pub enum Card {
8-
MultiSparse,
9-
Multi,
10-
Sparse,
11-
Dense,
12-
Full,
13-
}
14-
impl Display for Card {
15-
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
16-
match self {
17-
Card::MultiSparse => write!(f, "multi sparse 1/13"),
18-
Card::Multi => write!(f, "multi 2x"),
19-
Card::Sparse => write!(f, "sparse 1/13"),
20-
Card::Dense => write!(f, "dense 1/12"),
21-
Card::Full => write!(f, "full"),
22-
}
23-
}
24-
}
5+
pub mod common;
256

267
const NUM_DOCS: u32 = 2_000_000;
278

28-
pub fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader {
29-
use tantivy_columnar::ColumnarWriter;
30-
31-
let mut columnar_writer = ColumnarWriter::default();
32-
33-
match card {
34-
Card::MultiSparse => {
35-
columnar_writer.record_numerical(0, "price", 10u64);
36-
columnar_writer.record_numerical(0, "price", 10u64);
37-
}
38-
_ => {}
39-
}
40-
41-
for i in 0..num_docs {
42-
match card {
43-
Card::MultiSparse | Card::Sparse => {
44-
if i % 13 == 0 {
45-
columnar_writer.record_numerical(i, "price", i as u64);
46-
}
47-
}
48-
Card::Dense => {
49-
if i % 12 == 0 {
50-
columnar_writer.record_numerical(i, "price", i as u64);
51-
}
52-
}
53-
Card::Full => {
54-
columnar_writer.record_numerical(i, "price", i as u64);
55-
}
56-
Card::Multi => {
57-
columnar_writer.record_numerical(i, "price", i as u64);
58-
columnar_writer.record_numerical(i, "price", i as u64);
59-
}
60-
}
61-
}
62-
63-
let mut wrt: Vec<u8> = Vec::new();
64-
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
65-
let reader = ColumnarReader::open(wrt).unwrap();
66-
reader
67-
}
68-
699
pub fn generate_columnar_and_open(card: Card, num_docs: u32) -> Column {
70-
let reader = generate_columnar(card, num_docs);
10+
let reader = generate_columnar_with_name(card, num_docs, "price");
7111
reader.read_columns("price").unwrap()[0]
7212
.open_u64_lenient()
7313
.unwrap()
@@ -116,9 +56,8 @@ fn bench_group(mut runner: InputGroup<Column>) {
11656

11757
column.first_vals(&docs, &mut buffer);
11858
for val in buffer.iter() {
119-
if let Some(val) = val {
120-
sum += *val;
121-
}
59+
let Some(val) = val else { continue };
60+
sum += *val;
12261
}
12362
}
12463

columnar/benches/bench_merge.rs

+4-4
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
1-
mod bench_access;
1+
pub mod common;
22

3-
use bench_access::{generate_columnar, Card};
43
use binggan::{black_box, BenchRunner};
4+
use common::{generate_columnar_with_name, Card};
55
use tantivy_columnar::*;
66

77
const NUM_DOCS: u32 = 100_000;
@@ -13,8 +13,8 @@ fn main() {
1313
inputs.push((
1414
format!("merge_{card1}_and_{card2}"),
1515
vec![
16-
generate_columnar(card1, NUM_DOCS),
17-
generate_columnar(card2, NUM_DOCS),
16+
generate_columnar_with_name(card1, NUM_DOCS, "price"),
17+
generate_columnar_with_name(card2, NUM_DOCS, "price"),
1818
],
1919
));
2020
};

columnar/benches/common.rs

+59
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,59 @@
1+
extern crate tantivy_columnar;
2+
3+
use core::fmt;
4+
use std::fmt::{Display, Formatter};
5+
6+
use tantivy_columnar::{ColumnarReader, ColumnarWriter};
7+
8+
pub enum Card {
9+
MultiSparse,
10+
Multi,
11+
Sparse,
12+
Dense,
13+
Full,
14+
}
15+
impl Display for Card {
16+
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
17+
match self {
18+
Card::MultiSparse => write!(f, "multi sparse 1/13"),
19+
Card::Multi => write!(f, "multi 2x"),
20+
Card::Sparse => write!(f, "sparse 1/13"),
21+
Card::Dense => write!(f, "dense 1/12"),
22+
Card::Full => write!(f, "full"),
23+
}
24+
}
25+
}
26+
pub fn generate_columnar_with_name(card: Card, num_docs: u32, column_name: &str) -> ColumnarReader {
27+
let mut columnar_writer = ColumnarWriter::default();
28+
29+
if let Card::MultiSparse = card {
30+
columnar_writer.record_numerical(0, column_name, 10u64);
31+
columnar_writer.record_numerical(0, column_name, 10u64);
32+
}
33+
34+
for i in 0..num_docs {
35+
match card {
36+
Card::MultiSparse | Card::Sparse => {
37+
if i % 13 == 0 {
38+
columnar_writer.record_numerical(i, column_name, i as u64);
39+
}
40+
}
41+
Card::Dense => {
42+
if i % 12 == 0 {
43+
columnar_writer.record_numerical(i, column_name, i as u64);
44+
}
45+
}
46+
Card::Full => {
47+
columnar_writer.record_numerical(i, column_name, i as u64);
48+
}
49+
Card::Multi => {
50+
columnar_writer.record_numerical(i, column_name, i as u64);
51+
columnar_writer.record_numerical(i, column_name, i as u64);
52+
}
53+
}
54+
}
55+
56+
let mut wrt: Vec<u8> = Vec::new();
57+
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
58+
ColumnarReader::open(wrt).unwrap()
59+
}
-338 Bytes
Binary file not shown.
41 KB
Binary file not shown.

columnar/src/column/mod.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -136,7 +136,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
136136
.map(|value_row_id: RowId| self.values.get_val(value_row_id))
137137
}
138138

139-
/// Get the docids of values which are in the provided value range.
139+
/// Get the docids of values which are in the provided value and docid range.
140140
#[inline]
141141
pub fn get_docids_for_value_range(
142142
&self,

columnar/src/column/serialize.rs

+17-10
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ use crate::column_values::{
1212
CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
1313
};
1414
use crate::iterable::Iterable;
15-
use crate::StrColumn;
15+
use crate::{StrColumn, Version};
1616

1717
pub fn serialize_column_mappable_to_u128<T: MonotonicallyMappableToU128>(
1818
column_index: SerializableColumnIndex<'_>,
@@ -40,7 +40,10 @@ pub fn serialize_column_mappable_to_u64<T: MonotonicallyMappableToU64>(
4040
Ok(())
4141
}
4242

43-
pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Column<T>> {
43+
pub fn open_column_u64<T: MonotonicallyMappableToU64>(
44+
bytes: OwnedBytes,
45+
format_version: Version,
46+
) -> io::Result<Column<T>> {
4447
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
4548
let column_index_num_bytes = u32::from_le_bytes(
4649
column_index_num_bytes_payload
@@ -49,7 +52,7 @@ pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::
4952
.unwrap(),
5053
);
5154
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
52-
let column_index = crate::column_index::open_column_index(column_index_data)?;
55+
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
5356
let column_values = load_u64_based_column_values(column_values_data)?;
5457
Ok(Column {
5558
index: column_index,
@@ -59,6 +62,7 @@ pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::
5962

6063
pub fn open_column_u128<T: MonotonicallyMappableToU128>(
6164
bytes: OwnedBytes,
65+
format_version: Version,
6266
) -> io::Result<Column<T>> {
6367
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
6468
let column_index_num_bytes = u32::from_le_bytes(
@@ -68,7 +72,7 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
6872
.unwrap(),
6973
);
7074
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
71-
let column_index = crate::column_index::open_column_index(column_index_data)?;
75+
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
7276
let column_values = crate::column_values::open_u128_mapped(column_values_data)?;
7377
Ok(Column {
7478
index: column_index,
@@ -79,7 +83,10 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
7983
/// Open the column as u64.
8084
///
8185
/// See [`open_u128_as_compact_u64`] for more details.
82-
pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u64>> {
86+
pub fn open_column_u128_as_compact_u64(
87+
bytes: OwnedBytes,
88+
format_version: Version,
89+
) -> io::Result<Column<u64>> {
8390
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
8491
let column_index_num_bytes = u32::from_le_bytes(
8592
column_index_num_bytes_payload
@@ -88,27 +95,27 @@ pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u
8895
.unwrap(),
8996
);
9097
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
91-
let column_index = crate::column_index::open_column_index(column_index_data)?;
98+
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
9299
let column_values = crate::column_values::open_u128_as_compact_u64(column_values_data)?;
93100
Ok(Column {
94101
index: column_index,
95102
values: column_values,
96103
})
97104
}
98105

99-
pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> {
106+
pub fn open_column_bytes(data: OwnedBytes, format_version: Version) -> io::Result<BytesColumn> {
100107
let (body, dictionary_len_bytes) = data.rsplit(4);
101108
let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());
102109
let (dictionary_bytes, column_bytes) = body.split(dictionary_len as usize);
103110
let dictionary = Arc::new(Dictionary::from_bytes(dictionary_bytes)?);
104-
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes)?;
111+
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes, format_version)?;
105112
Ok(BytesColumn {
106113
dictionary,
107114
term_ord_column,
108115
})
109116
}
110117

111-
pub fn open_column_str(data: OwnedBytes) -> io::Result<StrColumn> {
112-
let bytes_column = open_column_bytes(data)?;
118+
pub fn open_column_str(data: OwnedBytes, format_version: Version) -> io::Result<StrColumn> {
119+
let bytes_column = open_column_bytes(data, format_version)?;
113120
Ok(StrColumn::wrap(bytes_column))
114121
}

columnar/src/column_index/merge/mod.rs

+16-3
Original file line number | Diff line number | Diff line change
@@ -95,8 +95,12 @@ pub fn merge_column_index<'a>(
9595

9696
#[cfg(test)]
9797
mod tests {
98+
use common::OwnedBytes;
99+
98100
use crate::column_index::merge::detect_cardinality;
99-
use crate::column_index::multivalued_index::MultiValueIndex;
101+
use crate::column_index::multivalued_index::{
102+
open_multivalued_index, serialize_multivalued_index, MultiValueIndex,
103+
};
100104
use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
101105
use crate::{
102106
Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
@@ -171,7 +175,11 @@ mod tests {
171175
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
172176
panic!("Excpected a multivalued index")
173177
};
174-
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
178+
let mut output = Vec::new();
179+
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
180+
let multivalue =
181+
open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
182+
let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
175183
assert_eq!(&start_indexes, &[0, 3, 5]);
176184
}
177185

@@ -200,11 +208,16 @@ mod tests {
200208
],
201209
)
202210
.into();
211+
203212
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
204213
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
205214
panic!("Excpected a multivalued index")
206215
};
207-
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
216+
let mut output = Vec::new();
217+
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
218+
let multivalue =
219+
open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
220+
let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
208221
assert_eq!(&start_indexes, &[0, 3, 5, 6]);
209222
}
210223
}

0 commit comments

Comments
 (0)