Skip to content

Commit

Permalink
Initial work on implementing the reinterpret codec
Browse files Browse the repository at this point in the history
  • Loading branch information
juntyr committed Aug 8, 2024
1 parent 8d6f0af commit f174a5c
Show file tree
Hide file tree
Showing 9 changed files with 278 additions and 19 deletions.
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ members = [

"codecs/bit-round",
"codecs/identity",
"codecs/reinterpret",
"codecs/round",
"codecs/uniform-noise",
"codecs/zlib",
Expand All @@ -27,6 +28,7 @@ numcodecs-python = { version = "0.2", path = "crates/numcodecs-python", default-
# workspace-internal codecs crates
numcodecs-bit-round = { version = "0.1", path = "codecs/bit-round", default-features = false }
numcodecs-identity = { version = "0.1", path = "codecs/identity", default-features = false }
numcodecs-reinterpret = { version = "0.1", path = "codecs/reinterpret", default-features = false }
numcodecs-round = { version = "0.1", path = "codecs/round", default-features = false }
numcodecs-uniform-noise = { version = "0.1", path = "codecs/uniform-noise", default-features = false }
numcodecs-zlib = { version = "0.1", path = "codecs/zlib", default-features = false }
Expand Down
2 changes: 1 addition & 1 deletion codecs/identity/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
//! [Rust Doc Main]: https://img.shields.io/badge/docs-main-blue
//! [docs]: https://juntyr.github.io/numcodecs-rs/numcodecs-identity
//!
//! Bit rounding codec implementation for the [`numcodecs`] API.
//! Identity codec implementation for the [`numcodecs`] API.
use ndarray::{ArrayViewD, ArrayViewMutD};
use numcodecs::{
Expand Down
23 changes: 23 additions & 0 deletions codecs/reinterpret/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[package]
name = "numcodecs-reinterpret"
version = "0.1.0"
edition = { workspace = true }
authors = { workspace = true }
repository = { workspace = true }
license = { workspace = true }
rust-version = { workspace = true }

description = "Binary reinterpret codec implementation for the numcodecs API"
readme = "README.md"
categories = ["compression", "encoding"]
keywords = ["reinterpret", "numcodecs", "compression", "encoding"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
numcodecs = { workspace = true }
serde = { workspace = true, features = ["std", "derive"] }
thiserror = { workspace = true }

[lints]
workspace = true
1 change: 1 addition & 0 deletions codecs/reinterpret/LICENSE
32 changes: 32 additions & 0 deletions codecs/reinterpret/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[![CI Status]][workflow] [![MSRV]][repo] [![Latest Version]][crates.io] [![Rust Doc Crate]][docs.rs] [![Rust Doc Main]][docs]

[CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/numcodecs-rs/ci.yml?branch=main
[workflow]: https://github.com/juntyr/numcodecs-rs/actions/workflows/ci.yml?query=branch%3Amain

[MSRV]: https://img.shields.io/badge/MSRV-1.64.0-blue
[repo]: https://github.com/juntyr/numcodecs-rs

[Latest Version]: https://img.shields.io/crates/v/numcodecs-reinterpret
[crates.io]: https://crates.io/crates/numcodecs-reinterpret

[Rust Doc Crate]: https://img.shields.io/docsrs/numcodecs-reinterpret
[docs.rs]: https://docs.rs/numcodecs-reinterpret/

[Rust Doc Main]: https://img.shields.io/badge/docs-main-blue
[docs]: https://juntyr.github.io/numcodecs-rs/numcodecs-reinterpret

# numcodecs-reinterpret

Binary reinterpret codec implementation for the [`numcodecs`] API.

[`numcodecs`]: https://docs.rs/numcodecs/0.1/numcodecs/

## License

Licensed under the Mozilla Public License, Version 2.0 ([LICENSE](LICENSE) or https://www.mozilla.org/en-US/MPL/2.0/).

## Funding

The `numcodecs-reinterpret` crate has been developed as part of [ESiWACE3](https://www.esiwace.eu), the third phase of the Centre of Excellence in Simulation of Weather and Climate in Europe.

Funded by the European Union. This work has received funding from the European High Performance Computing Joint Undertaking (JU) under grant agreement No 101093054.
178 changes: 178 additions & 0 deletions codecs/reinterpret/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
//! [![CI Status]][workflow] [![MSRV]][repo] [![Latest Version]][crates.io] [![Rust Doc Crate]][docs.rs] [![Rust Doc Main]][docs]
//!
//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/numcodecs-rs/ci.yml?branch=main
//! [workflow]: https://github.com/juntyr/numcodecs-rs/actions/workflows/ci.yml?query=branch%3Amain
//!
//! [MSRV]: https://img.shields.io/badge/MSRV-1.64.0-blue
//! [repo]: https://github.com/juntyr/numcodecs-rs
//!
//! [Latest Version]: https://img.shields.io/crates/v/numcodecs-reinterpret
//! [crates.io]: https://crates.io/crates/numcodecs-reinterpret
//!
//! [Rust Doc Crate]: https://img.shields.io/docsrs/numcodecs-reinterpret
//! [docs.rs]: https://docs.rs/numcodecs-reinterpret/
//!
//! [Rust Doc Main]: https://img.shields.io/badge/docs-main-blue
//! [docs]: https://juntyr.github.io/numcodecs-rs/numcodecs-reinterpret
//!
//! Binary reinterpret codec implementation for the [`numcodecs`] API.
use numcodecs::{
AnyArray, AnyArrayDType, AnyArrayView, AnyArrayViewMut, AnyCowArray, Codec, StaticCodec,
};
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use thiserror::Error;

/// Codec to reinterpret data between different compatible types.
///
/// Note that no conversion happens, only the meaning of the bits changes.
///
/// The codec stores the dtype that data is reinterpreted as on encoding and
/// the dtype it is reinterpreted back to on decoding. Use
/// [`ReinterpretCodec::try_new`] to construct a codec from an arbitrary pair
/// of dtypes; it rejects pairs that are not compatible.
#[derive(Clone, Debug)]
pub struct ReinterpretCodec {
    /// Dtype that the data is reinterpreted as on encoding.
    encode_dtype: AnyArrayDType,
    /// Dtype that the encoded data is reinterpreted back to on decoding.
    decode_dtype: AnyArrayDType,
}

impl ReinterpretCodec {
    /// Try to build a [`ReinterpretCodec`] whose encoding step reinterprets
    /// data of `decode_dtype` as `encode_dtype`, and whose decoding step
    /// reinterprets `encode_dtype` data back as `decode_dtype`.
    ///
    /// A pair of dtypes is accepted if
    /// - both dtypes are equal, or
    /// - `encode_dtype` is [`AnyArrayDType::U8`], or
    /// - `decode_dtype` is a signed integer or floating point dtype and
    ///   `encode_dtype` is the unsigned integer dtype of the same size.
    ///
    /// Returns `Some(_)` if the pair is accepted, `None` otherwise.
    #[must_use]
    pub fn try_new(encode_dtype: AnyArrayDType, decode_dtype: AnyArrayDType) -> Option<Self> {
        // performing no conversion always works
        let is_identity = decode_dtype == encode_dtype;

        let is_reinterpretable = matches!(
            (decode_dtype, encode_dtype),
            // converting to bytes always works
            (_, AnyArrayDType::U8)
                // converting from signed / floating to same-size binary always works
                | (AnyArrayDType::I16, AnyArrayDType::U16)
                | (AnyArrayDType::I32 | AnyArrayDType::F32, AnyArrayDType::U32)
                | (AnyArrayDType::I64 | AnyArrayDType::F64, AnyArrayDType::U64)
        );

        if !(is_identity || is_reinterpretable) {
            return None;
        }

        Some(Self {
            encode_dtype,
            decode_dtype,
        })
    }

    /// Create a [`ReinterpretCodec`] that keeps `dtype` unchanged, i.e. that
    /// uses `dtype` as both its encoding and its decoding dtype.
    #[must_use]
    pub const fn passthrough(dtype: AnyArrayDType) -> Self {
        let (encode_dtype, decode_dtype) = (dtype, dtype);

        Self {
            encode_dtype,
            decode_dtype,
        }
    }

    /// Create a [`ReinterpretCodec`] that reinterprets `dtype` as
    /// [bytes][`AnyArrayDType::U8`] on encoding.
    #[must_use]
    pub const fn to_bytes(dtype: AnyArrayDType) -> Self {
        let encode_dtype = AnyArrayDType::U8;

        Self {
            encode_dtype,
            decode_dtype: dtype,
        }
    }

    /// Create a [`ReinterpretCodec`] that reinterprets `dtype` as its
    /// [binary][`AnyArrayDType::to_binary`] equivalent on encoding.
    #[must_use]
    pub const fn to_binary(dtype: AnyArrayDType) -> Self {
        let encode_dtype = dtype.to_binary();

        Self {
            encode_dtype,
            decode_dtype: dtype,
        }
    }
}

impl Codec for ReinterpretCodec {
type Error = ReinterpretCodecError;

fn encode(&self, _data: AnyCowArray) -> Result<AnyArray, Self::Error> {
todo!()
}

fn decode(&self, _encoded: AnyCowArray) -> Result<AnyArray, Self::Error> {
todo!()
}

fn decode_into(
&self,
_encoded: AnyArrayView,
_decoded: AnyArrayViewMut,
) -> Result<(), Self::Error> {
todo!()
}

fn get_config<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.serialize(serializer)
}
}

impl StaticCodec for ReinterpretCodec {
const CODEC_ID: &'static str = "reinterpret";

fn from_config<'de, D: Deserializer<'de>>(config: D) -> Result<Self, D::Error> {
Self::deserialize(config)
}
}

impl Serialize for ReinterpretCodec {
    /// Serialize the codec by copying its dtypes into a
    /// [`ReinterpretCodecConfig`] and serializing that config.
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        let Self {
            encode_dtype,
            decode_dtype,
        } = *self;

        let config = ReinterpretCodecConfig {
            encode_dtype,
            decode_dtype,
        };

        config.serialize(serializer)
    }
}

impl<'de> Deserialize<'de> for ReinterpretCodec {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
let config = ReinterpretCodecConfig::deserialize(deserializer)?;

#[allow(clippy::option_if_let_else)]
match Self::try_new(config.encode_dtype, config.decode_dtype) {
Some(codec) => Ok(codec),
None => Err(serde::de::Error::custom(format!(
"reinterpreting {} as {} is not allowed",
config.decode_dtype, config.encode_dtype,
))),
}
}
}

/// Plain serialization proxy for [`ReinterpretCodec`].
///
/// The [`Serialize`] and [`Deserialize`] implementations of
/// [`ReinterpretCodec`] go through this struct. It is renamed to
/// `ReinterpretCodec` for serde, and performs no validation of the dtypes
/// itself.
#[derive(Clone, Serialize, Deserialize)]
#[serde(rename = "ReinterpretCodec")]
struct ReinterpretCodecConfig {
/// Dtype that the data is reinterpreted as on encoding.
encode_dtype: AnyArrayDType,
/// Dtype that the encoded data is reinterpreted back to on decoding.
decode_dtype: AnyArrayDType,
}

#[derive(Debug, Error)]
/// Errors that may occur when applying the [`ReinterpretCodec`].
///
/// Both variants describe a mismatch between the decoded data and a
/// caller-provided array that the data should be decoded into.
pub enum ReinterpretCodecError {
/// [`ReinterpretCodec`] cannot decode the `decoded` dtype into the `provided`
/// array, since their dtypes differ
#[error("Reinterpret cannot decode the dtype {decoded} into the provided {provided} array")]
MismatchedDecodeIntoDtype {
/// Dtype of the `decoded` data
decoded: AnyArrayDType,
/// Dtype of the `provided` array into which the data is to be decoded
provided: AnyArrayDType,
},
/// [`ReinterpretCodec`] cannot decode the decoded array into the provided
/// array of a different shape
#[error("Reinterpret cannot decode the decoded array of shape {decoded:?} into the provided array of shape {provided:?}")]
MismatchedDecodeIntoShape {
/// Shape of the `decoded` data
decoded: Vec<usize>,
/// Shape of the `provided` array into which the data is to be decoded
provided: Vec<usize>,
},
}
25 changes: 13 additions & 12 deletions codecs/zlib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,11 @@ impl Codec for ZlibCodec {
fn decode(&self, encoded: AnyCowArray) -> Result<AnyArray, Self::Error> {
let encoded = match encoded {
AnyCowArray::U8(encoded) => encoded,
encoded => return Err(ZlibCodecError::EncodedDataNotBytes {
dtype: encoded.dtype(),
}),
encoded => {
return Err(ZlibCodecError::EncodedDataNotBytes {
dtype: encoded.dtype(),
})
}
};

if !matches!(encoded.shape(), [_]) {
Expand All @@ -84,9 +86,11 @@ impl Codec for ZlibCodec {
) -> Result<(), Self::Error> {
let encoded = match encoded {
AnyArrayView::U8(encoded) => encoded,
encoded => return Err(ZlibCodecError::EncodedDataNotBytes {
dtype: encoded.dtype(),
}),
encoded => {
return Err(ZlibCodecError::EncodedDataNotBytes {
dtype: encoded.dtype(),
})
}
};

if !matches!(encoded.shape(), [_]) {
Expand Down Expand Up @@ -221,13 +225,10 @@ pub fn compress(array: AnyArrayView, level: ZlibLevel) -> Result<Vec<u8>, ZlibCo
encoded.resize(encoded.len() + (data.len() / 2).max(2), 0);

loop {
let (Some(data_left), Some(encoded_left)) =
(data.get(in_pos..), encoded.get_mut(out_pos..))
else {
let (data_left, encoded_left) = match (data.get(in_pos..), encoded.get_mut(out_pos..)) {
(Some(data_left), Some(encoded_left)) => (data_left, encoded_left),
#[allow(clippy::panic)] // this would be a bug and cannot be user-caused
{
panic!("Zlib encode bug: input or output is out of bounds");
}
_ => panic!("Zlib encode bug: input or output is out of bounds"),
};

let (status, bytes_in, bytes_out) = miniz_oxide::deflate::core::compress(
Expand Down
16 changes: 10 additions & 6 deletions codecs/zstd/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,11 @@ impl Codec for ZstdCodec {
fn decode(&self, encoded: AnyCowArray) -> Result<AnyArray, Self::Error> {
let encoded = match encoded {
AnyCowArray::U8(encoded) => encoded,
encoded => return Err(ZstdCodecError::EncodedDataNotBytes {
dtype: encoded.dtype(),
}),
encoded => {
return Err(ZstdCodecError::EncodedDataNotBytes {
dtype: encoded.dtype(),
})
}
};

if !matches!(encoded.shape(), [_]) {
Expand All @@ -68,9 +70,11 @@ impl Codec for ZstdCodec {
) -> Result<(), Self::Error> {
let encoded = match encoded {
AnyArrayView::U8(encoded) => encoded,
encoded => return Err(ZstdCodecError::EncodedDataNotBytes {
dtype: encoded.dtype(),
}),
encoded => {
return Err(ZstdCodecError::EncodedDataNotBytes {
dtype: encoded.dtype(),
})
}
};

if !matches!(encoded.shape(), [_]) {
Expand Down
18 changes: 18 additions & 0 deletions crates/numcodecs/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,24 @@ pub enum AnyArrayDType {
F64,
}

impl AnyArrayDType {
    /// Convert the dtype to its (unsigned) binary equivalent, i.e. the
    /// unsigned integer dtype of the same size.
    ///
    /// Unsigned integer dtypes are returned unchanged.
    ///
    /// ```rust
    /// # use numcodecs::AnyArrayDType;
    /// assert_eq!(AnyArrayDType::I32.to_binary(), AnyArrayDType::U32);
    /// assert_eq!(AnyArrayDType::F32.to_binary(), AnyArrayDType::U32);
    /// ```
    #[must_use]
    pub const fn to_binary(self) -> Self {
        match self {
            Self::U8 | Self::I8 => Self::U8,
            Self::U16 | Self::I16 => Self::U16,
            Self::U32 | Self::I32 | Self::F32 => Self::U32,
            Self::U64 | Self::I64 | Self::F64 => Self::U64,
        }
    }
}

impl fmt::Display for AnyArrayDType {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
fmt.write_str(match self {
Expand Down

0 comments on commit f174a5c

Please sign in to comment.