Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Audio Features - DO NOT MERGE] PoC for adding an offset+sliced reading to audio file. #7312

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 53 additions & 2 deletions src/datasets/features/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,8 +243,8 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray]) -> pa.Str
path_array = pa.array([None] * len(storage), type=pa.string())
storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null())
return array_cast(storage, self.pa_type)

def embed_storage(self, storage: pa.StructArray) -> pa.StructArray:
def embed_storage_offset(self, start, dur, storage) -> pa.StructArray:
    """Embed audio files into the Arrow array, optionally keeping only a slice of each file.

    For every row whose start offset is non-negative, only the requested segment
    is read and embedded as an in-memory WAV file. For the other rows (negative
    or null offset) the existing bytes are kept, or the whole file is read from
    its path when no bytes are present.

    Args:
        start:
            Per-row start offsets, one per row of `storage`. Must expose
            `to_pylist()` (e.g. `pa.Array` or `pa.ChunkedArray`). Offsets are
            multiplied by the file's sampling rate to get a frame index, so they
            are presumably expressed in seconds. A negative or null offset
            means "do not slice this row".
        dur:
            Per-row segment durations, in the same unit as `start`. A null
            duration reads until the end of the file.
        storage (`pa.StructArray`):
            PyArrow array to embed, with "bytes" and "path" fields.

    Returns:
        `pa.StructArray`: Array in the Audio arrow storage type, that is
            `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
    """
    import io

    import soundfile as sf

    def read_file_bytes(path):
        # Whole-file embedding: copy the encoded file as-is.
        with xopen(path, "rb") as f:
            return f.read()

    def read_segment_as_wav(source, offset, duration):
        # Use the file's own sampling rate instead of assuming 16 kHz, so the
        # offset/duration -> frame conversion is right for any input file.
        with sf.SoundFile(source) as audio:
            sampling_rate = audio.samplerate
            audio.seek(int(offset * sampling_rate))
            frames = -1 if duration is None else int(duration * sampling_rate)
            samples = audio.read(frames=frames)
        # Re-encode the slice as a WAV file: the "bytes" field has to hold a
        # decodable audio file, not a raw dump of the sample buffer.
        buffer = io.BytesIO()
        sf.write(buffer, samples, sampling_rate, format="wav")
        return buffer.getvalue()

    def embed_row(row, offset, duration):
        # Null struct rows stay null.
        if row is None:
            return None
        path, data = row["path"], row["bytes"]
        if offset is not None and offset >= 0:
            if path is not None:
                with xopen(path, "rb") as f:
                    return read_segment_as_wav(f, offset, duration)
            if data is not None:
                # No path to read from: slice the already-embedded bytes.
                return read_segment_as_wav(io.BytesIO(data), offset, duration)
            return None
        if data is not None:
            return data
        return read_file_bytes(path) if path is not None else None

    rows = storage.to_pylist()
    bytes_array = pa.array(
        [embed_row(row, offset, duration) for row, offset, duration in zip(rows, start.to_pylist(), dur.to_pylist())],
        type=pa.binary(),
    )
    path_array = pa.array(
        [os.path.basename(row["path"]) if row is not None and row["path"] is not None else None for row in rows],
        type=pa.string(),
    )
    storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null())
    return array_cast(storage, self.pa_type)

def embed_storage(self, storage: pa.StructArray) -> pa.StructArray:
"""Embed audio files into the Arrow array.

Args:
storage (`pa.StructArray`):
PyArrow array to embed.

Returns:
`pa.StructArray`: Array in the Audio arrow storage type, that is
`pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
"""
@no_op_if_value_is_null
def path_to_bytes(path):
with xopen(path, "rb") as f:
bytes_ = f.read()
return bytes_

bytes_array = pa.array(
[
(path_to_bytes(x["path"]) if x["bytes"] is None else x["bytes"]) if x is not None else None
Expand Down
48 changes: 43 additions & 5 deletions src/datasets/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2107,6 +2107,34 @@ def cast_array_to_feature(
)
raise TypeError(f"Couldn't cast array of type\n{_short_str(array.type)}\nto\n{_short_str(feature)}")

def embed_array_storage_audio(array_start: pa.Array, array_dur: pa.Array, array_wav: pa.Array, feature: "FeatureType"):
    """Embed (optionally sliced) audio data into an array's storage.

    Delegates to the feature's `embed_storage_offset` method, forwarding the
    start and duration arrays together with the audio storage array.

    Args:
        array_start (`pa.Array`):
            Start values, forwarded unchanged to `feature.embed_storage_offset`.
        array_dur (`pa.Array`):
            Duration values, forwarded unchanged to `feature.embed_storage_offset`.
        array_wav (`pa.Array`):
            The PyArrow array in which to embed data.
        feature (`datasets.features.FeatureType`):
            Array features. Must define an `embed_storage_offset` method.

    Raises:
        `TypeError`: if `feature` does not define `embed_storage_offset`.

    Returns:
        The value returned by `feature.embed_storage_offset`.
    """
    if hasattr(feature, "embed_storage_offset"):
        return feature.embed_storage_offset(array_start, array_dur, array_wav)
    # Report the type of the audio array; this function has no parameter named `array`.
    raise TypeError(f"Couldn't embed array of type\n{_short_str(array_wav.type)}\nwith\n{_short_str(feature)}")



@_wrap_for_chunked_arrays
def embed_array_storage(array: pa.Array, feature: "FeatureType"):
Expand Down Expand Up @@ -2265,12 +2293,22 @@ def embed_table_storage(table: pa.Table):
table (`pyarrow.Table`): the table with embedded data
"""
from .features.features import Features, require_storage_embed

from .features import Audio

features = Features.from_arrow_schema(table.schema)
arrays = [
embed_array_storage(table[name], feature) if require_storage_embed(feature) else table[name]
for name, feature in features.items()
]
arrays = []
for name, feature in features.items():
if require_storage_embed(feature):
if isinstance(feature, Audio):
arrays.append(embed_array_storage_audio(table['start'], table['duration'], table[name], feature))
else:
arrays.append(embed_array_storage(table[name], feature))
else:
arrays.append(table[name])
# arrays = [
# embed_array_storage(table[name], feature) if require_storage_embed(feature) else table[name]
# for name, feature in features.items()
# ]
return pa.Table.from_arrays(arrays, schema=features.arrow_schema)


Expand Down