Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Debug #34

Merged
merged 5 commits into from
Feb 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ __pycache__
build
dist
output
*temp.py
*temp.py
*.wav
gradio_cached_examples
2 changes: 1 addition & 1 deletion audioldm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .ldm import LatentDiffusion
from .utils import seed_everything, save_wave, get_time
from .utils import seed_everything, save_wave, get_time, get_duration
from .pipeline import *

import os
Expand Down
2 changes: 1 addition & 1 deletion audioldm/audio/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def pad_wav(waveform, segment_length):
temp_wav[:, :waveform_length] = waveform
return temp_wav


def normalize_wav(waveform):
waveform = waveform - np.mean(waveform)
waveform = waveform / (np.max(np.abs(waveform)) + 1e-8)
Expand All @@ -66,6 +65,7 @@ def wav_to_fbank(filename, target_length=1024, fn_STFT=None):

# mixup
waveform = read_wav_file(filename, target_length * 160) # hop size is 160

waveform = waveform[0, ...]
waveform = torch.FloatTensor(waveform)

Expand Down
26 changes: 22 additions & 4 deletions audioldm/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from tqdm import tqdm, trange

from audioldm import LatentDiffusion, seed_everything
from audioldm.utils import default_audioldm_config
from audioldm.utils import default_audioldm_config, get_duration, get_bit_depth
from audioldm.audio import wav_to_fbank, TacotronSTFT, read_wav_file
from audioldm.latent_diffusion.ddim import DDIMSampler
from einops import repeat
Expand Down Expand Up @@ -49,6 +49,8 @@ def make_batch_for_text_to_audio(text, waveform=None, fbank=None, batchsize=1):
)
return batch

def round_up_duration(duration):
    """Round *duration* up to the nearest multiple of 2.5 seconds.

    The latent diffusion model generates audio in segments quantized to
    2.5 s, so arbitrary user-supplied durations are snapped upward.

    Unlike the previous ``round(...) + 1`` formula, an exact multiple
    (e.g. 5.0) is returned unchanged instead of being bumped a whole
    segment (5.0 -> 7.5), and values just past a segment boundary no
    longer overshoot by 2.5 s (3.8 -> 5.0, not 7.5).

    Returns a float that is always at least 2.5 so a zero/near-zero
    request still yields one generatable segment.
    """
    import math  # local import keeps this fix self-contained

    return max(2.5, math.ceil(duration / 2.5) * 2.5)

def build_model(
ckpt_path=os.path.join(CACHE_DIR, "audioldm-s-full.ckpt"),
Expand Down Expand Up @@ -109,7 +111,7 @@ def text_to_audio(
config=None,
):
seed_everything(int(seed))

duration = round_up_duration(duration)
waveform = None
if(original_audio_file_path is not None):
waveform = read_wav_file(original_audio_file_path, int(duration * 102.4) * 160)
Expand Down Expand Up @@ -152,6 +154,23 @@ def style_transfer(
else:
device = torch.device("cpu")

assert original_audio_file_path is not None, "You need to provide the original audio file path"

audio_file_duration = get_duration(original_audio_file_path)

assert get_bit_depth(original_audio_file_path) == 16, "The bit depth of the original audio file %s must be 16" % original_audio_file_path

if(duration > 20):
print("Warning: The duration of the audio file %s must be less than 20 seconds. Longer duration will result in Nan in model output (we are still debugging that); Automatically set duration to 20 seconds")
duration = 20

if(duration >= audio_file_duration):
print("Warning: Duration you specified %s-seconds must equal or smaller than the audio file duration %ss" % (duration, audio_file_duration))
duration = round_up_duration(audio_file_duration)
print("Set new duration as %s-seconds" % duration)

# duration = round_up_duration(duration)

latent_diffusion = set_cond_text(latent_diffusion)

if config is not None:
Expand All @@ -161,7 +180,7 @@ def style_transfer(
config = default_audioldm_config()

seed_everything(int(seed))
latent_diffusion.latent_t_size = duration_to_latent_t_size(duration)
# latent_diffusion.latent_t_size = duration_to_latent_t_size(duration)
latent_diffusion.cond_stage_model.embed_mode = "text"

fn_STFT = TacotronSTFT(
Expand Down Expand Up @@ -213,7 +232,6 @@ def style_transfer(
)

x_samples = latent_diffusion.decode_first_stage(samples)

waveform = latent_diffusion.first_stage_model.decode_to_waveform(
x_samples
)
Expand Down
17 changes: 13 additions & 4 deletions audioldm/utils.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
import importlib

from inspect import isfunction

import os
import soundfile as sf
import time


import wave

def get_duration(fname):
    """Return the duration of the WAV file *fname* in seconds.

    Duration is computed as frame count divided by sample rate. Only
    formats readable by the stdlib ``wave`` module (PCM WAV) are
    supported; ``wave.Error`` is raised for anything else.
    """
    # Bug fix: the original wrapped wave.open in contextlib.closing but
    # never imported contextlib, raising NameError on every call.
    # wave.open has been a context manager since Python 3.4, so no
    # wrapper is needed at all.
    with wave.open(fname, "rb") as f:
        return f.getnframes() / float(f.getframerate())

def get_bit_depth(fname):
    """Return the bit depth (bits per sample) of the WAV file *fname*.

    E.g. 16 for standard 16-bit PCM. Only formats readable by the
    stdlib ``wave`` module are supported.
    """
    # Bug fix: the original used contextlib.closing without importing
    # contextlib (NameError). wave.open is itself a context manager
    # since Python 3.4, so use it directly.
    with wave.open(fname, "rb") as f:
        return f.getsampwidth() * 8  # sample width is in bytes

def get_time():
    """Return the current local time formatted as ``dd_mm_YYYY_HH_MM_SS``."""
    return time.strftime("%d_%m_%Y_%H_%M_%S", time.localtime())


def seed_everything(seed):
import random, os
import numpy as np
Expand Down
2 changes: 1 addition & 1 deletion bin/audioldm
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/python3
import os
from audioldm import text_to_audio, style_transfer, build_model, save_wave, get_time
from audioldm import text_to_audio, style_transfer, build_model, save_wave, get_time, round_up_duration, get_duration
import argparse

CACHE_DIR = os.getenv(
Expand Down
2 changes: 2 additions & 0 deletions scripts/test.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Generation
audioldm --file_path trumpet.wav
audioldm --file_path trumpet.wav -dur 25
audioldm --file_path trumpet.wav -dur 2.5
audioldm --text "A hammer is hitting a wooden surface"
audioldm

Expand Down
2 changes: 1 addition & 1 deletion scripts/text2sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@
waveform = text_to_audio(
audioldm,
text,
random_seed,
seed=random_seed,
duration=duration,
guidance_scale=guidance_scale,
n_candidate_gen_per_text=n_candidate_gen_per_text,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
EMAIL = "[email protected]"
AUTHOR = "Haohe Liu"
REQUIRES_PYTHON = ">=3.7.0"
VERSION = "0.0.15"
VERSION = "0.0.17"

# What packages are required for this module to be executed?
REQUIRED = [
Expand Down