Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Debug #34

Merged
merged 5 commits into from
Feb 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ __pycache__
build
dist
output
*temp.py
*temp.py
*.wav
gradio_cached_examples
2 changes: 1 addition & 1 deletion audioldm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .ldm import LatentDiffusion
from .utils import seed_everything, save_wave, get_time
from .utils import seed_everything, save_wave, get_time, get_duration
from .pipeline import *

import os
Expand Down
2 changes: 1 addition & 1 deletion audioldm/audio/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def pad_wav(waveform, segment_length):
temp_wav[:, :waveform_length] = waveform
return temp_wav


def normalize_wav(waveform):
waveform = waveform - np.mean(waveform)
waveform = waveform / (np.max(np.abs(waveform)) + 1e-8)
Expand All @@ -66,6 +65,7 @@ def wav_to_fbank(filename, target_length=1024, fn_STFT=None):

# mixup
waveform = read_wav_file(filename, target_length * 160) # hop size is 160

waveform = waveform[0, ...]
waveform = torch.FloatTensor(waveform)

Expand Down
26 changes: 22 additions & 4 deletions audioldm/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from tqdm import tqdm, trange

from audioldm import LatentDiffusion, seed_everything
from audioldm.utils import default_audioldm_config
from audioldm.utils import default_audioldm_config, get_duration, get_bit_depth
from audioldm.audio import wav_to_fbank, TacotronSTFT, read_wav_file
from audioldm.latent_diffusion.ddim import DDIMSampler
from einops import repeat
Expand Down Expand Up @@ -49,6 +49,8 @@ def make_batch_for_text_to_audio(text, waveform=None, fbank=None, batchsize=1):
)
return batch

def round_up_duration(duration):
    """Round *duration* up to the nearest multiple of 2.5 seconds.

    The latent diffusion model generates audio in segments quantized to
    2.5 s, so arbitrary user-supplied durations are snapped upward.

    Unlike the previous ``round(...) + 1`` formula, an exact multiple
    (e.g. 5.0) is returned unchanged instead of being bumped a whole
    segment (5.0 -> 7.5), and values just past a segment boundary no
    longer overshoot by 2.5 s (3.8 -> 5.0, not 7.5).

    Returns a float that is always at least 2.5 so a zero/near-zero
    request still yields one generatable segment.
    """
    import math  # local import keeps this fix self-contained

    return max(2.5, math.ceil(duration / 2.5) * 2.5)

def build_model(
ckpt_path=os.path.join(CACHE_DIR, "audioldm-s-full.ckpt"),
Expand Down Expand Up @@ -109,7 +111,7 @@ def text_to_audio(
config=None,
):
seed_everything(int(seed))

duration = round_up_duration(duration)
waveform = None
if(original_audio_file_path is not None):
waveform = read_wav_file(original_audio_file_path, int(duration * 102.4) * 160)
Expand Down Expand Up @@ -152,6 +154,23 @@ def style_transfer(
else:
device = torch.device("cpu")

assert original_audio_file_path is not None, "You need to provide the original audio file path"

audio_file_duration = get_duration(original_audio_file_path)

assert get_bit_depth(original_audio_file_path) == 16, "The bit depth of the original audio file %s must be 16" % original_audio_file_path

if(duration > 20):
print("Warning: The duration of the audio file %s must be less than 20 seconds. Longer duration will result in Nan in model output (we are still debugging that); Automatically set duration to 20 seconds")
duration = 20

if(duration >= audio_file_duration):
print("Warning: Duration you specified %s-seconds must equal or smaller than the audio file duration %ss" % (duration, audio_file_duration))
duration = round_up_duration(audio_file_duration)
print("Set new duration as %s-seconds" % duration)

# duration = round_up_duration(duration)

latent_diffusion = set_cond_text(latent_diffusion)

if config is not None:
Expand All @@ -161,7 +180,7 @@ def style_transfer(
config = default_audioldm_config()

seed_everything(int(seed))
latent_diffusion.latent_t_size = duration_to_latent_t_size(duration)
# latent_diffusion.latent_t_size = duration_to_latent_t_size(duration)
latent_diffusion.cond_stage_model.embed_mode = "text"

fn_STFT = TacotronSTFT(
Expand Down Expand Up @@ -213,7 +232,6 @@ def style_transfer(
)

x_samples = latent_diffusion.decode_first_stage(samples)

waveform = latent_diffusion.first_stage_model.decode_to_waveform(
x_samples
)
Expand Down
17 changes: 13 additions & 4 deletions audioldm/utils.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
import importlib

from inspect import isfunction

import os
import soundfile as sf
import time


import wave

def get_duration(fname):
    """Return the duration of the WAV file *fname* in seconds.

    Duration is computed as frame count divided by sample rate. Only
    formats readable by the stdlib ``wave`` module (PCM WAV) are
    supported; ``wave.Error`` is raised for anything else.
    """
    # Bug fix: the original wrapped wave.open in contextlib.closing but
    # never imported contextlib, raising NameError on every call.
    # wave.open has been a context manager since Python 3.4, so no
    # wrapper is needed at all.
    with wave.open(fname, "rb") as f:
        return f.getnframes() / float(f.getframerate())

def get_bit_depth(fname):
    """Return the bit depth (bits per sample) of the WAV file *fname*.

    E.g. 16 for standard 16-bit PCM. Only formats readable by the
    stdlib ``wave`` module are supported.
    """
    # Bug fix: the original used contextlib.closing without importing
    # contextlib (NameError). wave.open is itself a context manager
    # since Python 3.4, so use it directly.
    with wave.open(fname, "rb") as f:
        return f.getsampwidth() * 8  # sample width is in bytes

def get_time():
    """Return the current local time formatted as ``dd_mm_YYYY_HH_MM_SS``."""
    return time.strftime("%d_%m_%Y_%H_%M_%S", time.localtime())


def seed_everything(seed):
import random, os
import numpy as np
Expand Down
2 changes: 1 addition & 1 deletion bin/audioldm
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/python3
import os
from audioldm import text_to_audio, style_transfer, build_model, save_wave, get_time
from audioldm import text_to_audio, style_transfer, build_model, save_wave, get_time, round_up_duration, get_duration
import argparse

CACHE_DIR = os.getenv(
Expand Down
2 changes: 2 additions & 0 deletions scripts/test.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Generation
audioldm --file_path trumpet.wav
audioldm --file_path trumpet.wav -dur 25
audioldm --file_path trumpet.wav -dur 2.5
audioldm --text "A hammer is hitting a wooden surface"
audioldm

Expand Down
2 changes: 1 addition & 1 deletion scripts/text2sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@
waveform = text_to_audio(
audioldm,
text,
random_seed,
seed=random_seed,
duration=duration,
guidance_scale=guidance_scale,
n_candidate_gen_per_text=n_candidate_gen_per_text,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
EMAIL = "[email protected]"
AUTHOR = "Haohe Liu"
REQUIRES_PYTHON = ">=3.7.0"
VERSION = "0.0.15"
VERSION = "0.0.17"

# What packages are required for this module to be executed?
REQUIRED = [
Expand Down