# pipeline.py

import torch
import numpy as np
from tqdm import tqdm
from ddpm import DDPMSampler

# Pixel size that input images are resized to before encoding.
WIDTH = 512
HEIGHT = 512

# The latent grid is 8x smaller than the pixel grid along each side.
LATENTS_WIDTH = WIDTH // 8
LATENTS_HEIGHT = HEIGHT // 8

def generate(
        prompt,
        uncond_prompt=None,
        input_image=None,
        strength=0.8,
        do_cfg=True,
        cfg_scale=7.5,
        sampler_name='ddpm',
        n_inference_steps=50,
        models=None,
        seed=None,
        device=None,
        idle_device=None,
        tokenizer=None,
):
    """Run the text-conditioned diffusion pipeline and return a single image.

    The prompt is tokenized and encoded with ``models["clip"]``, the latents
    are iteratively denoised with ``models["diffusion"]`` under the chosen
    sampler, and the final latents are decoded with ``models["decoder"]``.

    Args:
        prompt: Text handed to ``tokenizer.batch_encode_plus``.
        uncond_prompt: Text for the unconditional branch; only used when
            ``do_cfg`` is true. ``None`` is treated as an empty prompt.
        input_image: Optional starting image. It must support
            ``.resize((WIDTH, HEIGHT))`` and conversion through ``np.array``
            (presumably a PIL image -- confirm against callers). When given,
            it is encoded with ``models["encoder"]`` and noised before the
            denoising loop; otherwise the loop starts from random latents.
        strength: Must satisfy ``0 < strength <= 1``. Forwarded to
            ``sampler.set_strength`` when ``input_image`` is given.
        do_cfg: When true, every step evaluates the model on a conditional
            and an unconditional context and combines them as
            ``cfg_scale * (cond - uncond) + uncond``.
        cfg_scale: Weight used in the combination above.
        sampler_name: Only ``"ddpm"`` is supported.
        n_inference_steps: Passed to ``sampler.set_inference_timesteps``.
        models: Mapping providing ``"clip"``, ``"diffusion"``, ``"decoder"``
            and, when ``input_image`` is given, ``"encoder"``.
        seed: Seed for the random generator; ``None`` draws a fresh seed.
        device: Device on which the generator, tensors and active model live.
        idle_device: If truthy, each model is moved there once it is no
            longer needed.
        tokenizer: Object exposing ``batch_encode_plus`` whose result has an
            ``input_ids`` attribute.

    Returns:
        The first element of the decoded batch as a ``uint8`` NumPy array,
        with the channel axis moved to the last position.

    Raises:
        ValueError: If ``strength`` is out of range or ``sampler_name`` is
            not supported.
        KeyError: If ``models`` lacks a required entry.
    """
    if models is None:
        models = {}

    with torch.no_grad():
        if not 0 < strength <= 1:
            raise ValueError("strength must be between 0 and  1")

        if idle_device:
            def to_idle(module):
                return module.to(idle_device)
        else:
            def to_idle(module):
                return module

        generator = torch.Generator(device=device)
        if seed is None:
            generator.seed()
        else:
            generator.manual_seed(seed)

        clip = models["clip"]
        clip.to(device)

        def encode_prompt(text):
            # Tokenize one prompt, padded to 77 tokens, and run it through CLIP.
            token_ids = tokenizer.batch_encode_plus(
                [text], padding="max_length", max_length=77
            ).input_ids
            token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)
            return clip(token_ids)

        if do_cfg:
            cond_context = encode_prompt(prompt)
            # A missing unconditional prompt means "no prompt", not None.
            uncond_context = encode_prompt(
                "" if uncond_prompt is None else uncond_prompt
            )
            # Order matters: the denoising loop splits the model output back
            # into (conditional, unconditional) halves in this same order.
            context = torch.cat([cond_context, uncond_context])
        else:
            context = encode_prompt(prompt)
        to_idle(clip)

        if sampler_name == "ddpm":
            sampler = DDPMSampler(generator)
            sampler.set_inference_timesteps(n_inference_steps)
        else:
            raise ValueError(f"Unknown sampler value {sampler_name}. ")

        latents_shape = (1, 4, LATENTS_HEIGHT, LATENTS_WIDTH)

        # Explicit None check: do not rely on the truthiness of image objects.
        if input_image is not None:
            encoder = models["encoder"]
            encoder.to(device)

            input_image_tensor = input_image.resize((WIDTH, HEIGHT))
            input_image_tensor = np.array(input_image_tensor)
            input_image_tensor = torch.tensor(
                input_image_tensor, dtype=torch.float32, device=device
            )
            input_image_tensor = rescale(input_image_tensor, (0, 255), (-1, 1))
            # Add a batch axis, then move the last (channel) axis to position 1.
            input_image_tensor = input_image_tensor.unsqueeze(0)
            input_image_tensor = input_image_tensor.permute(0, 3, 1, 2)

            encoder_noise = torch.randn(
                latents_shape, generator=generator, device=device
            )
            latents = encoder(input_image_tensor, encoder_noise)

            sampler.set_strength(strength=strength)
            latents = sampler.add_noise(latents, sampler.timesteps[0])

            to_idle(encoder)
        else:
            latents = torch.randn(
                latents_shape, generator=generator, device=device
            )

        diffusion = models["diffusion"]
        diffusion.to(device)

        for timestep in tqdm(sampler.timesteps):
            time_embedding = get_time_embedding(timestep).to(device)

            # With CFG the batch is doubled so one forward pass serves both
            # the conditional and the unconditional context.
            model_input = latents.repeat(2, 1, 1, 1) if do_cfg else latents

            model_output = diffusion(model_input, context, time_embedding)

            if do_cfg:
                output_cond, output_uncond = model_output.chunk(2)
                model_output = (
                    cfg_scale * (output_cond - output_uncond) + output_uncond
                )

            latents = sampler.step(timestep, latents, model_output)

        to_idle(diffusion)

        decoder = models["decoder"]
        decoder.to(device)
        images = decoder(latents)
        to_idle(decoder)

        images = rescale(images, (-1, 1), (0, 255), clamp=True)
        # Move the channel axis (position 1) to the end before converting.
        images = images.permute(0, 2, 3, 1)
        images = images.to("cpu", torch.uint8).numpy()
        return images[0]
    
def rescale(x, old_range, new_range, clamp=False):
    """Linearly map *x* from ``old_range`` onto ``new_range``.

    The argument is never modified; a new value is returned. Because the
    arithmetic is out-of-place, integer tensors are accepted as well and are
    promoted to floating point by the scaling step.

    Args:
        x: Value to rescale. Must support ``-``, ``*`` and ``+`` with
            numbers, and ``.clamp`` when ``clamp`` is true.
        old_range: ``(min, max)`` pair describing the current range of *x*.
        new_range: ``(min, max)`` pair describing the target range.
        clamp: If true, limit the result to ``new_range``.

    Returns:
        The rescaled value.

    Raises:
        ZeroDivisionError: If ``old_range`` has zero width.
    """
    old_min, old_max = old_range
    new_min, new_max = new_range
    scale = (new_max - new_min) / (old_max - old_min)
    # Same operation order as shift -> scale -> shift, but without in-place
    # operators, so the caller's tensor is left untouched.
    x = (x - old_min) * scale + new_min
    if clamp:
        x = x.clamp(new_min, new_max)
    return x

def get_time_embedding(timestep):
    """Build the sinusoidal embedding for a single scalar timestep.

    160 frequencies ``10000 ** (-k / 160)`` for ``k = 0..159`` are multiplied
    by the timestep; the cosines of those angles are followed by their sines
    along the last axis.

    Returns:
        A float32 tensor of shape ``(1, 320)``.
    """
    exponents = -torch.arange(start=0, end=160, dtype=torch.float32) / 160
    frequencies = torch.pow(10000, exponents)
    step = torch.tensor([timestep], dtype=torch.float32)
    # (1, 1) * (1, 160) broadcasts to one row of angles.
    angles = step[:, None] * frequencies[None]
    cosines = torch.cos(angles)
    sines = torch.sin(angles)
    return torch.cat([cosines, sines], dim=-1)