SD img2img (#25)
* sd_img2img

* update

* add benchmark

* update

Co-authored-by: Terry Chen <[email protected]>
terrychenism and Terry Chen authored Oct 7, 2022
1 parent 70ff7da commit 445a20e
Showing 4 changed files with 493 additions and 3 deletions.
23 changes: 22 additions & 1 deletion examples/05_stable_diffusion/README.md
@@ -6,7 +6,7 @@ In this example, we show how to build fast AIT modules for CLIP, UNet, VAE model

First, clone, build, and install AITemplate [per the README instructions](https://github.com/facebookincubator/AITemplate#clone-the-code).

This AIT stable diffusion example depends on `diffusers`, `transformers`, `torch` and `click`.

Verify the library versions. We have tested transformers 4.21/4.22/4.23, diffusers 0.3/0.4 and torch 1.11/1.12.
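
One quick way to check the installed versions:

```
python3 -c "import transformers, diffusers, torch; print(transformers.__version__, diffusers.__version__, torch.__version__)"
```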

@@ -30,6 +30,11 @@ python3 examples/05_stable_diffusion/compile.py --token ACCESS_TOKEN
```
It generates three folders: `./tmp/CLIPTextModel`, `./tmp/UNet2DConditionModel`, `./tmp/AutoencoderKL`. In each folder, there is a `test.so` file which is the generated AIT module for the model.
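
As a quick sanity check, a compiled module can be loaded from Python with the AIT runtime. A minimal sketch, assuming the `Model` class exposed by `aitemplate.compiler` (as used in the example pipelines) and the default `./tmp` workdir:

```
from aitemplate.compiler import Model

# load the compiled AIT UNet module produced by compile.py
unet = Model("./tmp/UNet2DConditionModel/test.so")
```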

Compile the img2img models:
```
python3 examples/05_stable_diffusion/compile.py --img2img True --token ACCESS_TOKEN
```

#### Multi-GPU profiling
AIT needs to do profiling to select the best algorithms for CUTLASS and CK.
To enable multiple GPUs for profiling, use the environment variable `CUDA_VISIBLE_DEVICES` on NVIDIA platform and `HIP_VISIBLE_DEVICES` on AMD platform.
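For example, to profile with four GPUs on an NVIDIA machine (a sketch; adjust the device ids to your setup):

```
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 examples/05_stable_diffusion/compile.py --token ACCESS_TOKEN
```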
@@ -50,6 +55,12 @@ Run AIT models with an example image:
python3 examples/05_stable_diffusion/demo.py --token ACCESS_TOKEN
```

Img2img demo:

```
python3 examples/05_stable_diffusion/demo_img2img.py --token ACCESS_TOKEN
```

Check the resulting image: `example_ait.png`. The img2img demo saves its output as `fantasy_landscape_ait.png`.


@@ -131,10 +142,20 @@ _OOM = Out of Memory_
| 16 | 7906 | 0.49 |


## IMG2IMG

### A100-40GB / CUDA 11.6, 40 steps

| Module | PT Latency (ms) | AIT Latency (ms) |
|----------|-----------------|------------------|
| Pipeline | 4163.60 | 1785.46 |
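
The pipeline number above can be reproduced with the demo's benchmark flag (a sketch, using the `--benchmark` option added in `demo_img2img.py` below):

```
python3 examples/05_stable_diffusion/demo_img2img.py --token ACCESS_TOKEN --benchmark True
```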



### Note for Performance Results

- For all benchmarks we render images of size 512x512.
- For the img2img model, only a fixed 512x768 input is supported by default; stay tuned for dynamic shape support.
- For NVIDIA A100, our test cluster doesn't allow locking the frequency. We make the warm-up longer to collect more stable results, but a small variance from locked-frequency results is expected.
- To benchmark MI-250 1 GCD, we lock the frequency with the command `rocm-smi -d x --setperfdeterminism 1700`, where `x` is the GPU id.
- Performance results are what we can reproduce and are provided for reference. They should not be used for other purposes.
8 changes: 6 additions & 2 deletions examples/05_stable_diffusion/compile.py
@@ -317,9 +317,10 @@ def compile_vae(
@click.command()
@click.option("--token", default="", help="access token")
@click.option("--batch-size", default=1, help="batch size")
@click.option("--img2img", default=False, help="compile img2img models")
@click.option("--use-fp16-acc", default=True, help="use fp16 accumulation")
@click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm")
def compile_diffusers(token, batch_size, use_fp16_acc=True, convert_conv_to_gemm=True):
def compile_diffusers(token, batch_size, img2img=False, use_fp16_acc=True, convert_conv_to_gemm=True):
    logging.getLogger().setLevel(logging.INFO)
    np.random.seed(0)
    torch.manual_seed(4896)
@@ -338,16 +339,19 @@ def compile_diffusers(token, batch_size, use_fp16_acc=True, convert_conv_to_gemm
        use_auth_token=access_token,
    ).to("cuda")

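    # latent-space width: the fixed 512x768 img2img input gives 768 / 8 = 96; 512x512 txt2img gives 64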
    width = 96 if img2img else 64

    # CLIP
    compile_clip(batch_size=batch_size, use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm)
    # UNet
    compile_unet(
        batch_size=batch_size * 2,
        ww=width,
        use_fp16_acc=use_fp16_acc,
        convert_conv_to_gemm=convert_conv_to_gemm,
    )
    # VAE
    compile_vae(batch_size=batch_size, width=width, use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm)


if __name__ == "__main__":
67 changes: 67 additions & 0 deletions examples/05_stable_diffusion/demo_img2img.py
@@ -0,0 +1,67 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from io import BytesIO

import click
import requests
import torch
from PIL import Image

from aitemplate.testing.benchmark_pt import benchmark_torch_function
from pipeline_stable_diffusion_img2img_ait import StableDiffusionImg2ImgAITPipeline


@click.command()
@click.option("--token", default="", help="access token")
@click.option(
    "--prompt", default="A fantasy landscape, trending on artstation", help="prompt"
)
@click.option(
    "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark"
)
def run(token, prompt, benchmark):

    # load the pipeline
    device = "cuda"
    model_id_or_path = "CompVis/stable-diffusion-v1-4"
    pipe = StableDiffusionImg2ImgAITPipeline.from_pretrained(
        model_id_or_path,
        revision="fp16",
        torch_dtype=torch.float16,
        use_auth_token=token,
    )
    pipe = pipe.to(device)

    # let's download an initial image
    url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"

    response = requests.get(url)
    init_image = Image.open(BytesIO(response.content)).convert("RGB")
    init_image = init_image.resize((768, 512))

    with torch.autocast("cuda"):
        images = pipe(
            prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5
        ).images
        if benchmark:
            args = (prompt, init_image)
            t = benchmark_torch_function(10, pipe, *args)
            print(f"sd e2e: {t} ms")

    images[0].save("fantasy_landscape_ait.png")


if __name__ == "__main__":
    run()