diff --git a/tripy/examples/diffusion/clip_model.py b/tripy/examples/diffusion/clip_model.py
index 98a3d200..1c41d5f9 100644
--- a/tripy/examples/diffusion/clip_model.py
+++ b/tripy/examples/diffusion/clip_model.py
@@ -29,7 +29,7 @@ class CLIPConfig:
     num_heads: int = 12
     max_seq_len: int = 77
     num_hidden_layers: int = 12
-    dtype: tp.dtype = tp.float16
+    dtype: tp.dtype = tp.float32
 
 class CLIPMLP(tp.Module):
     def __init__(self, config: CLIPConfig):
@@ -52,6 +52,7 @@ def __init__(self, config: CLIPConfig):
         self.v_proj = tp.Linear(self.embed_dim, self.embed_dim, dtype=config.dtype)
         self.q_proj = tp.Linear(self.embed_dim, self.embed_dim, dtype=config.dtype)
         self.out_proj = tp.Linear(self.embed_dim, self.embed_dim, dtype=config.dtype)
+        self.dtype = config.dtype
 
     def __call__(self, hidden_states, causal_attention_mask):
         bsz, tgt_len, embed_dim = hidden_states.shape[0], hidden_states.shape[1], hidden_states.shape[2]
@@ -65,7 +66,7 @@ def __call__(self, hidden_states, causal_attention_mask):
             for x in (q, k, v)
         ]
         attn_output = scaled_dot_product_attention(
-            q, k, v, embedding_dim=self.head_dim, attn_mask=causal_attention_mask
+            q, k, v, embedding_dim=self.head_dim, attn_mask=causal_attention_mask, dtype=self.dtype,
         )
         out = self.out_proj(tp.reshape(tp.transpose(attn_output, 1, 2), (bsz, tgt_len, embed_dim)))
         return out
@@ -74,18 +75,18 @@ def __call__(self, hidden_states, causal_attention_mask):
 class CLIPEncoderLayer(tp.Module):
     def __init__(self, config: CLIPConfig):
         self.self_attn = CLIPAttention(config)
-        self.layer_norm1 = tp.LayerNorm(config.embedding_size, dtype=config.dtype)
+        self.layer_norm1 = tp.LayerNorm(config.embedding_size, dtype=tp.float32)
         self.mlp = CLIPMLP(config)
-        self.layer_norm2 = tp.LayerNorm(config.embedding_size, dtype=config.dtype)
+        self.layer_norm2 = tp.LayerNorm(config.embedding_size, dtype=tp.float32)
 
     def __call__(self, hidden_states, causal_attention_mask):
         residual = hidden_states
-        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = tp.cast(self.layer_norm1(tp.cast(hidden_states, self.layer_norm1.dtype)), hidden_states.dtype)
         hidden_states = self.self_attn(hidden_states, causal_attention_mask)
         hidden_states = residual + hidden_states
 
         residual = hidden_states
-        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = tp.cast(self.layer_norm2(tp.cast(hidden_states, self.layer_norm2.dtype)), hidden_states.dtype)
         hidden_states = self.mlp(hidden_states)
         hidden_states = residual + hidden_states
 
@@ -115,10 +116,10 @@ class CLIPTextTransformer(tp.Module):
     def __init__(self, config: CLIPConfig):
         self.embeddings = CLIPTextEmbeddings(config)
         self.encoder = CLIPEncoder(config)
-        self.final_layer_norm = tp.LayerNorm(config.embedding_size, dtype=config.dtype)
+        self.final_layer_norm = tp.LayerNorm(config.embedding_size, dtype=tp.float32)
         self.max_seq_len = config.max_seq_len
 
     def __call__(self, input_ids):
         x = self.embeddings(input_ids, tp.reshape(tp.iota((input_ids.shape[1],), dtype=tp.int32), (1, -1)))
         x = self.encoder(x, tp.triu(tp.full((1, 1, self.max_seq_len, self.max_seq_len), float("-inf")), 1))
-        return self.final_layer_norm(x)
\ No newline at end of file
+        return tp.cast(self.final_layer_norm(tp.cast(x, self.final_layer_norm.dtype)), x.dtype)
\ No newline at end of file
diff --git a/tripy/examples/diffusion/example.py b/tripy/examples/diffusion/example.py
index de7a3ca6..8f1bd82d 100644
--- a/tripy/examples/diffusion/example.py
+++ b/tripy/examples/diffusion/example.py
@@ -52,7 +52,7 @@ def compile_clip(model, dtype=tp.int32, verbose=False):
     return compile_model(model, inputs, verbose=verbose)
 
 
-def compile_unet(model, dtype=tp.float16, verbose=False):
+def compile_unet(model, dtype, verbose=False):
     unconditional_context_shape = (1, 77, 768)
     conditional_context_shape = (1, 77, 768)
     latent_shape = (1, 4, 64, 64)
@@ -68,16 +68,16 @@ def compile_unet(model, dtype=tp.float16, verbose=False):
     return compile_model(model, inputs, verbose=verbose)
 
 
-def compile_vae(model, dtype=tp.float16, verbose=False):
+def compile_vae(model, dtype, verbose=False):
     inputs = (tp.InputInfo((1, 4, 64, 64), dtype=dtype),)
     return compile_model(model, inputs, verbose=verbose)
 
 
-def run_diffusion_loop(model, unconditional_context, context, latent, steps, guidance):
+def run_diffusion_loop(model, unconditional_context, context, latent, steps, guidance, dtype):
     timesteps = list(range(1, 1000, 1000 // steps))
-    print(f"[I] Running diffusion for {timesteps} timesteps...")
-    alphas = get_alphas_cumprod()[tp.Tensor(timesteps)]
-    alphas_prev = tp.concatenate([tp.Tensor([1.0]), alphas[:-1]], dim=0)
+    print(f"[I] Running diffusion for {steps} timesteps...")
+    alphas = get_alphas_cumprod(dtype=dtype)[tp.Tensor(timesteps)]
+    alphas_prev = tp.concatenate([tp.Tensor([1.0], dtype=dtype), alphas[:-1]], dim=0)
 
     for index, timestep in (t := tqdm(list(enumerate(timesteps))[::-1])):
         t.set_description("idx: %1d, timestep: %3d" % (index, timestep))
@@ -86,10 +86,10 @@ def run_diffusion_loop(model, unconditional_context, context, latent, steps, gui
             unconditional_context,
             context,
             latent,
-            tp.cast(tp.Tensor([timestep]), tp.float32),
+            tp.Tensor([timestep], dtype=dtype),
             alphas[tid],
             alphas_prev[tid],
-            tp.Tensor([guidance]),
+            tp.Tensor([guidance], dtype=dtype),
         )
     return latent
 
@@ -97,21 +97,23 @@ def run_diffusion_loop(model, unconditional_context, context, latent, steps, gui
 def tripy_diffusion(args):
     run_start_time = time.perf_counter()
 
-    if os.path.isdir("engines"):
+    dtype, torch_dtype = (tp.float16, torch.float16) if args.fp16 else (tp.float32, torch.float32)
+
+    if os.path.isdir(args.engine_dir):
         print("[I] Loading cached engines from disk...")
         clip_compiled = tp.Executable.load(os.path.join("engines", "clip_executable.json"))
         unet_compiled = tp.Executable.load(os.path.join("engines", "unet_executable.json"))
         vae_compiled = tp.Executable.load(os.path.join("engines", "vae_executable.json"))
     else:
-        model = StableDiffusion(StableDiffusionConfig(dtype=tp.float16))
+        model = StableDiffusion(StableDiffusionConfig(dtype=dtype))
         print("[I] Loading model weights...", flush=True)
-        load_from_diffusers(model, tp.float16, debug=True)
+        load_from_diffusers(model, dtype, args.hf_token, debug=True)
         clip_compiled = compile_clip(model.cond_stage_model.transformer.text_model, verbose=True)
-        unet_compiled = compile_unet(model, verbose=True)
-        vae_compiled = compile_vae(model.decode, verbose=True)
+        unet_compiled = compile_unet(model, dtype, verbose=True)
+        vae_compiled = compile_vae(model.decode, dtype, verbose=True)
 
-        os.mkdir("engines")
-        print("[I] Saving engines to disk...")
+        os.mkdir(args.engine_dir)
+        print(f"[I] Saving engines to {args.engine_dir}...")
         clip_compiled.save(os.path.join("engines", "clip_executable.json"))
         unet_compiled.save(os.path.join("engines", "unet_executable.json"))
         vae_compiled.save(os.path.join("engines", "vae_executable.json"))
@@ -135,11 +137,11 @@
     # Backbone of diffusion - the UNet
     if args.seed is not None:
         torch.manual_seed(args.seed)
-    torch_latent = torch.randn((1, 4, 64, 64)).to("cuda")
+    torch_latent = torch.randn((1, 4, 64, 64), dtype=torch_dtype).to("cuda")
     latent = tp.Tensor(torch_latent)
 
     diffusion_run_start = time.perf_counter()
-    latent = run_diffusion_loop(unet_compiled, unconditional_context, context, latent, args.steps, args.guidance)
+    latent = run_diffusion_loop(unet_compiled, unconditional_context, context, latent, args.steps, args.guidance, dtype)
     diffusion_run_end = time.perf_counter()
     print(f"[I] Finished diffusion denoising. Inference took {diffusion_run_end - diffusion_run_start} seconds.")
 
@@ -173,15 +175,17 @@
 def hf_diffusion(args):
     run_start_time = time.perf_counter()
 
+    dtype = torch.float16 if args.fp16 else torch.float32
+    model_opts = {'variant': 'fp16', 'torch_dtype': torch.float16} if args.fp16 else {}
+
     # Initialize models
-    model_id = "CompVis/stable-diffusion-v1-4" #"benjamin-paine/stable-diffusion-v1-5" #"runwayml/stable-diffusion-v1-5"
-    clip_id = "openai/clip-vit-large-patch14"
+    model_id = "KiwiXR/stable-diffusion-v1-5"
 
     print("[I] Loading models...")
-    hf_tokenizer = CLIPTokenizer.from_pretrained(clip_id)
-    hf_encoder = CLIPTextModel.from_pretrained(clip_id).to("cuda")
-    unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet").to("cuda")
-    vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae").to("cuda")
+    hf_tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
+    hf_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder").to("cuda")
+    unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet", use_auth_token=args.hf_token, **model_opts).to("cuda")
+    vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae", use_auth_token=args.hf_token, **model_opts).to("cuda")
     scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
 
     # Run through CLIP to get context from prompt
@@ -192,19 +196,20 @@
     uncond_input = hf_tokenizer([""], padding="max_length", max_length=max_length, return_tensors="pt").to("cuda")
     text_embeddings = hf_encoder(text_input.input_ids, output_hidden_states=True)[0]
     uncond_embeddings = hf_encoder(uncond_input.input_ids)[0]
-    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+    text_embeddings = torch.cat([uncond_embeddings, text_embeddings]).to(dtype)
 
     clip_run_end = time.perf_counter()
     print(f"took {clip_run_end - clip_run_start} seconds.")
 
     # Backbone of diffusion - the UNet
     if args.seed is not None:
         torch.manual_seed(args.seed)
-    torch_latent = torch.randn((1, 4, 64, 64)).to("cuda")
+    torch_latent = torch.randn((1, 4, 64, 64), dtype=dtype).to("cuda")
     torch_latent *= scheduler.init_noise_sigma
     scheduler.set_timesteps(args.steps)
 
     diffusion_run_start = time.perf_counter()
+    print(f"[I] Running diffusion for {args.steps} timesteps...")
     for t in tqdm(scheduler.timesteps):
         # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
         latent_model_input = torch.cat([torch_latent] * 2)
@@ -267,7 +272,6 @@ def print_summary(denoising_steps, times):
 
 
 # TODO: Add torch compilation modes
-# TODO: Add fp16 support
 # TODO: Add Timing context
 def main():
     default_prompt = "a horse sized cat eating a bagel"
@@ -282,6 +286,8 @@
     parser.add_argument("--seed", type=int, help="Set the random latent seed")
     parser.add_argument("--guidance", type=float, default=7.5, help="Prompt strength")
     parser.add_argument('--torch-inference', action='store_true', help="Run inference with PyTorch (eager mode) instead of TensorRT.")
+    parser.add_argument('--hf-token', type=str, default='', help="HuggingFace API access token for downloading model checkpoints")
+    parser.add_argument('--engine-dir', type=str, default='engines', help="Output directory for TensorRT engines")
     args = parser.parse_args()
 
     if args.torch_inference:
diff --git a/tripy/examples/diffusion/helper.py b/tripy/examples/diffusion/helper.py
index 9fdbbc0d..ae4e956e 100644
--- a/tripy/examples/diffusion/helper.py
+++ b/tripy/examples/diffusion/helper.py
@@ -12,7 +12,7 @@ def scaled_dot_product_attention(
     embedding_dim: Optional[int] = None,
     attn_mask: Optional[tp.Tensor] = None,
     is_causal: bool = False,
-    dtype: tp.dtype = tp.float16
+    dtype: tp.dtype = tp.float32
 ) -> tp.Tensor:
     """
     Computes scaled dot-product attention.
diff --git a/tripy/examples/diffusion/model.py b/tripy/examples/diffusion/model.py
index 85984f6e..9fad013b 100644
--- a/tripy/examples/diffusion/model.py
+++ b/tripy/examples/diffusion/model.py
@@ -33,7 +33,7 @@
 
 @dataclass
 class StableDiffusionConfig:
-    dtype: tp.dtype = tp.float16
+    dtype: tp.dtype = tp.float32
     clip_config: Optional[CLIPConfig] = field(default=None, init=False)
     unet_config: Optional[UNetConfig] = field(default=None, init=False)
     vae_config: Optional[VAEConfig] = field(default=None, init=False)
@@ -44,11 +44,11 @@ def __post_init__(self):
         self.vae_config = VAEConfig(dtype=self.dtype)
 
 # equivalent to LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
-def get_alphas_cumprod(beta_start=0.00085, beta_end=0.0120, n_training_steps=1000):
+def get_alphas_cumprod(beta_start=0.00085, beta_end=0.0120, n_training_steps=1000, dtype=tp.float32):
     betas = np.linspace(beta_start**0.5, beta_end**0.5, n_training_steps, dtype=np.float32) ** 2
     alphas = 1.0 - betas
     alphas_cumprod = np.cumprod(alphas, axis=0)
-    return tp.Tensor(alphas_cumprod)
+    return tp.cast(tp.Tensor(alphas_cumprod), dtype)
 
 
 class StableDiffusion(tp.Module):
diff --git a/tripy/examples/diffusion/unet_model.py b/tripy/examples/diffusion/unet_model.py
index fb91f224..a18bd3d7 100644
--- a/tripy/examples/diffusion/unet_model.py
+++ b/tripy/examples/diffusion/unet_model.py
@@ -18,6 +18,7 @@
 import math
 from typing import List, Tuple
 
+import torch
 import tripy as tp
 
 from dataclasses import dataclass
@@ -33,28 +34,30 @@ class UNetConfig:
     num_heads: int = 8
     context_dim: int = 768
     emb_channels: int = 1280
-    dtype: tp.dtype = tp.float16
+    dtype: tp.dtype = tp.float32
 
 
 # Used for UNet, not to be confused with ResnetBlock, called ResnetBlock2D in HF diffusers
 class ResBlock(tp.Module):
     def __init__(self, config: UNetConfig, channels, emb_channels, out_channels):
-        self.norm1 = tp.GroupNorm(32, channels, dtype=config.dtype)
+        self.norm1 = tp.GroupNorm(32, channels, dtype=tp.float32)
         self.conv1 = tp.Conv(channels, out_channels, (3, 3), padding=((1, 1), (1, 1)), dtype=config.dtype)
         self.time_emb_proj = tp.Linear(emb_channels, out_channels, dtype=config.dtype)
-        self.norm2 = tp.GroupNorm(32, out_channels, dtype=config.dtype)
+        self.norm2 = tp.GroupNorm(32, out_channels, dtype=tp.float32)
         self.conv2 = tp.Conv(out_channels, out_channels, (3, 3), padding=((1, 1), (1, 1)), dtype=config.dtype)
         self.nonlinearity = tp.silu
         self.conv_shortcut = tp.Conv(channels, out_channels, (1, 1), dtype=config.dtype) if channels != out_channels else lambda x: x
 
     def __call__(self, x, emb):
-        h = self.conv1(self.nonlinearity(self.norm1(x)))
+        h = tp.cast(self.norm1(tp.cast(x, self.norm1.dtype)), x.dtype)
+        h = self.conv1(self.nonlinearity(h))
         emb_out = self.time_emb_proj(self.nonlinearity(emb))
         target_shape = emb_out.shape + (1, 1)
         # TODO: #228: WAR to prevent computing output rank in infer_rank for reshape
         target_shape.trace_tensor.shape = (emb_out.rank + 2,)
         h = h + tp.reshape(emb_out, target_shape)
-        h = self.conv2(self.nonlinearity(self.norm2(h)))
+        h = tp.cast(self.norm2(tp.cast(h, self.norm2.dtype)), h.dtype)
+        h = self.conv2(self.nonlinearity(h))
         ret = self.conv_shortcut(x) + h
         return ret
 
@@ -67,6 +70,7 @@ def __init__(self, config: UNetConfig, query_dim, context_dim, n_heads, d_head):
         self.num_heads = n_heads
         self.head_size = d_head
         self.to_out = [tp.Linear(n_heads * d_head, query_dim, dtype=config.dtype)]
+        self.dtype = config.dtype
 
     def __call__(self, x, context=None):
         context = x if context is None else context
@@ -74,7 +78,7 @@ def __call__(self, x, context=None):
         q, k, v = [
             tp.transpose(tp.reshape(y, (x.shape[0], -1, self.num_heads, self.head_size)), 1, 2) for y in (q, k, v)
         ]
-        attention = tp.transpose(scaled_dot_product_attention(q, k, v, embedding_dim=self.head_size), 1, 2)
+        attention = tp.transpose(scaled_dot_product_attention(q, k, v, embedding_dim=self.head_size, dtype=self.dtype), 1, 2)
         h_ = tp.reshape(attention, (x.shape[0], -1, self.num_heads * self.head_size))
         out = sequential(h_, self.to_out)
         return out
@@ -116,20 +120,20 @@ def __init__(self, config, dim, context_dim, n_heads, d_head):
         self.attn1 = CrossAttention(config, dim, dim, n_heads, d_head)
         self.ff = FeedForward(config, dim)
         self.attn2 = CrossAttention(config, dim, context_dim, n_heads, d_head)
-        self.norm1 = tp.LayerNorm(dim, dtype=config.dtype)
-        self.norm2 = tp.LayerNorm(dim, dtype=config.dtype)
-        self.norm3 = tp.LayerNorm(dim, dtype=config.dtype)
+        self.norm1 = tp.LayerNorm(dim, dtype=tp.float32)
+        self.norm2 = tp.LayerNorm(dim, dtype=tp.float32)
+        self.norm3 = tp.LayerNorm(dim, dtype=tp.float32)
 
     def __call__(self, x, context=None):
-        x = self.attn1(self.norm1(x)) + x
-        x = self.attn2(self.norm2(x), context=context) + x
-        x = self.ff(self.norm3(x)) + x
+        x = self.attn1(tp.cast(self.norm1(tp.cast(x, self.norm1.dtype)), x.dtype)) + x
+        x = self.attn2(tp.cast(self.norm2(tp.cast(x, self.norm2.dtype)), x.dtype), context=context) + x
+        x = self.ff(tp.cast(self.norm3(tp.cast(x, self.norm3.dtype)), x.dtype)) + x
         return x
 
 
 class SpatialTransformer(tp.Module): # Transformer2dModel in HF diffusers
     def __init__(self, config: UNetConfig, channels, context_dim, n_heads, d_head):
-        self.norm = tp.GroupNorm(32, channels, dtype=config.dtype)
+        self.norm = tp.GroupNorm(32, channels, dtype=tp.float32)
         assert channels == n_heads * d_head
         self.proj_in = tp.Conv(channels, n_heads * d_head, (1, 1), dtype=config.dtype)
         self.transformer_blocks = [BasicTransformerBlock(config, channels, context_dim, n_heads, d_head)]
@@ -138,7 +142,7 @@ def __init__(self, config: UNetConfig, channels, context_dim, n_heads, d_head):
     def __call__(self, x, context=None):
         b, c, h, w = x.shape
         x_in = x
-        x = self.norm(x)
+        x = tp.cast(self.norm(tp.cast(x, self.norm.dtype)), x.dtype)
         x = self.proj_in(x)
         x = tp.permute(tp.reshape(x, (b, c, h * w)), (0, 2, 1))
         for block in self.transformer_blocks:
@@ -272,7 +276,7 @@ def __init__(self, config: UNetConfig):
             CrossAttnUpBlock2D(config, up_channels[2:5], down_channels[2]),
             CrossAttnUpBlock2D(config, up_channels[4:7], down_channels[1], use_upsampler=False),
         ]
-        self.conv_norm_out = tp.GroupNorm(32, config.model_channels, dtype=config.dtype)
+        self.conv_norm_out = tp.GroupNorm(32, config.model_channels, dtype=tp.float32)
         self.conv_act = tp.silu
         self.conv_out = tp.Conv(config.model_channels, config.io_channels, (3, 3), padding=((1, 1), (1, 1)), dtype=config.dtype)
 
@@ -280,7 +284,6 @@ def __call__(self, x, timesteps=None, context=None):
         # TODO: real time embedding
         t_emb = timestep_embedding(timesteps, self.config.model_channels, self.config.dtype)
         emb = self.time_embedding(t_emb)
-
         x = self.conv_in(x)
         saved_inputs = [x]
 
@@ -301,6 +304,7 @@
         else:
             x = block(x, emb, context, partial_inputs)
 
-        act = self.conv_out(self.conv_act(self.conv_norm_out(x)))
+        act = tp.cast(self.conv_norm_out(tp.cast(x, self.conv_norm_out.dtype)), x.dtype)
+        act = self.conv_out(self.conv_act(act))
         return act
 
diff --git a/tripy/examples/diffusion/vae_model.py b/tripy/examples/diffusion/vae_model.py
index cc19103f..111bdb33 100644
--- a/tripy/examples/diffusion/vae_model.py
+++ b/tripy/examples/diffusion/vae_model.py
@@ -29,28 +29,29 @@ class VAEConfig:
     model_channel: int = 128
     channel_mult_encode: Tuple[int] = (1, 1, 2, 4, 4)
     channel_mult_decode: Tuple[int] = (4, 4, 4, 2, 1)
-    dtype: tp.dtype = tp.float16
+    dtype: tp.dtype = tp.float32
 
 
 class AttnBlock(tp.Module):
     def __init__(self, config: VAEConfig, in_channels):
-        self.group_norm = tp.GroupNorm(32, in_channels, dtype=config.dtype)
+        self.group_norm = tp.GroupNorm(32, in_channels, dtype=tp.float32)
         self.to_q = tp.Linear(in_channels, in_channels, dtype=config.dtype)
         self.to_k = tp.Linear(in_channels, in_channels, dtype=config.dtype)
         self.to_v = tp.Linear(in_channels, in_channels, dtype=config.dtype)
         self.to_out = [tp.Linear(in_channels, in_channels, dtype=config.dtype)]
         self.in_channels = in_channels
+        self.dtype = config.dtype
 
     # adapted from AttnBlock in ldm repo
     def __call__(self, x):
-        h_ = self.group_norm(x)
+        h_ = tp.cast(self.group_norm(tp.cast(x, self.group_norm.dtype)), x.dtype)
         b, c, h, w = h_.shape
         h_flat = tp.transpose(tp.reshape(h_, (b, c, h * w)), 1, 2)
         q, k, v = self.to_q(h_flat), self.to_k(h_flat), self.to_v(h_flat)
 
         # compute attention
-        h_ = scaled_dot_product_attention(q, k, v, embedding_dim=self.in_channels)
+        h_ = scaled_dot_product_attention(q, k, v, embedding_dim=self.in_channels, dtype=self.dtype)
         out = tp.reshape(
             tp.transpose(self.to_out[0](h_), 1, 2),
             (b, c, h, w),
         )
@@ -60,16 +61,16 @@ def __call__(self, x):
 # Not to be confused with ResBlock. Called ResnetBlock2D in HF diffusers
 class ResnetBlock(tp.Module):
     def __init__(self, config: VAEConfig, in_channels, out_channels=None):
-        self.norm1 = tp.GroupNorm(32, in_channels, dtype=config.dtype)
+        self.norm1 = tp.GroupNorm(32, in_channels, dtype=tp.float32)
         self.conv1 = tp.Conv(in_channels, out_channels, (3, 3), padding=((1, 1), (1, 1)), dtype=config.dtype)
-        self.norm2 = tp.GroupNorm(32, out_channels, dtype=config.dtype)
+        self.norm2 = tp.GroupNorm(32, out_channels, dtype=tp.float32)
         self.conv2 = tp.Conv(out_channels, out_channels, (3, 3), padding=((1, 1), (1, 1)), dtype=config.dtype)
         self.nonlinearity = tp.silu
         self.conv_shortcut = tp.Conv(in_channels, out_channels, (1, 1), dtype=config.dtype) if in_channels != out_channels else lambda x: x
 
     def __call__(self, x):
-        h = self.conv1(self.nonlinearity(self.norm1(x)))
-        h = self.conv2(self.nonlinearity(self.norm2(h)))
+        h = self.conv1(self.nonlinearity(tp.cast(self.norm1(tp.cast(x, self.norm1.dtype)), x.dtype)))
+        h = self.conv2(self.nonlinearity(tp.cast(self.norm2(tp.cast(h, self.norm2.dtype)), h.dtype)))
         return self.conv_shortcut(x) + h
 
 class Downsample(tp.Module):
@@ -122,7 +123,7 @@ def __init__(self, config: VAEConfig):
         self.conv_in = tp.Conv(config.latent_channels, config.model_channel * config.channel_mult_decode[0], (3, 3), padding=((1, 1), (1, 1)), dtype=config.dtype)
         self.up_blocks = [UpDecoderBlock2D(config, up_channels[i], up_channels[i+1], use_upsampler=upsamplers[i]) for i in range(num_resolutions)]
         self.mid_block = Mid(config, up_channels[0])
-        self.conv_norm_out = tp.GroupNorm(32, config.model_channel, dtype=config.dtype)
+        self.conv_norm_out = tp.GroupNorm(32, config.model_channel, dtype=tp.float32)
         self.conv_act = tp.silu
         self.conv_out = tp.Conv(config.model_channel, config.io_channels, (3, 3), padding=((1, 1), (1, 1)), dtype=config.dtype)
 
@@ -132,7 +133,7 @@ def __call__(self, x):
 
         for up_block in self.up_blocks:
             x = up_block(x)
-        return self.conv_out(self.conv_act(self.conv_norm_out(x)))
+        return self.conv_out(self.conv_act(tp.cast(self.conv_norm_out(tp.cast(x, self.conv_norm_out.dtype)), x.dtype)))
 
 class DownEncoderBlock2D(tp.Module):
     def __init__(self, config: VAEConfig, start_channels, channels, use_downsampler=True):
@@ -156,7 +157,7 @@ def __init__(self, config: VAEConfig):
         self.conv_in = tp.Conv(config.io_channels, config.model_channel, (3, 3), padding=((1, 1), (1, 1)), dtype=config.dtype)
         self.down_blocks = [DownEncoderBlock2D(config, down_channels[i], down_channels[i+1], use_downsampler=downsamplers[i]) for i in range(num_resolutions)]
         self.mid_block = Mid(config, down_channels[-1])
-        self.conv_norm_out = tp.GroupNorm(32, down_channels[-1], dtype=config.dtype)
+        self.conv_norm_out = tp.GroupNorm(32, down_channels[-1], dtype=tp.float32)
         self.conv_act = tp.silu
         self.conv_out = tp.Conv(down_channels[-1], 8, (3, 3), padding=((1, 1), (1, 1)), dtype=config.dtype)
 
@@ -165,7 +166,7 @@ def __call__(self, x):
         for i in range(len(self.down_blocks)):
             x = self.down_blocks[i](x)
         x = self.mid_block(x)
-        return self.conv_out(self.conv_act(self.conv_norm_out(x)))
+        return self.conv_out(self.conv_act(tp.cast(self.conv_norm_out(tp.cast(x, self.conv_norm_out.dtype)), x.dtype)))
 
 
 class AutoencoderKL(tp.Module):
diff --git a/tripy/examples/diffusion/weight_loader.py b/tripy/examples/diffusion/weight_loader.py
index 14d98bed..3fa3b055 100644
--- a/tripy/examples/diffusion/weight_loader.py
+++ b/tripy/examples/diffusion/weight_loader.py
@@ -20,21 +20,16 @@ def load_weights_from_hf(model, hf_model, dtype, debug=False):
     torch_dtype = getattr(torch, dtype.name)
     for key in hf_keys:
         weight = hf_state_dict[key]
-        # print(weight.dtype)
-        # if "ln" in key or "gn" in key or "norm" in key:
-        #     print(f"{key}: {weight.dtype}")
-        # if "norm" not in key:
-        #     weight = weight.to(torch_dtype)
-        #     print(f"{key}: {weight.dtype}")
+        if "norm" not in key:
+            weight = weight.to(torch_dtype)
         param = tp.Parameter(weight)
         tripy_state_dict[key.removeprefix("text_model.")] = param
     model.load_from_state_dict(tripy_state_dict)
 
 
-def load_from_diffusers(model, dtype, debug=False):
-    model_id = "CompVis/stable-diffusion-v1-4" #"benjamin-paine/stable-diffusion-v1-5" #"runwayml/stable-diffusion-v1-5"
-    model_opts = {'variant': 'fp16', 'torch_dtype': torch.float16} if dtype == tp.float16 else {}
-    pipe = StableDiffusionPipeline.from_pretrained(model_id, **model_opts)
+def load_from_diffusers(model, dtype, hf_token, debug=False):
+    model_id = "KiwiXR/stable-diffusion-v1-5"
+    pipe = StableDiffusionPipeline.from_pretrained(model_id, use_auth_token=hf_token)
 
     load_weights_from_hf(model.cond_stage_model.transformer.text_model, pipe.text_encoder, dtype, debug=debug)
     load_weights_from_hf(model.model.diffusion_model, pipe.unet, dtype, debug=debug)
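Reviewer note, not part of the patch: the recurring change in this diff keeps every LayerNorm/GroupNorm in float32 and casts activations into and back out of the norm, while the surrounding layers follow config.dtype (float16 when --fp16 is passed); weight_loader.py correspondingly skips the float16 conversion for any key containing "norm". Below is a minimal sketch of that pattern, assuming only the Tripy APIs already used in the example (tp.Module, tp.LayerNorm, tp.cast); the FP32LayerNorm helper is hypothetical and does not appear in the patch.

import tripy as tp


class FP32LayerNorm(tp.Module):
    """Hypothetical helper: run LayerNorm in float32 inside an fp16 model."""

    def __init__(self, dim, compute_dtype=tp.float16):
        # Norm weights are created in float32, mirroring the `if "norm" not in key`
        # branch in weight_loader.py that leaves norm weights unconverted.
        self.norm = tp.LayerNorm(dim, dtype=tp.float32)
        self.compute_dtype = compute_dtype

    def __call__(self, x):
        # Cast the (possibly fp16) activation up, normalize in fp32 so the
        # mean/variance reduction stays numerically stable, then cast back down.
        return tp.cast(self.norm(tp.cast(x, tp.float32)), self.compute_dtype)

The same cast-in/cast-out wrapping appears around every GroupNorm in unet_model.py and vae_model.py, with tp.GroupNorm in place of tp.LayerNorm.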