Black output
When I run this code, I get the black results:
Same. Using the same code from model card
Cc: @a-r-r-o-w
Can you share the output of `diffusers-cli env`?
Here's mine:
- 🤗 Diffusers version: 0.33.0.dev0
- Platform: Linux-5.4.0-166-generic-x86_64-with-glibc2.31
- Running on Google Colab?: No
- Python version: 3.10.14
- PyTorch version (GPU?): 2.5.1+cu124 (True)
- Flax version (CPU?/GPU?/TPU?): 0.8.5 (cpu)
- Jax version: 0.4.31
- JaxLib version: 0.4.31
- Huggingface_hub version: 0.26.2
- Transformers version: 4.48.0.dev0
- Accelerate version: 1.1.0.dev0
- PEFT version: 0.13.3.dev0
- Bitsandbytes version: 0.43.3
- Safetensors version: 0.4.5
- xFormers version: not installed
- Accelerator: NVIDIA A100-SXM4-80GB, 81920 MiB
NVIDIA A100-SXM4-80GB, 81920 MiB
NVIDIA A100-SXM4-80GB, 81920 MiB
NVIDIA DGX Display, 4096 MiB
NVIDIA A100-SXM4-80GB, 81920 MiB
- Using GPU in script?: <fill in>
- Using distributed or parallel set-up in script?: <fill in>
Just verified again that the code works for me:
一样同样的问题,下载模型过程中,还出现提醒说,模型size不匹配
Can you share the output of `diffusers-cli env`?
Here's mine:
- 🤗 Diffusers version: 0.33.0.dev0 - Platform: Linux-5.4.0-166-generic-x86_64-with-glibc2.31 - Running on Google Colab?: No - Python version: 3.10.14 - PyTorch version (GPU?): 2.5.1+cu124 (True) - Flax version (CPU?/GPU?/TPU?): 0.8.5 (cpu) - Jax version: 0.4.31 - JaxLib version: 0.4.31 - Huggingface_hub version: 0.26.2 - Transformers version: 4.48.0.dev0 - Accelerate version: 1.1.0.dev0 - PEFT version: 0.13.3.dev0 - Bitsandbytes version: 0.43.3 - Safetensors version: 0.4.5 - xFormers version: not installed - Accelerator: NVIDIA A100-SXM4-80GB, 81920 MiB NVIDIA A100-SXM4-80GB, 81920 MiB NVIDIA A100-SXM4-80GB, 81920 MiB NVIDIA DGX Display, 4096 MiB NVIDIA A100-SXM4-80GB, 81920 MiB - Using GPU in script?: <fill in> - Using distributed or parallel set-up in script?: <fill in>
Just verified again that the code works for me:
- 🤗 Diffusers version: 0.33.0.dev0
- Platform: Linux
- Running on Google Colab?: No
- Python version: 3.10.14
- PyTorch version (GPU?): 2.4.0+cu124 (True)
- Huggingface_hub version: 0.24.2
- Transformers version: 4.46.3
- Accelerate version: 0.33.0
- PEFT version: 0.12.0
- Bitsandbytes version: 0.43.2
- Safetensors version: 0.4.3
- xFormers version: 0.0.27
- Accelerator: NVIDIA A100-80GB, 81920 MiB
Yes, please upgrade to torch 2.5.1 and it should hopefully work. At least two other people have confirmed that upgrading the torch version fixed the black output problem.
Yes, please upgrade to torch 2.5.1 and it should hopefully work. At least two other people have confirmed that upgrading the torch version fixed the black output problem.
Thank you! It works for me~
The generated video is black, and the number of tokens is limited.
Is the default text encoder CLIP? How can I switch to another text encoder?
Token indices sequence length is longer than the specified maximum sequence length for this model (123 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens:
您好,我这边也是黑屏,我分析发现是SDPA这个接口,在torch2.3的时候,对应mask中,水平方向某一行如果全是inf的时候, torch2.3的sdpa输出全是nan,这样会导致黑屏,我试了torch2.5.1,SDPA这个接口,对应mask中水平方向某一行全是inf,输出变为了0,这样视频显示是正常的
torch.sdpa 和 varlen attn等效替换方案可以按如下处理:
import torch
import torch.nn.functional as F
import random
import numpy as np
from flash_attn import flash_attn_varlen_func, flash_attn_func
def set_seeds(seed_list, device=None):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed_list: Either a single integer seed, or a list/tuple of integers
            whose sum is used as the seed.
        device: Not used by this function; kept so existing callers that
            pass it keep working.
    """
    if isinstance(seed_list, (tuple, list)):
        # Several seeds are collapsed into one value by summing them.
        seed = sum(seed_list)
    else:
        seed = seed_list
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed the CUDA generators as well.
    torch.cuda.manual_seed_all(seed)
def test_flash():
    """Compare packed variable-length flash attention with per-sequence attention.

    Builds random q/k/v for one "image" and one "text" sequence, runs each
    sequence on its own through ``flash_attn_func`` and
    ``F.scaled_dot_product_attention``, then runs both sequences packed into
    a single tensor through ``flash_attn_varlen_func``. For every sequence the
    cosine similarity between the packed result and the per-sequence SDPA
    result is printed.

    Needs a CUDA device and the ``flash_attn`` package. Nothing is asserted
    or returned; all results are printed.
    """
    dtype = torch.bfloat16
    HEAD = 24
    HEAD_DIM = 128
    seqlens = [1080 + 2, 256 - 2]

    # Creation order is kept (img q/k/v, then txt q/k/v) so a fixed seed
    # yields the same tensors as before.
    img_q = torch.rand(seqlens[0], HEAD, HEAD_DIM, dtype=dtype).cuda()
    img_k = torch.rand(seqlens[0], HEAD, HEAD_DIM, dtype=dtype).cuda()
    img_v = torch.rand(seqlens[0], HEAD, HEAD_DIM, dtype=dtype).cuda()
    txt_q = torch.rand(seqlens[1], HEAD, HEAD_DIM, dtype=dtype).cuda()
    txt_k = torch.rand(seqlens[1], HEAD, HEAD_DIM, dtype=dtype).cuda()
    txt_v = torch.rand(seqlens[1], HEAD, HEAD_DIM, dtype=dtype).cuda()

    # Packed layout: all image tokens followed by all text tokens.
    query = torch.cat((img_q, txt_q), dim=0)
    key = torch.cat((img_k, txt_k), dim=0)
    value = torch.cat((img_v, txt_v), dim=0)

    # Split by the explicit per-sequence lengths. Splitting by the single
    # chunk size seqlens[0] only gives the right pieces while the second
    # sequence is no longer than the first.
    querys = query.split(seqlens, dim=0)
    keys = key.split(seqlens, dim=0)
    values = value.split(seqlens, dim=0)

    # Sanity check that splitting recovers the original sequences.
    atol, rtol = 1e-5, 1e-8
    print(torch.allclose(querys[0], img_q, atol=atol, rtol=rtol))
    print(torch.allclose(querys[1], txt_q, atol=atol, rtol=rtol))

    print(f'query.shape: {query.shape}')
    print(f'key.shape: {key.shape}')
    print(f'value.shape: {value.shape}')

    print("===Standard===")
    sdpa_out = []
    for q, k, v in zip(querys, keys, values):
        print(f'q.shape: {q.shape}')
        print(f'k.shape: {k.shape}')
        print(f'v.shape: {v.shape}')
        # Add a batch dimension of 1: (seq, head, dim) -> (1, seq, head, dim).
        q = q.unsqueeze(0)
        k = k.unsqueeze(0)
        v = v.unsqueeze(0)
        print(f'q.shape1: {q.shape}')
        print(f'k.shape1: {k.shape}')
        print(f'v.shape1: {v.shape}')
        # Only the shape of this result is reported; the reference kept for
        # the comparison below is the SDPA output.
        out = flash_attn_func(q, k, v)
        print(f'flash_attn_func out.shape: {out.shape}')
        # SDPA is called with the head axis before the sequence axis.
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)
        print(f'q.shape2: {q.shape}')
        print(f'k.shape2: {k.shape}')
        print(f'v.shape2: {v.shape}')
        out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).contiguous()
        print(f'torch.sdpa out.shape: {out.shape}')
        sdpa_out.append(out)
        # NOTE(review): loop nesting of this print was reconstructed from a
        # paste that had lost its indentation - confirm.
        print(out.shape)
    print("====sdpa end=====\n")

    print("===Varlen===")
    seq_len = torch.tensor(seqlens, dtype=torch.int32).cuda()
    # NOTE: flash_attn_varlen_func needs cu_seqlens_q and cu_seqlens_k of
    # length (batch + 1): every sequence start offset plus the total length.
    prefill_start_pos = torch.cumsum(seq_len, dim=0, dtype=torch.int32) - seq_len
    prefill_start_pos = torch.cat(
        [prefill_start_pos, torch.tensor([torch.sum(seq_len)], dtype=torch.int32, device="cuda")],
        dim=0,
    )
    print(prefill_start_pos.shape)
    print(prefill_start_pos)
    print(f'query2.shape: {query.shape}')
    print(f'key2.shape: {key.shape}')
    print(f'value2.shape: {value.shape}')
    cu_seqlens_q = prefill_start_pos
    cu_seqlens_k = prefill_start_pos
    max_seqlen_q = max(seqlens)
    max_seqlen_k = max(seqlens)
    out = flash_attn_varlen_func(query, key, value, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k)
    print(f'flash_attn_varlen_func out.shape: {out.shape}')
    print("===Varlen end===\n")

    # Cut the packed output back into per-sequence pieces and compare each
    # piece with its SDPA reference.
    for varlen_out, reference, qlen in zip(out.split(seqlens, dim=0), sdpa_out, seqlens):
        print(f"====seqlen: {qlen}=====")
        print(f'varlen out.shape: {varlen_out.shape}')
        print(f'torch.sdpa out.shape: {reference.shape}')
        similarity = torch.cosine_similarity(
            varlen_out.to("cpu").ravel().double(),
            reference.to("cpu").ravel().double(),
            dim=0
        ).item()
        print(f'flash varlen vs torch.sdpa similarity: {similarity}')
# Script entry point: `name == "main"` would raise NameError; the standard
# guard compares the module's __name__ against "__main__".
if __name__ == "__main__":
    set_seeds(43)
    test_flash()
您好,我这边发现一个torch.sdpa和flash varlen这个接口的等价关系
其中 https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformers/transformer_hunyuan_video.py#L729,
这个mask应该改为如下这样,这样才可以保证diffusers的结果和HunyuanVideo原始的输出完全是一样的
for i in range(batch_size):
    valid_len = effective_sequence_length[i]
    # Positions before valid_len may attend to each other.
    attention_mask[i, :valid_len, :valid_len] = True
    # Positions from valid_len onward may attend to each other, so their
    # mask rows are not left entirely False.
    attention_mask[i, valid_len:, valid_len:] = True
attn_mask应该是类似这样:
###torch.sdpa和flash varlen等价验证如下:
import torch
import torch.nn.functional as F
import random
import numpy as np
def set_seeds(seed_list, device=None):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed_list: Either a single integer seed, or a list/tuple of integers
            whose sum is used as the seed.
        device: Not used by this function; kept so existing callers that
            pass it keep working.
    """
    # A list/tuple of seeds is reduced to one value by summing it.
    seed = sum(seed_list) if isinstance(seed_list, (tuple, list)) else seed_list
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed the CUDA generators as well.
    torch.cuda.manual_seed_all(seed)
def flash_attn_impl(q, k, v, batch_size, seq_len, num_heads, head_dim, img_len, txt_len, effective_condition_sequence_length, device):
    """Run attention through ``flash_attn_varlen_func`` with two segments per sample.

    Each sample is treated as two independent sequences: the first
    ``img_len + effective_condition_sequence_length`` tokens, and the
    remaining tokens up to ``img_len + txt_len``. Tokens only attend within
    their own segment.

    Args:
        q, k, v: Tensors that can be viewed as
            ``(batch_size * seq_len, num_heads, head_dim)``.
        batch_size: Number of samples.
        seq_len: Tokens per sample in the returned tensor; the segment
            boundaries assume it equals ``img_len + txt_len``.
        num_heads: Number of attention heads.
        head_dim: Size of each head.
        img_len: Number of image tokens per sample.
        txt_len: Number of text tokens per sample (padding included).
        effective_condition_sequence_length: Number of text tokens that are
            not padding; the same value is used for every sample.
        device: Device on which the cumulative-length tensor is created.

    Returns:
        Tensor of shape ``(batch_size, seq_len, num_heads * head_dim)``.
    """
    from flash_attn import flash_attn_varlen_func

    sample_len = img_len + txt_len
    valid_len = img_len + effective_condition_sequence_length

    # Cumulative segment boundaries in the flattened token axis. For a single
    # sample this is [0, valid_len, sample_len]; each further sample adds the
    # same two boundaries shifted by its start offset.
    boundaries = [0]
    for sample_idx in range(batch_size):
        start = sample_idx * sample_len
        boundaries.append(start + valid_len)
        boundaries.append(start + sample_len)
    cu_seqlens_q = torch.tensor(boundaries, device=device, dtype=torch.int32)
    # Self-attention: keys use the same segment boundaries as queries.
    cu_seqlens_k = cu_seqlens_q
    max_seqlen_q = sample_len
    max_seqlen_k = sample_len
    print(f'cu_seqlens_q: {cu_seqlens_q}')
    print(f'cu_seqlens_k: {cu_seqlens_k}')
    print(f'max_seqlen_q: {max_seqlen_q}')
    print(f'max_seqlen_k: {max_seqlen_k}')

    # Merge batch and sequence axes into one token axis.
    q = q.view(-1, num_heads, head_dim)
    k = k.view(-1, num_heads, head_dim)
    v = v.view(-1, num_heads, head_dim)
    print(f' flash attn q.shape: {q.shape}')
    print(f' flash attn k.shape: {k.shape}')
    print(f' flash attn v.shape: {v.shape}')

    x = flash_attn_varlen_func(
        q,
        k,
        v,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
        dropout_p=0.0,
    )
    # Restore the batch axis and merge the heads.
    x = x.view(batch_size, seq_len, num_heads * head_dim)
    print(f'flash varlen x.shape: {x.shape}')
    return x
def torch_impl(q, k, v, batch_size, seq_len, num_heads, head_dim, img_len, txt_len, effective_condition_sequence_length, device):
    """Run masked attention through ``F.scaled_dot_product_attention``.

    The boolean mask lets the first
    ``img_len + effective_condition_sequence_length`` tokens of each sample
    attend to each other, and lets the remaining tokens attend to each
    other; no attention crosses between the two groups.

    Args:
        q, k, v: Tensors that can be viewed as
            ``(batch_size, -1, num_heads, head_dim)``.
        batch_size: Number of samples.
        seq_len: Tokens per sample in the returned tensor.
        num_heads: Number of attention heads.
        head_dim: Size of each head.
        img_len: Number of image tokens per sample.
        txt_len: Number of text tokens per sample (padding included).
        effective_condition_sequence_length: Number of text tokens that are
            not padding; the same value is used for every sample.
        device: Device on which the mask is created.

    Returns:
        Tensor of shape ``(batch_size, seq_len, num_heads * head_dim)``.
    """
    # (batch, seq, heads, dim) -> (batch, heads, seq, dim)
    q = q.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
    k = k.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
    v = v.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)

    sequence_length = img_len + txt_len
    attn_mask = torch.zeros(
        batch_size, sequence_length, sequence_length, device=device, dtype=torch.bool
    )  # [batch_size, seq_len, seq_len]
    print(f'attn_mask: {attn_mask}, attn_mask.shape: {attn_mask.shape}')

    # One entry per sample; a single-element list would raise IndexError in
    # the loop below as soon as batch_size > 1.
    effective_sequence_length = [img_len + effective_condition_sequence_length] * batch_size
    for i, valid_len in enumerate(effective_sequence_length):
        attn_mask[i, :valid_len, :valid_len] = True
        attn_mask[i, valid_len:, valid_len:] = True
    print(f'attn_mask: {attn_mask}')

    # Insert a head axis so the mask broadcasts against
    # [batch, heads, seq, seq] for any batch size; without it the batch axis
    # of the mask would be lined up with the head axis of the scores.
    out = F.scaled_dot_product_attention(
        q, k, v, attn_mask=attn_mask.unsqueeze(1), dropout_p=0.0, is_causal=False
    )
    x = out.transpose(1, 2).contiguous().view(batch_size, seq_len, num_heads * head_dim)
    print(f'torch.sdpa x.shape: {x.shape}')
    return x
# Script entry point: `name == "main"` would raise NameError; the standard
# guard compares the module's __name__ against "__main__".
if __name__ == "__main__":
    set_seeds(43)

    device = 'cuda'
    dtype = torch.bfloat16

    num_heads = 16
    head_dim = 64

    img_len = 1080
    txt_len = 256
    batch_size = 1
    seq_len = img_len + txt_len
    # Only the first two text tokens count as real; the rest are padding.
    effective_condition_sequence_length = 2

    q = torch.randn([batch_size, seq_len, num_heads, head_dim], device=device, dtype=dtype)
    k = torch.randn([batch_size, seq_len, num_heads, head_dim], device=device, dtype=dtype)
    v = torch.randn([batch_size, seq_len, num_heads, head_dim], device=device, dtype=dtype)
    print(f'attn q.shape: {q.shape}')
    print(f'attn k.shape: {k.shape}')
    print(f'attn v.shape: {v.shape}')

    # Same inputs through both implementations.
    x1 = flash_attn_impl(q, k, v, batch_size, seq_len, num_heads, head_dim, img_len, txt_len, effective_condition_sequence_length, device)
    x2 = torch_impl(q, k, v, batch_size, seq_len, num_heads, head_dim, img_len, txt_len, effective_condition_sequence_length, device)
    print(f'flash attn x1: {x1}')
    print(f'torch.sdpa x2: {x2}')

    # The two outputs are compared by cosine similarity of the flattened
    # tensors, computed on the CPU in float64.
    similarity = torch.cosine_similarity(
        x1.to("cpu").ravel().double(),
        x2.to("cpu").ravel().double(),
        dim=0
    ).item()
    print(f'flash varlen vs torch.sdpa similarity: {similarity}')