
Commit 30db310

Merge remote-tracking branch 'origin/master'

2 parents: 6c5cbe5 + 206595f

104 files changed: 8026 additions, 2257 deletions

New file: GitHub Actions workflow (Execution Tests)

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+name: Execution Tests
+
+on:
+  push:
+    branches: [ main, master ]
+  pull_request:
+    branches: [ main, master ]
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+    runs-on: ${{ matrix.os }}
+    continue-on-error: true
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.12'
+    - name: Install requirements
+      run: |
+        python -m pip install --upgrade pip
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+        pip install -r requirements.txt
+        pip install -r tests-unit/requirements.txt
+    - name: Run Execution Tests
+      run: |
+        python -m pytest tests/execution -v --skip-timing-checks
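
The job matrix fans the same test run out to Ubuntu, Windows, and macOS, and continue-on-error: true keeps a failure on one platform from failing the whole run. To approximate the test step locally, a minimal sketch in Python (assumes the repo root as working directory and the requirements above already installed):

    # Sketch: run the same pytest invocation the workflow uses.
    import subprocess
    import sys

    subprocess.run(
        [sys.executable, "-m", "pytest", "tests/execution", "-v", "--skip-timing-checks"],
        check=True,  # raise CalledProcessError on test failure, mirroring a red CI job
    )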

README.md

Lines changed: 2 additions & 3 deletions

@@ -71,18 +71,17 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.github.io/ComfyUI_examples/).
 - [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
 - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
 - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
-- [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/)
 - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/)
 - Image Editing Models
 - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
 - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
 - [HiDream E1.1](https://comfyanonymous.github.io/ComfyUI_examples/hidream/#hidream-e11)
+- [Qwen Image Edit](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/#edit-model)
 - Video Models
 - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
 - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)
 - [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/)
 - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
-- [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/) and [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/)
 - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
 - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/)
 - Audio Models

@@ -197,7 +196,7 @@ comfy install

 ## Manual Install (Windows, Linux)

-python 3.13 is supported but using 3.12 is recommended because some custom nodes and their dependencies might not support it yet.
+Python 3.13 is very well supported. If you have trouble with some custom node dependencies you can try 3.12

 Git clone this repo.

app/user_manager.py

Lines changed: 10 additions & 3 deletions

@@ -363,10 +363,17 @@ async def post_userdata(request):
             if not overwrite and os.path.exists(path):
                 return web.Response(status=409, text="File already exists")

-            body = await request.read()
+            try:
+                body = await request.read()

-            with open(path, "wb") as f:
-                f.write(body)
+                with open(path, "wb") as f:
+                    f.write(body)
+            except OSError as e:
+                logging.warning(f"Error saving file '{path}': {e}")
+                return web.Response(
+                    status=400,
+                    reason="Invalid filename. Please avoid special characters like :\\/*?\"<>|"
+                )

             user_path = self.get_request_user_filepath(request, None)
             if full_info:
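
The change wraps the read-and-write in try/except so an OSError (typically a filename containing characters the OS rejects) surfaces as a 400 instead of an unhandled 500. A standalone sketch of the same pattern, assuming aiohttp; the route and handler names here are hypothetical:

    # Minimal sketch (not ComfyUI's handler): map OSError on write to HTTP 400.
    import logging
    from aiohttp import web

    async def save_upload(request: web.Request) -> web.Response:
        path = request.query.get("name", "upload.bin")  # hypothetical query parameter
        try:
            body = await request.read()
            with open(path, "wb") as f:
                f.write(body)
        except OSError as e:
            logging.warning(f"Error saving file '{path}': {e}")
            return web.Response(status=400, reason="Invalid filename")
        return web.Response(text=path)

    app = web.Application()
    app.add_routes([web.post("/upload", save_upload)])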
New file (audio encoder wrapper; filename not shown in this view)

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+from .wav2vec2 import Wav2Vec2Model
+import comfy.model_management
+import comfy.ops
+import comfy.utils
+import logging
+import torchaudio
+
+
+class AudioEncoderModel():
+    def __init__(self, config):
+        self.load_device = comfy.model_management.text_encoder_device()
+        offload_device = comfy.model_management.text_encoder_offload_device()
+        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
+        self.model = Wav2Vec2Model(dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast)
+        self.model.eval()
+        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+        self.model_sample_rate = 16000
+
+    def load_sd(self, sd):
+        return self.model.load_state_dict(sd, strict=False)
+
+    def get_sd(self):
+        return self.model.state_dict()
+
+    def encode_audio(self, audio, sample_rate):
+        comfy.model_management.load_model_gpu(self.patcher)
+        audio = torchaudio.functional.resample(audio, sample_rate, self.model_sample_rate)
+        out, all_layers = self.model(audio.to(self.load_device))
+        outputs = {}
+        outputs["encoded_audio"] = out
+        outputs["encoded_audio_all_layers"] = all_layers
+        return outputs
+
+
+def load_audio_encoder_from_sd(sd, prefix=""):
+    audio_encoder = AudioEncoderModel(None)
+    sd = comfy.utils.state_dict_prefix_replace(sd, {"wav2vec2.": ""})
+    m, u = audio_encoder.load_sd(sd)
+    if len(m) > 0:
+        logging.warning("missing audio encoder: {}".format(m))
+
+    return audio_encoder
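
The wrapper resamples incoming audio to the model's native 16 kHz and runs it through the ModelPatcher-managed Wav2Vec2 backbone. A hypothetical usage sketch; the import path and checkpoint filename are assumptions, and audio is assumed to arrive as [batch, channels, samples]:

    # Hypothetical usage sketch; flagged names are assumptions, not from the commit.
    import torch
    import comfy.utils
    from comfy.audio_encoders.audio_encoders import load_audio_encoder_from_sd  # assumed module path

    sd = comfy.utils.load_torch_file("wav2vec2.safetensors")  # assumed checkpoint file
    encoder = load_audio_encoder_from_sd(sd)

    waveform = torch.zeros((1, 2, 44100))  # one second of silent stereo at 44.1 kHz
    out = encoder.encode_audio(waveform, sample_rate=44100)
    print(out["encoded_audio"].shape)  # roughly one 1024-dim frame per 20 ms of audio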

comfy/audio_encoders/wav2vec2.py

Lines changed: 207 additions & 0 deletions

@@ -0,0 +1,207 @@
+import torch
+import torch.nn as nn
+from comfy.ldm.modules.attention import optimized_attention_masked
+
+
+class LayerNormConv(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride, bias=False, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.conv = operations.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=bias, device=device, dtype=dtype)
+        self.layer_norm = operations.LayerNorm(out_channels, elementwise_affine=True, device=device, dtype=dtype)
+
+    def forward(self, x):
+        x = self.conv(x)
+        return torch.nn.functional.gelu(self.layer_norm(x.transpose(-2, -1)).transpose(-2, -1))
+
+
+class ConvFeatureEncoder(nn.Module):
+    def __init__(self, conv_dim, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.conv_layers = nn.ModuleList([
+            LayerNormConv(1, conv_dim, kernel_size=10, stride=5, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+        ])
+
+    def forward(self, x):
+        x = x.unsqueeze(1)
+
+        for conv in self.conv_layers:
+            x = conv(x)
+
+        return x.transpose(1, 2)
+
+
+class FeatureProjection(nn.Module):
+    def __init__(self, conv_dim, embed_dim, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.layer_norm = operations.LayerNorm(conv_dim, eps=1e-05, device=device, dtype=dtype)
+        self.projection = operations.Linear(conv_dim, embed_dim, device=device, dtype=dtype)
+
+    def forward(self, x):
+        x = self.layer_norm(x)
+        x = self.projection(x)
+        return x
+
+
+class PositionalConvEmbedding(nn.Module):
+    def __init__(self, embed_dim=768, kernel_size=128, groups=16):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            embed_dim,
+            embed_dim,
+            kernel_size=kernel_size,
+            padding=kernel_size // 2,
+            groups=groups,
+        )
+        self.conv = torch.nn.utils.parametrizations.weight_norm(self.conv, name="weight", dim=2)
+        self.activation = nn.GELU()
+
+    def forward(self, x):
+        x = x.transpose(1, 2)
+        x = self.conv(x)[:, :, :-1]
+        x = self.activation(x)
+        x = x.transpose(1, 2)
+        return x
+
+
+class TransformerEncoder(nn.Module):
+    def __init__(
+        self,
+        embed_dim=768,
+        num_heads=12,
+        num_layers=12,
+        mlp_ratio=4.0,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+
+        self.pos_conv_embed = PositionalConvEmbedding(embed_dim=embed_dim)
+        self.layers = nn.ModuleList([
+            TransformerEncoderLayer(
+                embed_dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                device=device, dtype=dtype, operations=operations
+            )
+            for _ in range(num_layers)
+        ])
+
+        self.layer_norm = operations.LayerNorm(embed_dim, eps=1e-05, device=device, dtype=dtype)
+
+    def forward(self, x, mask=None):
+        x = x + self.pos_conv_embed(x)
+        all_x = ()
+        for layer in self.layers:
+            all_x += (x,)
+            x = layer(x, mask)
+        x = self.layer_norm(x)
+        all_x += (x,)
+        return x, all_x
+
+
+class Attention(nn.Module):
+    def __init__(self, embed_dim, num_heads, bias=True, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+
+        self.k_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+        self.v_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+        self.q_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+        self.out_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+
+    def forward(self, x, mask=None):
+        assert (mask is None)  # TODO?
+        q = self.q_proj(x)
+        k = self.k_proj(x)
+        v = self.v_proj(x)
+
+        out = optimized_attention_masked(q, k, v, self.num_heads)
+        return self.out_proj(out)
+
+
+class FeedForward(nn.Module):
+    def __init__(self, embed_dim, mlp_ratio, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.intermediate_dense = operations.Linear(embed_dim, int(embed_dim * mlp_ratio), device=device, dtype=dtype)
+        self.output_dense = operations.Linear(int(embed_dim * mlp_ratio), embed_dim, device=device, dtype=dtype)
+
+    def forward(self, x):
+        x = self.intermediate_dense(x)
+        x = torch.nn.functional.gelu(x)
+        x = self.output_dense(x)
+        return x
+
+
+class TransformerEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        embed_dim=768,
+        num_heads=12,
+        mlp_ratio=4.0,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+
+        self.attention = Attention(embed_dim, num_heads, device=device, dtype=dtype, operations=operations)
+
+        self.layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype)
+        self.feed_forward = FeedForward(embed_dim, mlp_ratio, device=device, dtype=dtype, operations=operations)
+        self.final_layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype)
+
+    def forward(self, x, mask=None):
+        residual = x
+        x = self.layer_norm(x)
+        x = self.attention(x, mask=mask)
+        x = residual + x
+
+        x = x + self.feed_forward(self.final_layer_norm(x))
+        return x
+
+
+class Wav2Vec2Model(nn.Module):
+    """Complete Wav2Vec 2.0 model."""
+
+    def __init__(
+        self,
+        embed_dim=1024,
+        final_dim=256,
+        num_heads=16,
+        num_layers=24,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+
+        conv_dim = 512
+        self.feature_extractor = ConvFeatureEncoder(conv_dim, device=device, dtype=dtype, operations=operations)
+        self.feature_projection = FeatureProjection(conv_dim, embed_dim, device=device, dtype=dtype, operations=operations)
+
+        self.masked_spec_embed = nn.Parameter(torch.empty(embed_dim, device=device, dtype=dtype))
+
+        self.encoder = TransformerEncoder(
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            num_layers=num_layers,
+            device=device, dtype=dtype, operations=operations
+        )
+
+    def forward(self, x, mask_time_indices=None, return_dict=False):
+
+        x = torch.mean(x, dim=1)
+
+        x = (x - x.mean()) / torch.sqrt(x.var() + 1e-7)
+
+        features = self.feature_extractor(x)
+        features = self.feature_projection(features)
+
+        batch_size, seq_len, _ = features.shape
+
+        x, all_x = self.encoder(features)
+
+        return x, all_x
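
The seven convolutions use kernel/stride pairs (10, 5), then four of (3, 2), then two of (2, 2), so the feature encoder emits one 512-dim frame per 320 input samples, i.e. every 20 ms at the 16 kHz model rate. A quick sanity-check sketch of the resulting sequence length:

    # Sketch: sequence length after the ConvFeatureEncoder stack (no padding).
    def conv_out_len(n_samples,
                     kernels=(10, 3, 3, 3, 3, 2, 2),
                     strides=(5, 2, 2, 2, 2, 2, 2)):
        for k, s in zip(kernels, strides):
            n_samples = (n_samples - k) // s + 1
        return n_samples

    print(conv_out_len(16000))  # 49 frames for one second of 16 kHz audio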

comfy/cli_args.py

Lines changed: 2 additions & 1 deletion

@@ -143,8 +143,9 @@ class PerformanceFeature(enum.Enum):
     Fp16Accumulation = "fp16_accumulation"
     Fp8MatrixMultiplication = "fp8_matrix_mult"
     CublasOps = "cublas_ops"
+    AutoTune = "autotune"

-parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
+parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))

 parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
 parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.")

comfy/clip_model.py

Lines changed: 11 additions & 1 deletion

@@ -61,15 +61,25 @@ def __init__(self, num_layers, embed_dim, heads, intermediate_size, intermediate
     def forward(self, x, mask=None, intermediate_output=None):
         optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)

+        all_intermediate = None
         if intermediate_output is not None:
-            if intermediate_output < 0:
+            if intermediate_output == "all":
+                all_intermediate = []
+                intermediate_output = None
+            elif intermediate_output < 0:
                 intermediate_output = len(self.layers) + intermediate_output

         intermediate = None
         for i, l in enumerate(self.layers):
             x = l(x, mask, optimized_attention)
             if i == intermediate_output:
                 intermediate = x.clone()
+            if all_intermediate is not None:
+                all_intermediate.append(x.unsqueeze(1).clone())
+
+        if all_intermediate is not None:
+            intermediate = torch.cat(all_intermediate, dim=1)
+
         return x, intermediate

 class CLIPEmbeddings(torch.nn.Module):
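
With intermediate_output="all", the encoder collects every layer's output (unsqueezed on dim 1) and concatenates them, so intermediate carries all hidden states at once. A sketch of the resulting shape, assuming [batch, seq, dim] hidden states:

    # Sketch of the "all" path: stack per-layer hidden states along dim 1.
    import torch

    layers_out = [torch.randn(2, 77, 768) for _ in range(12)]  # dummy per-layer states
    intermediate = torch.cat([h.unsqueeze(1) for h in layers_out], dim=1)
    print(intermediate.shape)  # torch.Size([2, 12, 77, 768]) -> [batch, layers, seq, dim]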
