
Commit 30db310

Merge remote-tracking branch 'origin/master'

2 parents: 6c5cbe5 + 206595f

104 files changed: 8026 additions, 2257 deletions

New file: GitHub Actions workflow (Execution Tests)

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+name: Execution Tests
+
+on:
+  push:
+    branches: [ main, master ]
+  pull_request:
+    branches: [ main, master ]
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+    runs-on: ${{ matrix.os }}
+    continue-on-error: true
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.12'
+    - name: Install requirements
+      run: |
+        python -m pip install --upgrade pip
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+        pip install -r requirements.txt
+        pip install -r tests-unit/requirements.txt
+    - name: Run Execution Tests
+      run: |
+        python -m pytest tests/execution -v --skip-timing-checks
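
The job matrix fans the same test run out to Ubuntu, Windows, and macOS, and continue-on-error: true keeps a failure on one platform from failing the whole run. To approximate the test step locally, a minimal sketch in Python (assumes the repo root as working directory and the requirements above already installed):

    # Sketch: run the same pytest invocation the workflow uses.
    import subprocess
    import sys

    subprocess.run(
        [sys.executable, "-m", "pytest", "tests/execution", "-v", "--skip-timing-checks"],
        check=True,  # raise CalledProcessError on test failure, mirroring a red CI job
    )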

README.md

Lines changed: 2 additions & 3 deletions

@@ -71,18 +71,17 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.github.io/ComfyUI_examples/).
 - [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
 - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
 - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
-- [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/)
 - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/)
 - Image Editing Models
 - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
 - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
 - [HiDream E1.1](https://comfyanonymous.github.io/ComfyUI_examples/hidream/#hidream-e11)
+- [Qwen Image Edit](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/#edit-model)
 - Video Models
 - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
 - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)
 - [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/)
 - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
-- [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/) and [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/)
 - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
 - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/)
 - Audio Models

@@ -197,7 +196,7 @@ comfy install

 ## Manual Install (Windows, Linux)

-python 3.13 is supported but using 3.12 is recommended because some custom nodes and their dependencies might not support it yet.
+Python 3.13 is very well supported. If you have trouble with some custom node dependencies you can try 3.12

 Git clone this repo.

app/user_manager.py

Lines changed: 10 additions & 3 deletions

@@ -363,10 +363,17 @@ async def post_userdata(request):
             if not overwrite and os.path.exists(path):
                 return web.Response(status=409, text="File already exists")

-            body = await request.read()
+            try:
+                body = await request.read()

-            with open(path, "wb") as f:
-                f.write(body)
+                with open(path, "wb") as f:
+                    f.write(body)
+            except OSError as e:
+                logging.warning(f"Error saving file '{path}': {e}")
+                return web.Response(
+                    status=400,
+                    reason="Invalid filename. Please avoid special characters like :\\/*?\"<>|"
+                )

             user_path = self.get_request_user_filepath(request, None)
             if full_info:
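
The change wraps the read-and-write in try/except so an OSError (typically a filename containing characters the OS rejects) surfaces as a 400 instead of an unhandled 500. A standalone sketch of the same pattern, assuming aiohttp; the route and handler names here are hypothetical:

    # Minimal sketch (not ComfyUI's handler): map OSError on write to HTTP 400.
    import logging
    from aiohttp import web

    async def save_upload(request: web.Request) -> web.Response:
        path = request.query.get("name", "upload.bin")  # hypothetical query parameter
        try:
            body = await request.read()
            with open(path, "wb") as f:
                f.write(body)
        except OSError as e:
            logging.warning(f"Error saving file '{path}': {e}")
            return web.Response(status=400, reason="Invalid filename")
        return web.Response(text=path)

    app = web.Application()
    app.add_routes([web.post("/upload", save_upload)])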
New file (audio encoder wrapper; filename not shown in this view)

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+from .wav2vec2 import Wav2Vec2Model
+import comfy.model_management
+import comfy.ops
+import comfy.utils
+import logging
+import torchaudio
+
+
+class AudioEncoderModel():
+    def __init__(self, config):
+        self.load_device = comfy.model_management.text_encoder_device()
+        offload_device = comfy.model_management.text_encoder_offload_device()
+        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
+        self.model = Wav2Vec2Model(dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast)
+        self.model.eval()
+        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+        self.model_sample_rate = 16000
+
+    def load_sd(self, sd):
+        return self.model.load_state_dict(sd, strict=False)
+
+    def get_sd(self):
+        return self.model.state_dict()
+
+    def encode_audio(self, audio, sample_rate):
+        comfy.model_management.load_model_gpu(self.patcher)
+        audio = torchaudio.functional.resample(audio, sample_rate, self.model_sample_rate)
+        out, all_layers = self.model(audio.to(self.load_device))
+        outputs = {}
+        outputs["encoded_audio"] = out
+        outputs["encoded_audio_all_layers"] = all_layers
+        return outputs
+
+
+def load_audio_encoder_from_sd(sd, prefix=""):
+    audio_encoder = AudioEncoderModel(None)
+    sd = comfy.utils.state_dict_prefix_replace(sd, {"wav2vec2.": ""})
+    m, u = audio_encoder.load_sd(sd)
+    if len(m) > 0:
+        logging.warning("missing audio encoder: {}".format(m))
+
+    return audio_encoder
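
The wrapper resamples incoming audio to the model's native 16 kHz and runs it through the ModelPatcher-managed Wav2Vec2 backbone. A hypothetical usage sketch; the import path and checkpoint filename are assumptions, and audio is assumed to arrive as [batch, channels, samples]:

    # Hypothetical usage sketch; flagged names are assumptions, not from the commit.
    import torch
    import comfy.utils
    from comfy.audio_encoders.audio_encoders import load_audio_encoder_from_sd  # assumed module path

    sd = comfy.utils.load_torch_file("wav2vec2.safetensors")  # assumed checkpoint file
    encoder = load_audio_encoder_from_sd(sd)

    waveform = torch.zeros((1, 2, 44100))  # one second of silent stereo at 44.1 kHz
    out = encoder.encode_audio(waveform, sample_rate=44100)
    print(out["encoded_audio"].shape)  # roughly one 1024-dim frame per 20 ms of audio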

comfy/audio_encoders/wav2vec2.py

Lines changed: 207 additions & 0 deletions

@@ -0,0 +1,207 @@
+import torch
+import torch.nn as nn
+from comfy.ldm.modules.attention import optimized_attention_masked
+
+
+class LayerNormConv(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride, bias=False, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.conv = operations.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=bias, device=device, dtype=dtype)
+        self.layer_norm = operations.LayerNorm(out_channels, elementwise_affine=True, device=device, dtype=dtype)
+
+    def forward(self, x):
+        x = self.conv(x)
+        return torch.nn.functional.gelu(self.layer_norm(x.transpose(-2, -1)).transpose(-2, -1))
+
+
+class ConvFeatureEncoder(nn.Module):
+    def __init__(self, conv_dim, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.conv_layers = nn.ModuleList([
+            LayerNormConv(1, conv_dim, kernel_size=10, stride=5, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+        ])
+
+    def forward(self, x):
+        x = x.unsqueeze(1)
+
+        for conv in self.conv_layers:
+            x = conv(x)
+
+        return x.transpose(1, 2)
+
+
+class FeatureProjection(nn.Module):
+    def __init__(self, conv_dim, embed_dim, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.layer_norm = operations.LayerNorm(conv_dim, eps=1e-05, device=device, dtype=dtype)
+        self.projection = operations.Linear(conv_dim, embed_dim, device=device, dtype=dtype)
+
+    def forward(self, x):
+        x = self.layer_norm(x)
+        x = self.projection(x)
+        return x
+
+
+class PositionalConvEmbedding(nn.Module):
+    def __init__(self, embed_dim=768, kernel_size=128, groups=16):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            embed_dim,
+            embed_dim,
+            kernel_size=kernel_size,
+            padding=kernel_size // 2,
+            groups=groups,
+        )
+        self.conv = torch.nn.utils.parametrizations.weight_norm(self.conv, name="weight", dim=2)
+        self.activation = nn.GELU()
+
+    def forward(self, x):
+        x = x.transpose(1, 2)
+        x = self.conv(x)[:, :, :-1]
+        x = self.activation(x)
+        x = x.transpose(1, 2)
+        return x
+
+
+class TransformerEncoder(nn.Module):
+    def __init__(
+        self,
+        embed_dim=768,
+        num_heads=12,
+        num_layers=12,
+        mlp_ratio=4.0,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+
+        self.pos_conv_embed = PositionalConvEmbedding(embed_dim=embed_dim)
+        self.layers = nn.ModuleList([
+            TransformerEncoderLayer(
+                embed_dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                device=device, dtype=dtype, operations=operations
+            )
+            for _ in range(num_layers)
+        ])
+
+        self.layer_norm = operations.LayerNorm(embed_dim, eps=1e-05, device=device, dtype=dtype)
+
+    def forward(self, x, mask=None):
+        x = x + self.pos_conv_embed(x)
+        all_x = ()
+        for layer in self.layers:
+            all_x += (x,)
+            x = layer(x, mask)
+        x = self.layer_norm(x)
+        all_x += (x,)
+        return x, all_x
+
+
+class Attention(nn.Module):
+    def __init__(self, embed_dim, num_heads, bias=True, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+
+        self.k_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+        self.v_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+        self.q_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+        self.out_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+
+    def forward(self, x, mask=None):
+        assert (mask is None)  # TODO?
+        q = self.q_proj(x)
+        k = self.k_proj(x)
+        v = self.v_proj(x)
+
+        out = optimized_attention_masked(q, k, v, self.num_heads)
+        return self.out_proj(out)
+
+
+class FeedForward(nn.Module):
+    def __init__(self, embed_dim, mlp_ratio, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.intermediate_dense = operations.Linear(embed_dim, int(embed_dim * mlp_ratio), device=device, dtype=dtype)
+        self.output_dense = operations.Linear(int(embed_dim * mlp_ratio), embed_dim, device=device, dtype=dtype)
+
+    def forward(self, x):
+        x = self.intermediate_dense(x)
+        x = torch.nn.functional.gelu(x)
+        x = self.output_dense(x)
+        return x
+
+
+class TransformerEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        embed_dim=768,
+        num_heads=12,
+        mlp_ratio=4.0,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+
+        self.attention = Attention(embed_dim, num_heads, device=device, dtype=dtype, operations=operations)
+
+        self.layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype)
+        self.feed_forward = FeedForward(embed_dim, mlp_ratio, device=device, dtype=dtype, operations=operations)
+        self.final_layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype)
+
+    def forward(self, x, mask=None):
+        residual = x
+        x = self.layer_norm(x)
+        x = self.attention(x, mask=mask)
+        x = residual + x
+
+        x = x + self.feed_forward(self.final_layer_norm(x))
+        return x
+
+
+class Wav2Vec2Model(nn.Module):
+    """Complete Wav2Vec 2.0 model."""
+
+    def __init__(
+        self,
+        embed_dim=1024,
+        final_dim=256,
+        num_heads=16,
+        num_layers=24,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+
+        conv_dim = 512
+        self.feature_extractor = ConvFeatureEncoder(conv_dim, device=device, dtype=dtype, operations=operations)
+        self.feature_projection = FeatureProjection(conv_dim, embed_dim, device=device, dtype=dtype, operations=operations)
+
+        self.masked_spec_embed = nn.Parameter(torch.empty(embed_dim, device=device, dtype=dtype))
+
+        self.encoder = TransformerEncoder(
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            num_layers=num_layers,
+            device=device, dtype=dtype, operations=operations
+        )
+
+    def forward(self, x, mask_time_indices=None, return_dict=False):
+
+        x = torch.mean(x, dim=1)
+
+        x = (x - x.mean()) / torch.sqrt(x.var() + 1e-7)
+
+        features = self.feature_extractor(x)
+        features = self.feature_projection(features)
+
+        batch_size, seq_len, _ = features.shape
+
+        x, all_x = self.encoder(features)
+
+        return x, all_x
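
The seven convolutions use kernel/stride pairs (10, 5), then four of (3, 2), then two of (2, 2), so the feature encoder emits one 512-dim frame per 320 input samples, i.e. every 20 ms at the 16 kHz model rate. A quick sanity-check sketch of the resulting sequence length:

    # Sketch: sequence length after the ConvFeatureEncoder stack (no padding).
    def conv_out_len(n_samples,
                     kernels=(10, 3, 3, 3, 3, 2, 2),
                     strides=(5, 2, 2, 2, 2, 2, 2)):
        for k, s in zip(kernels, strides):
            n_samples = (n_samples - k) // s + 1
        return n_samples

    print(conv_out_len(16000))  # 49 frames for one second of 16 kHz audio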

comfy/cli_args.py

Lines changed: 2 additions & 1 deletion

@@ -143,8 +143,9 @@ class PerformanceFeature(enum.Enum):
     Fp16Accumulation = "fp16_accumulation"
     Fp8MatrixMultiplication = "fp8_matrix_mult"
     CublasOps = "cublas_ops"
+    AutoTune = "autotune"

-parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
+parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))

 parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
 parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.")

comfy/clip_model.py

Lines changed: 11 additions & 1 deletion

@@ -61,15 +61,25 @@ def __init__(self, num_layers, embed_dim, heads, intermediate_size, intermediate
     def forward(self, x, mask=None, intermediate_output=None):
         optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)

+        all_intermediate = None
         if intermediate_output is not None:
-            if intermediate_output < 0:
+            if intermediate_output == "all":
+                all_intermediate = []
+                intermediate_output = None
+            elif intermediate_output < 0:
                 intermediate_output = len(self.layers) + intermediate_output

         intermediate = None
         for i, l in enumerate(self.layers):
             x = l(x, mask, optimized_attention)
             if i == intermediate_output:
                 intermediate = x.clone()
+            if all_intermediate is not None:
+                all_intermediate.append(x.unsqueeze(1).clone())
+
+        if all_intermediate is not None:
+            intermediate = torch.cat(all_intermediate, dim=1)
+
         return x, intermediate

 class CLIPEmbeddings(torch.nn.Module):
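
With intermediate_output="all", the encoder collects every layer's output (unsqueezed on dim 1) and concatenates them, so intermediate carries all hidden states at once. A sketch of the resulting shape, assuming [batch, seq, dim] hidden states:

    # Sketch of the "all" path: stack per-layer hidden states along dim 1.
    import torch

    layers_out = [torch.randn(2, 77, 768) for _ in range(12)]  # dummy per-layer states
    intermediate = torch.cat([h.unsqueeze(1) for h in layers_out], dim=1)
    print(intermediate.shape)  # torch.Size([2, 12, 77, 768]) -> [batch, layers, seq, dim]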
