Skip to content

Race condition on fetching the same url #34

@zsol

Description

@zsol

When calling Cache.async_fetch with the same url multiple times concurrently, the coroutines step on each other while writing to a temporary file. The symptom is a FileNotFoundError thrown during os.replace():

    |     os.replace(tmp, output_file)
    | FileNotFoundError: [Errno 2] No such file or directory: ...

Looks like this happens because the temp file gets shared between coroutines.

I think there needs to be either a locking mechanism in there, or a safer way to make temporary filenames.

Repro script:

import asyncio
import logging
from pathlib import Path

from honesty.cache import Cache


async def main() -> None:
    """
    Torture-test honesty's cache: download the same wheel ten times at once.

    Launches ten concurrent `fetch_wheel` tasks against a single shared
    Cache instance and reports whether every fetch produced a path.
    """
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # libcst serves as the example package to fetch.
    package_name = "libcst"
    wheel_url = "https://files.pythonhosted.org/packages/b7/31/39c110eb66d5fd7cc4891cf55192a358a6be8b8f6ac0e2eb709850104456/libcst-1.8.0-cp313-cp313t-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl"

    logger.info(f"Starting parallel fetch of {package_name} wheel from {wheel_url}")

    # A single cache is deliberately shared across all tasks — that is
    # exactly the condition that triggers the reported race.
    async with Cache(
        fresh_index=True,
    ) as cache:
        # TaskGroup runs all ten fetches concurrently and waits for them.
        async with asyncio.TaskGroup() as tg:
            tasks = [
                tg.create_task(fetch_wheel(cache, package_name, wheel_url, i))
                for i in range(10)
            ]

        # Gather each task's result once the group has finished.
        results = [t.result() for t in tasks]

        # Every task should have produced a non-None path.
        successful = all(r is not None for r in results)
        logger.info(f"All fetches {'successful' if successful else 'failed'}")


async def fetch_wheel(
    cache: Cache, package_name: str, wheel_url: str, index: int
) -> Path:
    """Download one wheel via the shared cache and return its local path.

    Logs start/completion under this task's *index*; on failure, logs the
    error and re-raises so the caller's TaskGroup sees the exception.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Starting fetch {index} for {package_name}")

    try:
        wheel_file = await cache.async_fetch(package_name, wheel_url)
        logger.info(f"Fetch {index} completed: {wheel_file}")
    except Exception as e:
        # Surface which of the concurrent fetches broke, then propagate.
        logger.error(f"Fetch {index} failed: {e}")
        raise
    return wheel_file


# Script entry point: run the async torture test under a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions