
Commit 77fb7d9

Logging improvements (#2030)

* Turn down blob/cosmos exception reporting to match file storage
* Restore indexing-engine.log
* Restore some basic console logging and progress for index CLI
* Semver
* Ignore small ruff complaints
* Fix CLI console printing
1 parent 469ee85 commit 77fb7d9

File tree

12 files changed: +136, -270 lines

Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Improve upon recent logging refactor"
+}

graphrag/__init__.py

Lines changed: 0 additions & 7 deletions
@@ -2,10 +2,3 @@
 # Licensed under the MIT License
 
 """The GraphRAG package."""
-
-import logging
-
-from graphrag.logger.standard_logging import init_console_logger
-
-logger = logging.getLogger(__name__)
-init_console_logger()
graphrag/callbacks/console_workflow_callbacks.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""A logger that emits updates from the indexing engine to the console."""
+
+from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks
+from graphrag.index.typing.pipeline_run_result import PipelineRunResult
+from graphrag.logger.progress import Progress
+
+# ruff: noqa: T201
+
+
+class ConsoleWorkflowCallbacks(NoopWorkflowCallbacks):
+    """A logger that writes to a console."""
+
+    _verbose = False
+
+    def __init__(self, verbose=False):
+        self._verbose = verbose
+
+    def pipeline_start(self, names: list[str]) -> None:
+        """Execute this callback to signal when the entire pipeline starts."""
+        print("Starting pipeline with workflows:", ", ".join(names))
+
+    def pipeline_end(self, results: list[PipelineRunResult]) -> None:
+        """Execute this callback to signal when the entire pipeline ends."""
+        print("Pipeline complete")
+
+    def workflow_start(self, name: str, instance: object) -> None:
+        """Execute this callback when a workflow starts."""
+        print(f"Starting workflow: {name}")
+
+    def workflow_end(self, name: str, instance: object) -> None:
+        """Execute this callback when a workflow ends."""
+        print("")  # account for potential return on prior progress
+        print(f"Workflow complete: {name}")
+        if self._verbose:
+            print(instance)
+
+    def progress(self, progress: Progress) -> None:
+        """Handle when progress occurs."""
+        complete = progress.completed_items or 0
+        total = progress.total_items or 1
+        percent = round((complete / total) * 100)
+        start = f" {complete} / {total} "
+        print(f"{start:{'.'}<{percent}}", flush=True, end="\r")

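The progress renderer above leans on Python's format-spec fill syntax: the " completed / total " label is left-aligned and padded with dots out to `percent` columns, and the trailing carriage return lets each call overwrite the previous line. A minimal standalone sketch of that behavior (render_progress is an illustrative stand-in, not part of this commit):

import time

def render_progress(completed: int, total: int) -> None:
    # Pad the label with dots out to `percent` columns, then return the
    # cursor to the start of the line so the next call overwrites it.
    percent = round((completed / max(total, 1)) * 100)
    start = f" {completed} / {total} "
    print(f"{start:{'.'}<{percent}}", flush=True, end="\r")

for i in range(101):
    render_progress(i, 100)
    time.sleep(0.01)
print()  # move past the carriage-returned line once finished
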
graphrag/cli/index.py

Lines changed: 5 additions & 3 deletions
@@ -10,9 +10,11 @@
 from pathlib import Path
 
 import graphrag.api as api
+from graphrag.callbacks.console_workflow_callbacks import ConsoleWorkflowCallbacks
 from graphrag.config.enums import CacheType, IndexingMethod, ReportingType
 from graphrag.config.load_config import load_config
 from graphrag.index.validate_config import validate_config_names
+from graphrag.logger.standard_logging import DEFAULT_LOG_FILENAME
 from graphrag.utils.cli import redact
 
 # Ignore warnings from numba
@@ -115,7 +117,6 @@ def _run_index(
     # Initialize loggers and reporting config
     init_loggers(
         config=config,
-        root_dir=str(config.root_dir) if config.root_dir else None,
         verbose=verbose,
     )
 
@@ -124,8 +125,8 @@
 
     # Log the configuration details
     if config.reporting.type == ReportingType.file:
-        log_dir = Path(config.root_dir or "") / (config.reporting.base_dir or "")
-        log_path = log_dir / "logs.txt"
+        log_dir = Path(config.root_dir) / config.reporting.base_dir
+        log_path = log_dir / DEFAULT_LOG_FILENAME
         logger.info("Logging enabled at %s", log_path)
     else:
         logger.info(
@@ -154,6 +155,7 @@
             method=method,
             is_update_run=is_update_run,
             memory_profile=memprofile,
+            callbacks=[ConsoleWorkflowCallbacks(verbose=verbose)],
         )
     )
     encountered_errors = any(

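Library consumers can presumably attach the same console reporting when driving the indexing API directly, mirroring the CLI change above. A hedged sketch, assuming api.build_index accepts a callbacks list as the diff suggests, and that load_config resolves a settings file under the given root:

import asyncio
from pathlib import Path

import graphrag.api as api
from graphrag.callbacks.console_workflow_callbacks import ConsoleWorkflowCallbacks
from graphrag.config.load_config import load_config

# "./ragtest" is a hypothetical project root containing a settings file.
config = load_config(Path("./ragtest"))
results = asyncio.run(
    api.build_index(
        config=config,
        callbacks=[ConsoleWorkflowCallbacks(verbose=False)],  # assumed kwarg, per the diff above
    )
)
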
graphrag/cli/prompt_tune.py

Lines changed: 3 additions & 3 deletions
@@ -9,6 +9,7 @@
 import graphrag.api as api
 from graphrag.config.enums import ReportingType
 from graphrag.config.load_config import load_config
+from graphrag.logger.standard_logging import DEFAULT_LOG_FILENAME
 from graphrag.prompt_tune.generator.community_report_summarization import (
     COMMUNITY_SUMMARIZATION_FILENAME,
 )
@@ -75,14 +76,13 @@ async def prompt_tune(
     # initialize loggers with config
     init_loggers(
         config=graph_config,
-        root_dir=str(root_path),
         verbose=verbose,
     )
 
     # log the configuration details
     if graph_config.reporting.type == ReportingType.file:
-        log_dir = Path(root_path) / (graph_config.reporting.base_dir or "")
-        log_path = log_dir / "logs.txt"
+        log_dir = Path(root_path) / graph_config.reporting.base_dir
+        log_path = log_dir / DEFAULT_LOG_FILENAME
         logger.info("Logging enabled at %s", log_path)
     else:
         logger.info(

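Both CLIs now derive the log path as root / reporting.base_dir / DEFAULT_LOG_FILENAME instead of hardcoding "logs.txt". A minimal sketch of that derivation; the constant's value here is an assumption based on the commit message's "Restore indexing-engine.log":

from pathlib import Path

DEFAULT_LOG_FILENAME = "indexing-engine.log"  # assumed value, per the commit message

root_dir = Path("./ragtest")  # hypothetical project root
reporting_base_dir = "logs"

log_path = root_dir / reporting_base_dir / DEFAULT_LOG_FILENAME
print(log_path)  # ragtest/logs/indexing-engine.log
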
graphrag/cli/query.py

Lines changed: 21 additions & 67 deletions
@@ -4,7 +4,6 @@
 """CLI implementation of the query subcommand."""
 
 import asyncio
-import logging
 import sys
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
@@ -19,8 +18,7 @@
 if TYPE_CHECKING:
     import pandas as pd
 
-# Initialize standard logger
-logger = logging.getLogger(__name__)
+# ruff: noqa: T201
 
 
 def run_global_search(
@@ -61,10 +59,6 @@ def run_global_search(
         final_community_reports_list = dataframe_dict["community_reports"]
         index_names = dataframe_dict["index_names"]
 
-        logger.info(
-            "Running multi-index global search on indexes: %s",
-            dataframe_dict["index_names"],
-        )
         response, context_data = asyncio.run(
             api.multi_index_global_search(
                 config=config,
@@ -80,11 +74,7 @@
                 verbose=verbose,
             )
         )
-        # log the full response at INFO level for user visibility but at DEBUG level in the API layer
-        logger.info("Query Response:\n%s", response)
-
-        # NOTE: we return the response and context data here purely as a complete demonstration of the API.
-        # External users should use the API directly to get the response and context data.
+        print(response)
         return response, context_data
 
     # Otherwise, call the Single-Index Global Search API
@@ -118,9 +108,9 @@ def on_context(context: Any) -> None:
                 verbose=verbose,
             ):
                 full_response += stream_chunk
-                print(stream_chunk, end="")  # noqa: T201
-                sys.stdout.flush()  # flush output buffer to display text immediately
-            print()  # noqa: T201
+                print(stream_chunk, end="")
+                sys.stdout.flush()
+            print()
             return full_response, context_data
 
         return asyncio.run(run_streaming_search())
@@ -138,11 +128,8 @@ def on_context(context: Any) -> None:
             verbose=verbose,
         )
     )
-    # log the full response at INFO level for user visibility but at DEBUG level in the API layer
-    logger.info("Global Search Response:\n%s", response)
+    print(response)
 
-    # NOTE: we return the response and context data here purely as a complete demonstration of the API.
-    # External users should use the API directly to get the response and context data.
     return response, context_data
 
 
@@ -188,11 +175,6 @@ def run_local_search(
         final_relationships_list = dataframe_dict["relationships"]
         index_names = dataframe_dict["index_names"]
 
-        logger.info(
-            "Running multi-index local search on indexes: %s",
-            dataframe_dict["index_names"],
-        )
-
         # If any covariates tables are missing from any index, set the covariates list to None
         if len(dataframe_dict["covariates"]) != dataframe_dict["num_indexes"]:
             final_covariates_list = None
@@ -216,11 +198,8 @@
                 verbose=verbose,
             )
         )
-        # log the full response at INFO level for user visibility but at DEBUG level in the API layer
-        logger.info("Local Search Response:\n%s", response)
+        print(response)
 
-        # NOTE: we return the response and context data here purely as a complete demonstration of the API.
-        # External users should use the API directly to get the response and context data.
         return response, context_data
 
     # Otherwise, call the Single-Index Local Search API
@@ -259,9 +238,9 @@ def on_context(context: Any) -> None:
                 verbose=verbose,
             ):
                 full_response += stream_chunk
-                print(stream_chunk, end="")  # noqa: T201
-                sys.stdout.flush()  # flush output buffer to display text immediately
-            print()  # noqa: T201
+                print(stream_chunk, end="")
+                sys.stdout.flush()
+            print()
             return full_response, context_data
 
         return asyncio.run(run_streaming_search())
@@ -281,11 +260,8 @@ def on_context(context: Any) -> None:
            verbose=verbose,
        )
    )
-    # log the full response at INFO level for user visibility but at DEBUG level in the API layer
-    logger.info("Local Search Response:\n%s", response)
+    print(response)
 
-    # NOTE: we return the response and context data here purely as a complete demonstration of the API.
-    # External users should use the API directly to get the response and context data.
     return response, context_data
 
 
@@ -329,11 +305,6 @@
         final_relationships_list = dataframe_dict["relationships"]
         index_names = dataframe_dict["index_names"]
 
-        logger.info(
-            "Running multi-index drift search on indexes: %s",
-            dataframe_dict["index_names"],
-        )
-
         response, context_data = asyncio.run(
             api.multi_index_drift_search(
                 config=config,
@@ -350,11 +321,8 @@
                 verbose=verbose,
             )
         )
-        # log the full response at INFO level for user visibility but at DEBUG level in the API layer
-        logger.info("DRIFT Search Response:\n%s", response)
+        print(response)
 
-        # NOTE: we return the response and context data here purely as a complete demonstration of the API.
-        # External users should use the API directly to get the response and context data.
         return response, context_data
 
     # Otherwise, call the Single-Index Drift Search API
@@ -391,9 +359,9 @@ def on_context(context: Any) -> None:
                 verbose=verbose,
             ):
                 full_response += stream_chunk
-                print(stream_chunk, end="")  # noqa: T201
-                sys.stdout.flush()  # flush output buffer to display text immediately
-            print()  # noqa: T201
+                print(stream_chunk, end="")
+                sys.stdout.flush()
+            print()
             return full_response, context_data
 
         return asyncio.run(run_streaming_search())
@@ -413,11 +381,8 @@ def on_context(context: Any) -> None:
            verbose=verbose,
        )
    )
-    # log the full response at INFO level for user visibility but at DEBUG level in the API layer
-    logger.info("DRIFT Search Response:\n%s", response)
+    print(response)
 
-    # NOTE: we return the response and context data here purely as a complete demonstration of the API.
-    # External users should use the API directly to get the response and context data.
     return response, context_data
 
 
@@ -451,11 +416,6 @@
         final_text_units_list = dataframe_dict["text_units"]
         index_names = dataframe_dict["index_names"]
 
-        logger.info(
-            "Running multi-index basic search on indexes: %s",
-            dataframe_dict["index_names"],
-        )
-
         response, context_data = asyncio.run(
             api.multi_index_basic_search(
                 config=config,
@@ -466,11 +426,8 @@
                 verbose=verbose,
             )
         )
-        # log the full response at INFO level for user visibility but at DEBUG level in the API layer
-        logger.info("Basic Search Response:\n%s", response)
+        print(response)
 
-        # NOTE: we return the response and context data here purely as a complete demonstration of the API.
-        # External users should use the API directly to get the response and context data.
         return response, context_data
 
     # Otherwise, call the Single-Index Basic Search API
@@ -497,9 +454,9 @@ def on_context(context: Any) -> None:
                 verbose=verbose,
            ):
                 full_response += stream_chunk
-                print(stream_chunk, end="")  # noqa: T201
-                sys.stdout.flush()  # flush output buffer to display text immediately
-            print()  # noqa: T201
+                print(stream_chunk, end="")
+                sys.stdout.flush()
+            print()
             return full_response, context_data
 
         return asyncio.run(run_streaming_search())
@@ -512,11 +469,8 @@ def on_context(context: Any) -> None:
            verbose=verbose,
        )
    )
-    # log the full response at INFO level for user visibility but at DEBUG level in the API layer
-    logger.info("Basic Search Response:\n%s", response)
+    print(response)
 
-    # NOTE: we return the response and context data here purely as a complete demonstration of the API.
-    # External users should use the API directly to get the response and context data.
    return response, context_data

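The streaming branches keep one echo pattern throughout: print each chunk without a newline, flush so tokens appear immediately, and emit a single trailing newline after the stream ends. A self-contained sketch of that pattern; stream_search is an illustrative stand-in, not the graphrag API:

import asyncio
import sys
from collections.abc import AsyncIterator

async def stream_search(query: str) -> AsyncIterator[str]:
    # Stand-in for an API streaming call; yields a canned response word by word.
    for word in f"(placeholder response to: {query})".split():
        yield word + " "
        await asyncio.sleep(0.05)

async def main() -> None:
    full_response = ""
    async for chunk in stream_search("what is graphrag?"):
        full_response += chunk
        print(chunk, end="")
        sys.stdout.flush()  # flush output buffer to display text immediately
    print()  # finish the line once the stream completes

asyncio.run(main())
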
graphrag/index/workflows/create_base_text_units.py

Lines changed: 0 additions & 3 deletions
@@ -17,7 +17,6 @@
 from graphrag.index.typing.context import PipelineRunContext
 from graphrag.index.typing.workflow import WorkflowFunctionOutput
 from graphrag.index.utils.hashing import gen_sha512_hash
-from graphrag.logger.progress import Progress
 from graphrag.utils.storage import load_table_from_storage, write_table_to_storage
 
 logger = logging.getLogger(__name__)
@@ -69,8 +68,6 @@ def create_base_text_units(
         zip(*[sort[col] for col in ["id", "text"]], strict=True)
     )
 
-    callbacks.progress(Progress(percent=0))
-
     agg_dict = {"text_with_ids": list}
     if "metadata" in documents:
         agg_dict["metadata"] = "first"  # type: ignore

graphrag/logger/progress.py

Lines changed: 0 additions & 3 deletions
@@ -16,9 +16,6 @@
 class Progress:
     """A class representing the progress of a task."""
 
-    percent: float | None = None
-    """0 - 1 progress"""
-
     description: str | None = None
     """Description of the progress"""
 

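With the unused percent field removed, consumers derive the ratio from the item counts, exactly as ConsoleWorkflowCallbacks.progress does above. A hedged sketch of the trimmed shape; fields beyond those visible in this commit (description, completed_items, total_items) may differ in the real class:

from dataclasses import dataclass

@dataclass
class Progress:
    """A trimmed stand-in for graphrag.logger.progress.Progress."""

    description: str | None = None
    completed_items: int | None = None
    total_items: int | None = None

def as_percent(p: Progress) -> int:
    # Same derivation ConsoleWorkflowCallbacks.progress uses.
    complete = p.completed_items or 0
    total = p.total_items or 1
    return round((complete / total) * 100)

print(as_percent(Progress(completed_items=42, total_items=120)))  # -> 35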