Skip to content

Commit e39d869

Browse files
nievespg1 (Gabriel Nieves), AlonsoGuevara (Alonso Guevara)
authored
Added support for verbose logging and csv-metadata to the prompt tune… (#1789)
* Added support for verbose logging and csv-metadata to the prompt tune client. * Updated community report summarization file name and prompt template * updated semversioner * ran ruff linter * Ran poe format * Fix Ruff complains * Fix a new ruff complain :P * Pyright * Fix tests --------- Co-authored-by: Gabriel Nieves <[email protected]> Co-authored-by: Alonso Guevara <[email protected]>
1 parent 66c2cfb commit e39d869

File tree

12 files changed

+309
-212
lines changed

12 files changed

+309
-212
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "minor",
3+
    "description": "Updated the prompt tuning client to support csv-metadata injection and updated output file types to match the new naming convention."
4+
}

graphrag/api/prompt_tune.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,16 @@
1111
Backwards compatibility is not guaranteed at this time.
1212
"""
1313

14+
from typing import Annotated
15+
16+
import annotated_types
1417
from pydantic import PositiveInt, validate_call
1518

1619
from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks
17-
from graphrag.config.defaults import language_model_defaults
20+
from graphrag.config.defaults import graphrag_config_defaults, language_model_defaults
1821
from graphrag.config.models.graph_rag_config import GraphRagConfig
1922
from graphrag.language_model.manager import ModelManager
20-
from graphrag.logger.print_progress import PrintProgressLogger
23+
from graphrag.logger.base import ProgressLogger
2124
from graphrag.prompt_tune.defaults import MAX_TOKEN_COUNT, PROMPT_TUNING_MODEL_ID
2225
from graphrag.prompt_tune.generator.community_report_rating import (
2326
generate_community_report_rating,
@@ -41,15 +44,19 @@
4144
)
4245
from graphrag.prompt_tune.generator.language import detect_language
4346
from graphrag.prompt_tune.generator.persona import generate_persona
44-
from graphrag.prompt_tune.loader.input import MIN_CHUNK_SIZE, load_docs_in_chunks
47+
from graphrag.prompt_tune.loader.input import load_docs_in_chunks
4548
from graphrag.prompt_tune.types import DocSelectionType
4649

4750

48-
@validate_call
51+
@validate_call(config={"arbitrary_types_allowed": True})
4952
async def generate_indexing_prompts(
5053
config: GraphRagConfig,
54+
logger: ProgressLogger,
5155
root: str,
52-
chunk_size: PositiveInt = MIN_CHUNK_SIZE,
56+
chunk_size: PositiveInt = graphrag_config_defaults.chunks.size,
57+
overlap: Annotated[
58+
int, annotated_types.Gt(-1)
59+
] = graphrag_config_defaults.chunks.overlap,
5360
limit: PositiveInt = 15,
5461
selection_method: DocSelectionType = DocSelectionType.RANDOM,
5562
domain: str | None = None,
@@ -65,6 +72,8 @@ async def generate_indexing_prompts(
6572
Parameters
6673
----------
6774
- config: The GraphRag configuration.
75+
- logger: The logger to use for progress updates.
76+
- root: The root directory.
6877
- output_path: The path to store the prompts.
6978
- chunk_size: The chunk token size to use for input text units.
7079
- limit: The limit of chunks to load.
@@ -81,22 +90,23 @@ async def generate_indexing_prompts(
8190
-------
8291
tuple[str, str, str]: entity extraction prompt, entity summarization prompt, community summarization prompt
8392
"""
84-
logger = PrintProgressLogger("")
85-
8693
# Retrieve documents
94+
logger.info("Chunking documents...")
8795
doc_list = await load_docs_in_chunks(
8896
root=root,
8997
config=config,
9098
limit=limit,
9199
select_method=selection_method,
92100
logger=logger,
93101
chunk_size=chunk_size,
102+
overlap=overlap,
94103
n_subset_max=n_subset_max,
95104
k=k,
96105
)
97106

98107
# Create LLM from config
99108
# TODO: Expose a way to specify Prompt Tuning model ID through config
109+
logger.info("Retrieving language model configuration...")
100110
default_llm_settings = config.get_language_model_config(PROMPT_TUNING_MODEL_ID)
101111

102112
# if max_retries is not set, inject a dynamically assigned value based on the number of expected LLM calls
@@ -105,7 +115,10 @@ async def generate_indexing_prompts(
105115
default_llm_settings.max_retries = min(
106116
len(doc_list), language_model_defaults.max_retries
107117
)
118+
msg = f"max_retries not set, using default value: {default_llm_settings.max_retries}"
119+
logger.warning(msg)
108120

121+
logger.info("Creating language model...")
109122
llm = ModelManager().register_chat(
110123
name="prompt_tuning",
111124
model_type=default_llm_settings.type,
@@ -117,7 +130,6 @@ async def generate_indexing_prompts(
117130
if not domain:
118131
logger.info("Generating domain...")
119132
domain = await generate_domain(llm, doc_list)
120-
logger.info(f"Generated domain: {domain}") # noqa
121133

122134
if not language:
123135
logger.info("Detecting language...")
@@ -186,6 +198,10 @@ async def generate_indexing_prompts(
186198
language=language,
187199
)
188200

201+
logger.info(f"\nGenerated domain: {domain}") # noqa: G004
202+
logger.info(f"\nDetected language: {language}") # noqa: G004
203+
logger.info(f"\nGenerated persona: {persona}") # noqa: G004
204+
189205
return (
190206
extract_graph_prompt,
191207
entity_summarization_prompt,

graphrag/cli/main.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,10 @@
1111

1212
import typer
1313

14+
from graphrag.config.defaults import graphrag_config_defaults
1415
from graphrag.config.enums import IndexingMethod, SearchMethod
1516
from graphrag.logger.types import LoggerType
16-
from graphrag.prompt_tune.defaults import (
17-
MAX_TOKEN_COUNT,
18-
MIN_CHUNK_SIZE,
19-
N_SUBSET_MAX,
20-
K,
21-
)
17+
from graphrag.prompt_tune.defaults import LIMIT, MAX_TOKEN_COUNT, N_SUBSET_MAX, K
2218
from graphrag.prompt_tune.types import DocSelectionType
2319

2420
INVALID_METHOD_ERROR = "Invalid method"
@@ -274,6 +270,12 @@ def _prompt_tune_cli(
274270
),
275271
),
276272
] = None,
273+
verbose: Annotated[
274+
bool, typer.Option(help="Run the prompt tuning pipeline with verbose logging")
275+
] = False,
276+
logger: Annotated[
277+
LoggerType, typer.Option(help="The progress logger to use.")
278+
] = LoggerType.RICH,
277279
domain: Annotated[
278280
str | None,
279281
typer.Option(
@@ -300,7 +302,7 @@ def _prompt_tune_cli(
300302
typer.Option(
301303
help="The number of documents to load when --selection-method={random,top}."
302304
),
303-
] = 15,
305+
] = LIMIT,
304306
max_tokens: Annotated[
305307
int, typer.Option(help="The max token count for prompt generation.")
306308
] = MAX_TOKEN_COUNT,
@@ -311,8 +313,17 @@ def _prompt_tune_cli(
311313
),
312314
] = 2,
313315
chunk_size: Annotated[
314-
int, typer.Option(help="The max token count for prompt generation.")
315-
] = MIN_CHUNK_SIZE,
316+
int,
317+
typer.Option(
318+
help="The size of each example text chunk. Overrides chunks.size in the configuration file."
319+
),
320+
] = graphrag_config_defaults.chunks.size,
321+
overlap: Annotated[
322+
int,
323+
typer.Option(
324+
help="The overlap size for chunking documents. Overrides chunks.overlap in the configuration file"
325+
),
326+
] = graphrag_config_defaults.chunks.overlap,
316327
language: Annotated[
317328
str | None,
318329
typer.Option(
@@ -343,10 +354,13 @@ def _prompt_tune_cli(
343354
root=root,
344355
config=config,
345356
domain=domain,
357+
verbose=verbose,
358+
logger=logger,
346359
selection_method=selection_method,
347360
limit=limit,
348361
max_tokens=max_tokens,
349362
chunk_size=chunk_size,
363+
overlap=overlap,
350364
language=language,
351365
discover_entity_types=discover_entity_types,
352366
output=output,

graphrag/cli/prompt_tune.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
from pathlib import Path
77

88
import graphrag.api as api
9+
from graphrag.cli.index import _logger
910
from graphrag.config.load_config import load_config
10-
from graphrag.logger.print_progress import PrintProgressLogger
11+
from graphrag.config.logging import enable_logging_with_config
12+
from graphrag.logger.factory import LoggerFactory, LoggerType
1113
from graphrag.prompt_tune.generator.community_report_summarization import (
1214
COMMUNITY_SUMMARIZATION_FILENAME,
1315
)
@@ -17,16 +19,20 @@
1719
from graphrag.prompt_tune.generator.extract_graph_prompt import (
1820
EXTRACT_GRAPH_FILENAME,
1921
)
22+
from graphrag.utils.cli import redact
2023

2124

2225
async def prompt_tune(
2326
root: Path,
2427
config: Path | None,
2528
domain: str | None,
29+
verbose: bool,
30+
logger: LoggerType,
2631
selection_method: api.DocSelectionType,
2732
limit: int,
2833
max_tokens: int,
2934
chunk_size: int,
35+
overlap: int,
3036
language: str | None,
3137
discover_entity_types: bool,
3238
output: Path,
@@ -41,6 +47,8 @@ async def prompt_tune(
4147
- config: The configuration file.
4248
- root: The root directory.
4349
- domain: The domain to map the input documents to.
50+
- verbose: Whether to enable verbose logging.
51+
- logger: The logger to use.
4452
- selection_method: The chunk selection method.
4553
- limit: The limit of chunks to load.
4654
- max_tokens: The maximum number of tokens to use on entity extraction prompts.
@@ -52,14 +60,36 @@ async def prompt_tune(
5260
- k: The number of documents to select when using auto selection method.
5361
- min_examples_required: The minimum number of examples required for entity extraction prompts.
5462
"""
55-
logger = PrintProgressLogger("")
5663
root_path = Path(root).resolve()
5764
graph_config = load_config(root_path, config)
5865

66+
# override chunking config in the configuration
67+
if chunk_size != graph_config.chunks.size:
68+
graph_config.chunks.size = chunk_size
69+
70+
if overlap != graph_config.chunks.overlap:
71+
graph_config.chunks.overlap = overlap
72+
73+
progress_logger = LoggerFactory().create_logger(logger)
74+
info, error, success = _logger(progress_logger)
75+
76+
enabled_logging, log_path = enable_logging_with_config(
77+
graph_config, verbose, filename="prompt-tune.log"
78+
)
79+
if enabled_logging:
80+
info(f"Logging enabled at {log_path}", verbose)
81+
else:
82+
info(
83+
f"Logging not enabled for config {redact(graph_config.model_dump())}",
84+
verbose,
85+
)
86+
5987
prompts = await api.generate_indexing_prompts(
6088
config=graph_config,
6189
root=str(root_path),
90+
logger=progress_logger,
6291
chunk_size=chunk_size,
92+
overlap=overlap,
6393
limit=limit,
6494
selection_method=selection_method,
6595
domain=domain,
@@ -73,7 +103,7 @@ async def prompt_tune(
73103

74104
output_path = output.resolve()
75105
if output_path:
76-
logger.info(f"Writing prompts to {output_path}") # noqa: G004
106+
info(f"Writing prompts to {output_path}")
77107
output_path.mkdir(parents=True, exist_ok=True)
78108
extract_graph_prompt_path = output_path / EXTRACT_GRAPH_FILENAME
79109
entity_summarization_prompt_path = output_path / ENTITY_SUMMARIZATION_FILENAME
@@ -87,3 +117,6 @@ async def prompt_tune(
87117
file.write(prompts[1].encode(encoding="utf-8", errors="strict"))
88118
with community_summarization_prompt_path.open("wb") as file:
89119
file.write(prompts[2].encode(encoding="utf-8", errors="strict"))
120+
success(f"Prompts written to {output_path}")
121+
else:
122+
error("No output path provided. Skipping writing prompts.")

graphrag/config/logging.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def enable_logging(log_filepath: str | Path, verbose: bool = False) -> None:
3434

3535

3636
def enable_logging_with_config(
37-
config: GraphRagConfig, verbose: bool = False
37+
config: GraphRagConfig, verbose: bool = False, filename: str = "indexing-engine.log"
3838
) -> tuple[bool, str]:
3939
"""Enable logging to a file based on the config.
4040
@@ -55,7 +55,7 @@ def enable_logging_with_config(
5555
(True, str) if logging was enabled.
5656
"""
5757
if config.reporting.type == ReportingType.file:
58-
log_path = Path(config.reporting.base_dir) / "indexing-engine.log"
58+
log_path = Path(config.reporting.base_dir) / filename
5959
enable_logging(log_path, verbose)
6060
return (True, str(log_path))
6161
return (False, "")

graphrag/language_model/protocol/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ async def achat_stream(
120120
-------
121121
A generator that yields strings representing the response.
122122
"""
123+
yield "" # Yield an empty string so that the function is recognized as a generator
123124
...
124125

125126
def chat(

graphrag/prompt_tune/defaults.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
"""
1313

1414
K = 15
15+
LIMIT = 15
1516
MAX_TOKEN_COUNT = 2000
1617
MIN_CHUNK_SIZE = 200
1718
N_SUBSET_MAX = 300

graphrag/prompt_tune/generator/community_report_summarization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
COMMUNITY_REPORT_SUMMARIZATION_PROMPT,
1010
)
1111

12-
COMMUNITY_SUMMARIZATION_FILENAME = "community_report.txt"
12+
COMMUNITY_SUMMARIZATION_FILENAME = "community_report_graph.txt"
1313

1414

1515
def create_community_summarization_prompt(

0 commit comments

Comments
 (0)