Merge pull request #4 from microsoft/main
Merge official 0.2.1 to main
KylinMountain authored Aug 6, 2024
2 parents 9d99f32 + 5326840 commit 6360a3e
Showing 30 changed files with 1,200 additions and 719 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/issues-autoresolve.yml
@@ -10,15 +10,15 @@ jobs:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v5
- uses: actions/stale@v9
with:
days-before-issue-stale: 7
days-before-issue-close: 5
stale-issue-label: "stale"
close-issue-label: "autoresolved"
stale-issue-message: "This issue has been marked stale due to inactivity after repo maintainer or community member responses that request more information or suggest a solution. It will be closed after five additional days."
close-issue-message: "This issue has been closed after being marked as stale for five days. Please reopen if needed."
exempt-issue-label: "triage"
any-of-labels: "awaiting_response"
days-before-pr-stale: -1
days-before-pr-close: -1
repo-token: ${{ secrets.GITHUB_TOKEN }}
70 changes: 70 additions & 0 deletions .semversioner/0.2.1.json
@@ -0,0 +1,70 @@
{
"changes": [
{
"description": "Added default columns for vector store at create_pipeline_config. No change for other cases.",
"type": "patch"
},
{
"description": "Change json parsing error in the map step of global search to warning",
"type": "patch"
},
{
"description": "Fix Local Search breaking when loading Embeddings input. Defaulting overwrite to True as in the rest of the vector store config",
"type": "patch"
},
{
"description": "Fix json parsing when LLM returns faulty responses",
"type": "patch"
},
{
"description": "Fix missing community reports and refactor community context builder",
"type": "patch"
},
{
"description": "Fixed a bug that erased the vector database, added a new parameter to specify the config file path, and updated the documentation accordingly.",
"type": "patch"
},
{
"description": "Try parsing json before even repairing",
"type": "patch"
},
{
"description": "Update Prompt Tuning meta prompts with finer examples",
"type": "patch"
},
{
"description": "Update default entity extraction and gleaning prompts to reduce hallucinations",
"type": "patch"
},
{
"description": "add encoding-model to entity/claim extraction config",
"type": "patch"
},
{
"description": "add encoding-model to text chunking config",
"type": "patch"
},
{
"description": "add user prompt to history-tracking llm",
"type": "patch"
},
{
"description": "update config reader to allow for zero gleans",
"type": "patch"
},
{
"description": "update config-reader to allow for empty chunk-by arrays",
"type": "patch"
},
{
"description": "update history-tracking LLm to use 'assistant' instead of 'system' in output history.",
"type": "patch"
},
{
"description": "use history argument in hash key computation; add history input to cache data",
"type": "patch"
}
],
"created_at": "2024-08-06T00:25:52+00:00",
"version": "0.2.1"
}
4 changes: 0 additions & 4 deletions .semversioner/next-release/patch-20240726142042913643.json

This file was deleted.

4 changes: 0 additions & 4 deletions .semversioner/next-release/patch-20240726143138162263.json

This file was deleted.

4 changes: 0 additions & 4 deletions .semversioner/next-release/patch-20240726154054702667.json

This file was deleted.

4 changes: 0 additions & 4 deletions .semversioner/next-release/patch-20240726181256417715.json

This file was deleted.

4 changes: 0 additions & 4 deletions .semversioner/next-release/patch-20240726200425411495.json

This file was deleted.

4 changes: 0 additions & 4 deletions .semversioner/next-release/patch-20240726205654788488.json

This file was deleted.

19 changes: 19 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,25 @@
# Changelog
Note: version releases in the 0.x.y range may introduce breaking changes.

## 0.2.1

- patch: Added default columns for vector store at create_pipeline_config. No change for other cases.
- patch: Change json parsing error in the map step of global search to warning
- patch: Fix Local Search breaking when loading Embeddings input. Defaulting overwrite to True as in the rest of the vector store config
- patch: Fix json parsing when LLM returns faulty responses
- patch: Fix missing community reports and refactor community context builder
- patch: Fixed a bug that erased the vector database, added a new parameter to specify the config file path, and updated the documentation accordingly.
- patch: Try parsing json before even repairing
- patch: Update Prompt Tuning meta prompts with finer examples
- patch: Update default entity extraction and gleaning prompts to reduce hallucinations
- patch: add encoding-model to entity/claim extraction config
- patch: add encoding-model to text chunking config
- patch: add user prompt to history-tracking llm
- patch: update config reader to allow for zero gleans
- patch: update config-reader to allow for empty chunk-by arrays
- patch: update history-tracking LLM to use 'assistant' instead of 'system' in output history.
- patch: use history argument in hash key computation; add history input to cache data

## 0.2.0

- minor: Add content-based KNN for selecting prompt tune few shot examples
23 changes: 21 additions & 2 deletions dictionary.txt
@@ -132,11 +132,30 @@ MSRC
Arrary

# Prompt Inputs
dulce
Asadi
ABILA
Abila
ALHAMIA
Alhamia
Asadi
Aurelians
Bataglani
BATAGLANI
Bratinas
dulce
Durke
Firuzabad
Firuzabad's
FIRUZABAD
Krohaara
KROHAARA
POKRALLY
Tazbah
TIRUZIA
Tiruzia
Tiruzia's
Verdantis
Verdantis's


# English
skippable
3 changes: 2 additions & 1 deletion docsite/posts/query/3-cli.md
@@ -9,11 +9,12 @@ date: 2024-27-03
The GraphRAG query CLI allows for no-code usage of the GraphRAG Query engine.

```bash
python -m graphrag.query --data <path-to-data> --community_level <community-level> --response_type <response-type> --method <"local"|"global"> <query>
python -m graphrag.query --config <config_file.yml> --data <path-to-data> --community_level <community-level> --response_type <response-type> --method <"local"|"global"> <query>
```
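
For illustration, a hypothetical local-search invocation using the new `--config` flag might look like the following; the paths, file name, and query text are placeholders, not defaults:

```bash
python -m graphrag.query \
  --config ./config.yml \
  --data ./output/artifacts \
  --community_level 2 \
  --response_type "Multiple Paragraphs" \
  --method local \
  "Who are the main characters and how are they related?"
```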

## CLI Arguments

- `--config <config_file.yml>` - The configuration YAML file to use when running the query. If this is used, none of the environment variables below will apply.
- `--data <path-to-data>` - Folder containing the `.parquet` output files from running the Indexer.
- `--community_level <community-level>` - Community level in the Leiden community hierarchy from which we will load the community reports; a higher value means we use reports on smaller communities. Default: 2
- `--response_type <response-type>` - Free form text describing the response type and format, can be anything, e.g. `Multiple Paragraphs`, `Single Paragraph`, `Single Sentence`, `List of 3-7 Points`, `Single Page`, `Multi-Page Report`. Default: `Multiple Paragraphs`.
17 changes: 6 additions & 11 deletions examples_notebooks/global_search.ipynb
@@ -115,7 +115,10 @@
"\n",
"reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)\n",
"entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)\n",
"print(f\"Report records: {len(report_df)}\")\n",
"print(f\"Total report count: {len(report_df)}\")\n",
"print(\n",
" f\"Report count after filtering by community level {COMMUNITY_LEVEL}: {len(reports)}\"\n",
")\n",
"report_df.head()"
]
},
@@ -223,17 +226,9 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LLM calls: 13. LLM tokens: 184660\n"
]
}
],
"outputs": [],
"source": [
"# inspect number of LLM calls and tokens\n",
"print(f\"LLM calls: {result.llm_calls}. LLM tokens: {result.prompt_tokens}\")"
59 changes: 48 additions & 11 deletions graphrag/index/create_pipeline_config.py
@@ -198,7 +198,12 @@ def _document_workflows(
name=create_final_documents,
config={
"document_raw_content_embed": _get_embedding_settings(
settings.embeddings, "document_raw_content"
settings.embeddings,
"document_raw_content",
{
"title_column": "raw_content",
"collection_name": "final_documents_raw_content_embedding",
},
),
"skip_raw_content_embedding": skip_document_raw_content_embedding,
},
@@ -243,7 +248,9 @@ def _text_unit_workflows(
name=create_final_text_units,
config={
"text_unit_text_embed": _get_embedding_settings(
settings.embeddings, "text_unit_text"
settings.embeddings,
"text_unit_text",
{"title_column": "text", "collection_name": "text_units_embedding"},
),
"covariates_enabled": covariates_enabled,
"skip_text_unit_embedding": skip_text_unit_embedding,
@@ -252,19 +259,22 @@
]


def _get_embedding_settings(settings: TextEmbeddingConfig, embedding_name: str) -> dict:
def _get_embedding_settings(
settings: TextEmbeddingConfig,
embedding_name: str,
vector_store_params: dict | None = None,
) -> dict:
vector_store_settings = settings.vector_store
if vector_store_settings is None:
return {"strategy": settings.resolved_strategy()}

#
# If we get to this point, settings.vector_store is defined, and there's a specific setting for this embedding.
# settings.vector_store.base contains connection information, or may be undefined
# settings.vector_store.<vector_name> contains the specific settings for this embedding
#
strategy = settings.resolved_strategy() # get the default strategy
strategy.update({
"vector_store": vector_store_settings
"vector_store": {**vector_store_settings, **(vector_store_params or {})}
}) # update the default strategy with the vector store settings
# This ensures the vector store config is part of the strategy and not the global config
return {
@@ -327,10 +337,20 @@ def _graph_workflows(
name=create_final_entities,
config={
"entity_name_embed": _get_embedding_settings(
settings.embeddings, "entity_name"
settings.embeddings,
"entity_name",
{
"title_column": "name",
"collection_name": "entity_name_embeddings",
},
),
"entity_name_description_embed": _get_embedding_settings(
settings.embeddings, "entity_name_description"
settings.embeddings,
"entity_name_description",
{
"title_column": "description",
"collection_name": "entity_description_embeddings",
},
),
"skip_name_embedding": skip_entity_name_embedding,
"skip_description_embedding": skip_entity_description_embedding,
@@ -340,7 +360,12 @@
name=create_final_relationships,
config={
"relationship_description_embed": _get_embedding_settings(
settings.embeddings, "relationship_description"
settings.embeddings,
"relationship_description",
{
"title_column": "description",
"collection_name": "relationships_description_embeddings",
},
),
"skip_description_embedding": skip_relationship_description_embedding,
},
@@ -382,13 +407,25 @@ def _community_workflows(
),
},
"community_report_full_content_embed": _get_embedding_settings(
settings.embeddings, "community_report_full_content"
settings.embeddings,
"community_report_full_content",
{
"title_column": "full_content",
"collection_name": "final_community_reports_full_content_embedding",
},
),
"community_report_summary_embed": _get_embedding_settings(
settings.embeddings, "community_report_summary"
settings.embeddings,
"community_report_summary",
{
"title_column": "summary",
"collection_name": "final_community_reports_summary_embedding",
},
),
"community_report_title_embed": _get_embedding_settings(
settings.embeddings, "community_report_title"
settings.embeddings,
"community_report_title",
{"title_column": "title"},
),
},
),
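
For readers tracing the `_get_embedding_settings` change above: the new `vector_store_params` argument is overlaid on the shared `settings.vector_store` mapping before being attached to the resolved strategy, so per-embedding values such as `title_column` and `collection_name` take precedence. A minimal sketch of that merge behavior, with illustrative (assumed) setting values:

```python
# Sketch of the dict merge used in _get_embedding_settings; the setting values
# below are assumptions for illustration, not actual GraphRAG defaults.
def merge_vector_store_settings(
    vector_store_settings: dict,
    vector_store_params: dict | None = None,
) -> dict:
    """Overlay per-embedding params on top of the shared vector store settings."""
    return {**vector_store_settings, **(vector_store_params or {})}


shared = {"type": "lancedb", "overwrite": True}  # assumed shared connection settings
per_embedding = {
    "title_column": "description",
    "collection_name": "entity_description_embeddings",
}

print(merge_vector_store_settings(shared, per_embedding))
# -> {'type': 'lancedb', 'overwrite': True,
#     'title_column': 'description',
#     'collection_name': 'entity_description_embeddings'}
```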