Merge pull request #4 from microsoft/main
Merge official 0.2.1 to main
KylinMountain authored Aug 6, 2024
2 parents 9d99f32 + 5326840 commit 6360a3e
Showing 30 changed files with 1,200 additions and 719 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/issues-autoresolve.yml
@@ -10,15 +10,15 @@ jobs:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v5
- uses: actions/stale@v9
with:
days-before-issue-stale: 7
days-before-issue-close: 5
stale-issue-label: "stale"
close-issue-label: "autoresolved"
stale-issue-message: "This issue has been marked stale due to inactivity after repo maintainer or community member responses that request more information or suggest a solution. It will be closed after five additional days."
close-issue-message: "This issue has been closed after being marked as stale for five days. Please reopen if needed."
exempt-issue-label: "triage"
any-of-labels: "awaiting_response"
days-before-pr-stale: -1
days-before-pr-close: -1
repo-token: ${{ secrets.GITHUB_TOKEN }}
70 changes: 70 additions & 0 deletions .semversioner/0.2.1.json
@@ -0,0 +1,70 @@
{
"changes": [
{
"description": "Added default columns for vector store at create_pipeline_config. No change for other cases.",
"type": "patch"
},
{
"description": "Change json parsing error in the map step of global search to warning",
"type": "patch"
},
{
"description": "Fix Local Search breaking when loading Embeddings input. Defaulting overwrite to True as in the rest of the vector store config",
"type": "patch"
},
{
"description": "Fix json parsing when LLM returns faulty responses",
"type": "patch"
},
{
"description": "Fix missing community reports and refactor community context builder",
"type": "patch"
},
{
"description": "Fixed a bug that erased the vector database, added a new parameter to specify the config file path, and updated the documentation accordingly.",
"type": "patch"
},
{
"description": "Try parsing json before even repairing",
"type": "patch"
},
{
"description": "Update Prompt Tuning meta prompts with finer examples",
"type": "patch"
},
{
"description": "Update default entity extraction and gleaning prompts to reduce hallucinations",
"type": "patch"
},
{
"description": "add encoding-model to entity/claim extraction config",
"type": "patch"
},
{
"description": "add encoding-model to text chunking config",
"type": "patch"
},
{
"description": "add user prompt to history-tracking llm",
"type": "patch"
},
{
"description": "update config reader to allow for zero gleans",
"type": "patch"
},
{
"description": "update config-reader to allow for empty chunk-by arrays",
"type": "patch"
},
{
"description": "update history-tracking LLm to use 'assistant' instead of 'system' in output history.",
"type": "patch"
},
{
"description": "use history argument in hash key computation; add history input to cache data",
"type": "patch"
}
],
"created_at": "2024-08-06T00:25:52+00:00",
"version": "0.2.1"
}
4 changes: 0 additions & 4 deletions .semversioner/next-release/patch-20240726142042913643.json

This file was deleted.

4 changes: 0 additions & 4 deletions .semversioner/next-release/patch-20240726143138162263.json

This file was deleted.

4 changes: 0 additions & 4 deletions .semversioner/next-release/patch-20240726154054702667.json

This file was deleted.

4 changes: 0 additions & 4 deletions .semversioner/next-release/patch-20240726181256417715.json

This file was deleted.

4 changes: 0 additions & 4 deletions .semversioner/next-release/patch-20240726200425411495.json

This file was deleted.

4 changes: 0 additions & 4 deletions .semversioner/next-release/patch-20240726205654788488.json

This file was deleted.

19 changes: 19 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,25 @@
# Changelog
Note: version releases in the 0.x.y range may introduce breaking changes.

## 0.2.1

- patch: Added default columns for vector store at create_pipeline_config. No change for other cases.
- patch: Change json parsing error in the map step of global search to warning
- patch: Fix Local Search breaking when loading Embeddings input. Defaulting overwrite to True as in the rest of the vector store config
- patch: Fix json parsing when LLM returns faulty responses
- patch: Fix missing community reports and refactor community context builder
- patch: Fixed a bug that erased the vector database, added a new parameter to specify the config file path, and updated the documentation accordingly.
- patch: Try parsing json before even repairing
- patch: Update Prompt Tuning meta prompts with finer examples
- patch: Update default entity extraction and gleaning prompts to reduce hallucinations
- patch: add encoding-model to entity/claim extraction config
- patch: add encoding-model to text chunking config
- patch: add user prompt to history-tracking llm
- patch: update config reader to allow for zero gleans
- patch: update config-reader to allow for empty chunk-by arrays
- patch: update history-tracking LLM to use 'assistant' instead of 'system' in output history.
- patch: use history argument in hash key computation; add history input to cache data

## 0.2.0

- minor: Add content-based KNN for selecting prompt tune few shot examples
23 changes: 21 additions & 2 deletions dictionary.txt
@@ -132,11 +132,30 @@ MSRC
Arrary

# Prompt Inputs
dulce
Asadi
ABILA
Abila
ALHAMIA
Alhamia
Asadi
Aurelians
Bataglani
BATAGLANI
Bratinas
dulce
Durke
Firuzabad
Firuzabad's
FIRUZABAD
Krohaara
KROHAARA
POKRALLY
Tazbah
TIRUZIA
Tiruzia
Tiruzia's
Verdantis
Verdantis's


# English
skippable
3 changes: 2 additions & 1 deletion docsite/posts/query/3-cli.md
@@ -9,11 +9,12 @@ date: 2024-27-03
The GraphRAG query CLI allows for no-code usage of the GraphRAG Query engine.

```bash
python -m graphrag.query --data <path-to-data> --community_level <community-level> --response_type <response-type> --method <"local"|"global"> <query>
python -m graphrag.query --config <config_file.yml> --data <path-to-data> --community_level <community-level> --response_type <response-type> --method <"local"|"global"> <query>
```
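
For illustration, a hypothetical local-search invocation using the new `--config` flag might look like the following; the paths, file name, and query text are placeholders, not defaults:

```bash
python -m graphrag.query \
  --config ./config.yml \
  --data ./output/artifacts \
  --community_level 2 \
  --response_type "Multiple Paragraphs" \
  --method local \
  "Who are the main characters and how are they related?"
```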

## CLI Arguments

- `--config <config_file.yml>` - The configuration YAML file to use when running the query. If this is used, none of the environment variables below will apply.
- `--data <path-to-data>` - Folder containing the `.parquet` output files from running the Indexer.
- `--community_level <community-level>` - Community level in the Leiden community hierarchy from which we will load the community reports; a higher value means we use reports on smaller communities. Default: 2
- `--response_type <response-type>` - Free form text describing the response type and format, can be anything, e.g. `Multiple Paragraphs`, `Single Paragraph`, `Single Sentence`, `List of 3-7 Points`, `Single Page`, `Multi-Page Report`. Default: `Multiple Paragraphs`.
17 changes: 6 additions & 11 deletions examples_notebooks/global_search.ipynb
@@ -115,7 +115,10 @@
"\n",
"reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)\n",
"entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)\n",
"print(f\"Report records: {len(report_df)}\")\n",
"print(f\"Total report count: {len(report_df)}\")\n",
"print(\n",
" f\"Report count after filtering by community level {COMMUNITY_LEVEL}: {len(reports)}\"\n",
")\n",
"report_df.head()"
]
},
@@ -223,17 +226,9 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LLM calls: 13. LLM tokens: 184660\n"
]
}
],
"outputs": [],
"source": [
"# inspect number of LLM calls and tokens\n",
"print(f\"LLM calls: {result.llm_calls}. LLM tokens: {result.prompt_tokens}\")"
59 changes: 48 additions & 11 deletions graphrag/index/create_pipeline_config.py
@@ -198,7 +198,12 @@ def _document_workflows(
name=create_final_documents,
config={
"document_raw_content_embed": _get_embedding_settings(
settings.embeddings, "document_raw_content"
settings.embeddings,
"document_raw_content",
{
"title_column": "raw_content",
"collection_name": "final_documents_raw_content_embedding",
},
),
"skip_raw_content_embedding": skip_document_raw_content_embedding,
},
@@ -243,7 +248,9 @@ def _text_unit_workflows(
name=create_final_text_units,
config={
"text_unit_text_embed": _get_embedding_settings(
settings.embeddings, "text_unit_text"
settings.embeddings,
"text_unit_text",
{"title_column": "text", "collection_name": "text_units_embedding"},
),
"covariates_enabled": covariates_enabled,
"skip_text_unit_embedding": skip_text_unit_embedding,
@@ -252,19 +259,22 @@
]


def _get_embedding_settings(settings: TextEmbeddingConfig, embedding_name: str) -> dict:
def _get_embedding_settings(
settings: TextEmbeddingConfig,
embedding_name: str,
vector_store_params: dict | None = None,
) -> dict:
vector_store_settings = settings.vector_store
if vector_store_settings is None:
return {"strategy": settings.resolved_strategy()}

#
# If we get to this point, settings.vector_store is defined, and there's a specific setting for this embedding.
# settings.vector_store.base contains connection information, or may be undefined
# settings.vector_store.<vector_name> contains the specific settings for this embedding
#
strategy = settings.resolved_strategy() # get the default strategy
strategy.update({
"vector_store": vector_store_settings
"vector_store": {**vector_store_settings, **(vector_store_params or {})}
}) # update the default strategy with the vector store settings
# This ensures the vector store config is part of the strategy and not the global config
return {
@@ -327,10 +337,20 @@ def _graph_workflows(
name=create_final_entities,
config={
"entity_name_embed": _get_embedding_settings(
settings.embeddings, "entity_name"
settings.embeddings,
"entity_name",
{
"title_column": "name",
"collection_name": "entity_name_embeddings",
},
),
"entity_name_description_embed": _get_embedding_settings(
settings.embeddings, "entity_name_description"
settings.embeddings,
"entity_name_description",
{
"title_column": "description",
"collection_name": "entity_description_embeddings",
},
),
"skip_name_embedding": skip_entity_name_embedding,
"skip_description_embedding": skip_entity_description_embedding,
@@ -340,7 +360,12 @@
name=create_final_relationships,
config={
"relationship_description_embed": _get_embedding_settings(
settings.embeddings, "relationship_description"
settings.embeddings,
"relationship_description",
{
"title_column": "description",
"collection_name": "relationships_description_embeddings",
},
),
"skip_description_embedding": skip_relationship_description_embedding,
},
@@ -382,13 +407,25 @@ def _community_workflows(
),
},
"community_report_full_content_embed": _get_embedding_settings(
settings.embeddings, "community_report_full_content"
settings.embeddings,
"community_report_full_content",
{
"title_column": "full_content",
"collection_name": "final_community_reports_full_content_embedding",
},
),
"community_report_summary_embed": _get_embedding_settings(
settings.embeddings, "community_report_summary"
settings.embeddings,
"community_report_summary",
{
"title_column": "summary",
"collection_name": "final_community_reports_summary_embedding",
},
),
"community_report_title_embed": _get_embedding_settings(
settings.embeddings, "community_report_title"
settings.embeddings,
"community_report_title",
{"title_column": "title"},
),
},
),
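
For readers tracing the `_get_embedding_settings` change above: the new `vector_store_params` argument is overlaid on the shared `settings.vector_store` mapping before being attached to the resolved strategy, so per-embedding values such as `title_column` and `collection_name` take precedence. A minimal sketch of that merge behavior, with illustrative (assumed) setting values:

```python
# Sketch of the dict merge used in _get_embedding_settings; the setting values
# below are assumptions for illustration, not actual GraphRAG defaults.
def merge_vector_store_settings(
    vector_store_settings: dict,
    vector_store_params: dict | None = None,
) -> dict:
    """Overlay per-embedding params on top of the shared vector store settings."""
    return {**vector_store_settings, **(vector_store_params or {})}


shared = {"type": "lancedb", "overwrite": True}  # assumed shared connection settings
per_embedding = {
    "title_column": "description",
    "collection_name": "entity_description_embeddings",
}

print(merge_vector_store_settings(shared, per_embedding))
# -> {'type': 'lancedb', 'overwrite': True,
#     'title_column': 'description',
#     'collection_name': 'entity_description_embeddings'}
```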