# settings.yaml (forked from microsoft/graphrag)
### This config file contains required core defaults that must be set, along with a handful of common optional settings.
### For a full list of available settings, see https://microsoft.github.io/graphrag/config/yaml/
### LLM settings ###
## There are a number of settings to tune the threading and token limits for LLM calls - check the docs.
encoding_model: cl100k_base # this needs to be matched to your model!
## NOTE(review): the chat model below is glm-4-flash; confirm cl100k_base is an
## acceptable tokenizer approximation for it.

## Chat-completion model used for LLM calls. The options below are nested
## under `llm`; left flat they would become top-level keys and collide with
## the same-named keys of other sections.
llm:
  api_key: ${GRAPHRAG_API_KEY} # set this in the generated .env file
  type: openai_chat # or azure_openai_chat
  model: glm-4-flash
  api_base: https://open.bigmodel.cn/api/paas/v4
  model_supports_json: true # recommended if this is available for your model.
  ## Azure-only options; un-comment in place so they stay nested under `llm`.
  # audience: "https://cognitiveservices.azure.com/.default"
  # api_base: https://<instance>.openai.azure.com
  # api_version: 2024-02-15-preview
  # organization: <organization_id>
  # deployment_name: <azure_model_deployment_name>
## Throttling for LLM calls; `stagger` must be nested under `parallelization`.
parallelization:
  stagger: 0.3
  # num_threads: 50

## Top-level async mode (the embeddings section sets its own async_mode).
async_mode: threaded # or asyncio
## Embedding settings: where vectors are stored and which model produces them.
## `vector_store` and `llm` are sub-sections of `embeddings`; their keys must
## stay nested or they collide with the chat `llm` section and storage keys.
embeddings:
  async_mode: threaded # or asyncio
  ## NOTE(review): the original nesting of batch_size was lost; it is placed at
  ## the embeddings level, where the GraphRAG config docs list it - confirm.
  batch_size: 1
  vector_store:
    type: lancedb
    db_uri: 'output/lancedb'
    container_name: default
    overwrite: true
  llm:
    api_key: ${GRAPHRAG_API_KEY}
    type: openai_embedding # or azure_openai_embedding
    model: text-embedding-3-small
    ## NOTE(review): api_base points at localhost:1234; confirm that server
    ## actually serves a model under the name configured above.
    api_base: http://localhost:1234/v1
    ## NOTE(review): assumed to belong to the embedding llm block - confirm.
    concurrent_requests: 1
    ## Azure-only options; un-comment in place so they stay nested here.
    # api_base: https://<instance>.openai.azure.com
    # api_version: 2024-02-15-preview
    # audience: "https://cognitiveservices.azure.com/.default"
    # organization: <organization_id>
    # deployment_name: <azure_model_deployment_name>
### Input settings ###
## Source documents: UTF-8 text files under `input` whose names match
## file_pattern (the regex selects names ending in ".txt").
input:
  type: file # or blob
  file_type: text # or csv
  base_dir: "input"
  file_encoding: utf-8
  file_pattern: ".*\\.txt$"
## Text chunking: chunk size and overlap between consecutive chunks.
## NOTE(review): size/overlap are presumably measured in tokens - confirm.
chunks:
  size: 1200
  overlap: 100
  group_by_columns: [id]
### Storage settings ###
## If blob storage is specified in the following four sections,
## connection_string and container_name must be provided
## Cache location; `type` and `base_dir` belong to `cache`, not the top level.
cache:
  type: file # one of [blob, cosmosdb, file]
  base_dir: "cache"
## Reporting output; `type` and `base_dir` belong to `reporting`.
reporting:
  type: file # or console, blob
  base_dir: "logs"
## Pipeline output storage; `type` and `base_dir` belong to `storage`.
storage:
  type: file # one of [blob, cosmosdb, file]
  base_dir: "output"
## only turn this on if running `graphrag index` with custom settings
## we normally use `graphrag update` with the defaults
update_index_storage:
  ## Empty (null) while the keys below stay commented out. They are indented so
  ## that un-commenting them nests them under update_index_storage.
  # type: file # or blob
  # base_dir: "update_output"
### Workflow settings ###
## No workflows are skipped.
skip_workflows: []

## Entity extraction: prompt file, entity types to extract, gleaning limit.
entity_extraction:
  prompt: "prompts/entity_extraction.txt"
  entity_types: [organization, person, geo, event]
  max_gleanings: 1
## Description summarization: prompt file and maximum summary length.
summarize_descriptions:
  prompt: "prompts/summarize_descriptions.txt"
  max_length: 500
## Claim extraction is switched off here; the remaining keys only apply once
## `enabled` is set to true.
claim_extraction:
  enabled: false
  prompt: "prompts/claim_extraction.txt"
  description: "Any claims or facts that could be relevant to information discovery."
  max_gleanings: 1
## Community reports: prompt file plus output and input length limits.
community_reports:
  prompt: "prompts/community_report.txt"
  max_length: 2000
  max_input_length: 8000
## Graph clustering: upper bound on the size of a single cluster.
cluster_graph:
  max_cluster_size: 10
## Optional node embeddings; both are off. Each `enabled` flag is nested under
## its own section so the two flags no longer collide as duplicate keys.
embed_graph:
  enabled: false # if true, will generate node2vec embeddings for nodes

umap:
  enabled: false # if true, will generate UMAP embeddings for nodes (embed_graph must also be enabled)
## Optional snapshot outputs; all are switched off.
snapshots:
  graphml: false
  embeddings: false
  transient: false
### Query settings ###
## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned.
## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query
## Local search: system prompt file.
local_search:
  prompt: "prompts/local_search_system_prompt.txt"
## Global search: separate prompt files for the map, reduce and knowledge steps.
global_search:
  map_prompt: "prompts/global_search_map_system_prompt.txt"
  reduce_prompt: "prompts/global_search_reduce_system_prompt.txt"
  knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt"
## DRIFT search: system prompt file.
drift_search:
  prompt: "prompts/drift_search_system_prompt.txt"
## Basic search: system prompt file.
basic_search:
  prompt: "prompts/basic_search_system_prompt.txt"