SciPhi-AI · NolanTrem · Dec 11, 2024 · Dec 10, 2024
diff --git a/py/core/main/services/ingestion_service.py b/py/core/main/services/ingestion_service.py
@@ -249,7 +249,6 @@ async def augment_document_info(
                 task_prompt_name=self.config.ingestion.document_summary_task_prompt,
                 task_inputs={"document": document},
             )
-            # FIXME: Why are we hardcoding the model here?
             response = await self.providers.llm.aget_completion(
                 messages=messages,
                 generation_config=GenerationConfig(

diff --git a/py/core/main/services/kg_service.py b/py/core/main/services/kg_service.py
@@ -1001,9 +1001,19 @@ async def _extract_kg(
         # combine all extractions into a single string
         combined_extraction: str = " ".join([chunk.data for chunk in chunks])  # type: ignore
 
+        response = await self.providers.database.document_handler.get_documents_overview(  # type: ignore
+            offset=0,
+            limit=1,
+            filter_document_ids=[chunks[0].document_id],
+        )
+        document_summary = (
+            response["results"][0].summary if response["results"] else None
+        )
+
         messages = await self.providers.database.prompt_handler.get_message_payload(
             task_prompt_name=self.providers.database.config.graph_creation_settings.graphrag_relationships_extraction_few_shot,
             task_inputs={
+                "document_summary": document_summary,
                 "input": combined_extraction,
                 "max_knowledge_relationships": max_knowledge_relationships,
                 "entity_types": "\n".join(entity_types),

diff --git a/py/core/pipes/kg/description.py b/py/core/pipes/kg/description.py
@@ -4,13 +4,12 @@
 import logging
 import random
 import time
-from typing import Any, AsyncGenerator, Optional
+from typing import Any, AsyncGenerator
 from uuid import UUID
 
 from core.base import (
     AsyncState,
     CompletionProvider,
-    DatabaseProvider,
     EmbeddingProvider,
 )
 from core.base.abstractions import Entity
@@ -74,8 +73,19 @@ def truncate_info(info_list, max_length):
             return truncated_info
 
         async def process_entity(
-            entities, relationships, max_description_input_length, document_id
+            entities,
+            relationships,
+            max_description_input_length,
+            document_id: UUID,
         ):
+            response = await self.database_provider.document_handler.get_documents_overview(  # type: ignore
+                offset=0,
+                limit=1,
+                filter_document_ids=[document_id],
+            )
+            document_summary = (
+                response["results"][0].summary if response["results"] else None
+            )
 
             entity_info = [
                 f"{entity.name}, {entity.description}" for entity in entities
@@ -100,6 +110,7 @@ async def process_entity(
                             messages=await self.database_provider.prompt_handler.get_message_payload(
                                 task_prompt_name=self.database_provider.config.graph_creation_settings.graph_entity_description_prompt,
                                 task_inputs={
+                                    "document_summary": document_summary,
                                     "entity_info": truncate_info(
                                         entity_info,
                                         max_description_input_length,
@@ -157,14 +168,16 @@ async def process_entity(
 
         workflows = []
 
-        for i, (entity_name, entity_info) in enumerate(entity_map.items()):
+        for _, (entity_name, entity_info) in enumerate(entity_map.items()):
             try:
                 workflows.append(
                     process_entity(
-                        entity_info["entities"],
-                        entity_info["relationships"],
-                        input.message["max_description_input_length"],
-                        document_id,
+                        entities=entity_info["entities"],
+                        relationships=entity_info["relationships"],
+                        max_description_input_length=input.message[
+                            "max_description_input_length"
+                        ],
+                        document_id=document_id,
                     )
                 )
             except Exception as e:

diff --git a/py/core/providers/database/prompts/graphrag_entity_description.yaml b/py/core/providers/database/prompts/graphrag_entity_description.yaml
@@ -1,20 +1,39 @@
 graphrag_entity_description:
   template: |
-    Provide a comprehensive yet concise summary of the given entity, incorporating its description and associated relationships:
+    Given the following information about an entity:
 
-    Entity Info:
+    Document Summary:
+    {document_summary}
+
+    Entity Information:
     {entity_info}
-    Relationships:
+
+    Relationship Data:
     {relationships_txt}
 
-    Your summary should:
-    1. Clearly define the entity's core concept or purpose
-    2. Highlight key relationships or attributes from the relationships
-    3. Integrate any relevant information from the existing description
-    4. Maintain a neutral, factual tone
-    5. Be approximately 2-3 sentences long
+    Generate a comprehensive entity description that:
+
+    1. Opens with a clear definition statement identifying the entity's primary classification and core function
+    2. Incorporates key data points from both the document summary and relationship information
+    3. Emphasizes the entity's role within its broader context or system
+    4. Highlights critical relationships, particularly those that:
+      - Demonstrate hierarchical connections
+      - Show functional dependencies
+      - Indicate primary use cases or applications
+
+    Format Requirements:
+    - Length: 2-3 sentences
+    - Style: Technical and precise
+    - Structure: Definition + Context + Key Relationships
+    - Tone: Objective and authoritative
+
+    Integration Guidelines:
+    - Prioritize information that appears in multiple sources
+    - Resolve any conflicting information by favoring the most specific source
+    - Include temporal context if relevant to the entity's current state or evolution
 
-    Ensure the summary is coherent, informative, and captures the essence of the entity within the context of the provided information.
+    Output should reflect the entity's complete nature while maintaining concision and clarity.
   input_types:
+    document_summary: str
     entity_info: str
     relationships_txt: str
diff --git a/py/core/providers/database/prompts/graphrag_relationships_extraction_few_shot.yaml b/py/core/providers/database/prompts/graphrag_relationships_extraction_few_shot.yaml
@@ -1,28 +1,36 @@
 graphrag_relationships_extraction_few_shot:
   template: >
     -Goal-
-    Given a text document, identify all entities and their entity types from the text and all relationships among the identified entities.
-    Given the text, extract up to {max_knowledge_relationships} entity-relation relationshipts.
+    Given both a document summary and full text, identify all entities and their entity types, along with all relationships among the identified entities.
+    Extract up to {max_knowledge_relationships} entity-relation relationships using both the summary context and full text.
+
+    -Context Summary-
+    {document_summary}
+
     -Steps-
     1. Identify all entities. For each identified entity, extract the following information:
+    Work from the full text, grounding and contextualizing each entity based on the summary, and record these fields for each one:
     - entity_name: Name of the entity, capitalized
-    - entity_type: Type of the entity. If the list below is not empty, only extract entities of the given types. If this list is empty, extract all entities.
-    {entity_types}
-    - entity_description: Comprehensive description of the entity's attributes and activities that is explicitly mentioned in the text.
-    Format each entity as ("entity"$$$$<entity_name>$$$$<entity_type>$$$$<entity_description>).
-    Output additional entities based on the entity_description if they contain more named entities. You will later use this to create relationships between them.
-    2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
-    For each pair of related entities, extract the following information:
-    - source_entity: name of the source entity, as identified in step 1
-    - target_entity: name of the target entity, as identified in step 1
-    - relation: relationship between source_entity and target_entity. If the list below is not empty, only extract relations of the given types. If this list is empty, extract all relations.
-    {relation_types}
-    - relationship_description: explanation as to why you think the source entity and the target entity are related to each other
-    - relationship_weight: a weight between 0 and 10 that represents the strength of the relationship between the source entity and the target entity
+    - entity_type: Type of the entity (constrained to {entity_types} if provided, otherwise all types)
+    - entity_description: Comprehensive description incorporating context from both summary and full text
+
+    Format each entity as ("entity"$$$$<entity_name>$$$$<entity_type>$$$$<entity_description>)
+    Note: Generate additional entities from descriptions if they contain named entities for relationship mapping.
+
+    2. From the identified entities, identify all related entity pairs, using both summary and full text context:
+    - source_entity: name of the source entity
+    - target_entity: name of the target entity
+    - relation: relationship type (constrained to {relation_types} if provided)
+    - relationship_description: justification based on both summary and full text context
+    - relationship_weight: strength score 0-10
+
     Format each relationship as ("relationship"$$$$<source_entity>$$$$<target_entity>$$$$<relation>$$$$<relationship_description>$$$$<relationship_weight>)
-    3. Make sure that each entity has at least one relationship. If you are not able to find a relationship from the pair of entities above, first create a new entity based on the source entity description and then create a relationship connecting them.
-       Again, make sure that each entity has at least one relationship. If you don't do this, you will be fired.
-    3. When finished, output in the format in as given in the examples below. Do not repeat the same entity or relationship multiple times.
+
+    3. Coverage Requirements:
+    - Each entity must have at least one relationship
+    - Create intermediate entities if needed to establish relationships
+    - Verify relationships against both summary and full text
+    - Resolve any discrepancies between sources
 
     Example 1:
     If the list is empty, extract all entities and relations.
@@ -111,12 +119,15 @@ graphrag_relationships_extraction_few_shot:
     Entity_types: {entity_types}
     Relation_types: {relation_types}
 
-    Text:
+    Document Summary:
+    {document_summary}
+
+    Full Text:
     {input}
     ######################
     Output:
-
   input_types:
+    document_summary: str
     max_knowledge_relationships: int
     input: str
     entity_types: list[str]

diff --git a/py/migrations/versions/c45a9cf6a8a4_add_user_and_document_count_to_.py b/py/migrations/versions/c45a9cf6a8a4_add_user_and_document_count_to_.py
@@ -1,7 +1,7 @@
 """Add user and document count to collection
 
 Revision ID: c45a9cf6a8a4
-Revises: 
+Revises:
 Create Date: 2024-12-10 13:28:07.798167
 
 """