Skip to content

Commit

Permalink
Merge branch 'SN1-423-restructure-prompting' into 'SN1-419-r-d-resear…
Browse files Browse the repository at this point in the history
…ch-better-scoring-mechanisms-for-msr'
  • Loading branch information
richwardle committed Mar 4, 2025
2 parents 1fad896 + 0245a2a commit 46fe23a
Show file tree
Hide file tree
Showing 8 changed files with 176 additions and 105 deletions.
4 changes: 2 additions & 2 deletions prompting/rewards/inference_reward_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@ async def reward(
) -> BatchRewardOutput:
"""Gives an exact reward of 1 if the response matches the reference, 0 otherwise"""
if model_id:
return ExactMatchRewardModel().reward(reference, response_event)
return RelevanceRewardModel().reward(reference, response_event)
return await ExactMatchRewardModel().reward(reference, response_event)
return await RelevanceRewardModel().reward(reference, response_event)
File renamed without changes.
125 changes: 125 additions & 0 deletions shared/prompts/test_time_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import textwrap


def intro_prompt() -> str:
    """Return the system prompt that initiates structured step-by-step reasoning.

    The prompt instructs the model to emit each reasoning step as a JSON
    object with ``title``, ``content`` and ``next_action`` fields.

    Returns:
        str: The full system prompt, stripped of trailing whitespace.
    """
    # Fix: the worked example previously read "{Expected Answer:" and closed
    # with "}}" — stray/mismatched braces (the pre-refactor inline prompt used
    # "Expected Answer:" followed by a single-brace JSON object).
    intro = textwrap.dedent(
        """\
        You are a world-class expert in analytical reasoning and problem-solving. Your task is to break down complex problems through rigorous step-by-step analysis, carefully examining each aspect before moving forward. For each reasoning step:
        OUTPUT FORMAT:
        Return a JSON object with these required fields:
        {
            "title": "Brief, descriptive title of current reasoning phase",
            "content": "Detailed explanation of your analysis",
            "next_action": "continue" or "final_answer"
        }
        REASONING PROCESS:
        1. Initial Analysis
        - Break down the problem into core components
        - Identify key constraints and requirements
        - List relevant domain knowledge and principles
        2. Multiple Perspectives
        - Examine the problem from at least 3 different angles
        - Consider both conventional and unconventional approaches
        - Identify potential biases in initial assumptions
        3. Exploration & Validation
        - Test preliminary conclusions against edge cases
        - Apply domain-specific best practices
        - Quantify confidence levels when possible (e.g., 90% certain)
        - Document key uncertainties or limitations
        4. Critical Review
        - Actively seek counterarguments to your reasoning
        - Identify potential failure modes
        - Consider alternative interpretations of the data/requirements
        - Validate assumptions against provided context
        5. Synthesis & Refinement
        - Combine insights from multiple approaches
        - Strengthen weak points in the reasoning chain
        - Address identified edge cases and limitations
        - Build towards a comprehensive solution
        REQUIREMENTS:
        - Each step must focus on ONE specific aspect of reasoning
        - Explicitly state confidence levels and uncertainty
        - When evaluating options, use concrete criteria
        - Include specific examples or scenarios when relevant
        - Acknowledge limitations in your knowledge or capabilities
        - Maintain logical consistency across steps
        - Build on previous steps while avoiding redundancy
        CRITICAL THINKING CHECKLIST:
        ✓ Have I considered non-obvious interpretations?
        ✓ Are my assumptions clearly stated and justified?
        ✓ Have I identified potential failure modes?
        ✓ Is my confidence level appropriate given the evidence?
        ✓ Have I adequately addressed counterarguments?
        Remember: Quality of reasoning is more important than speed. Take the necessary steps to build a solid analytical foundation before moving to conclusions.
        Example:
        User Query: How many piano tuners are in New York City?
        Expected Answer:
        {
            "title": "Estimating the Number of Piano Tuners in New York City",
            "content": "To estimate the number of piano tuners in NYC, we need to break down the problem into core components. Key factors include the total population of NYC, the number of households with pianos, the average number of pianos per household, and the frequency of piano tuning. We should also consider the number of professional piano tuners and their workload.",
            "next_action": "continue"
        }
        """
    ).strip()

    return intro


def system_acceptance_prompt() -> str:
    """Return the assistant's acknowledgement message.

    Injected as the assistant's first reply so the model commits to the
    structured reasoning process before the conversation continues.

    Returns:
        str: A single-sentence acceptance statement.
    """
    return (
        "I understand. I will now analyze the problem systematically, "
        "following the structured reasoning process while maintaining "
        "high standards of analytical rigor and self-criticism."
    )


def final_answer_prompt() -> str:
    """Return the prompt that asks the model to synthesize a final answer.

    Returns:
        str: The final-answer instruction, including a JSON template the
        model must follow, stripped of trailing whitespace.
    """
    # Fix: the JSON template previously used "{{" / "}}" — str.format-style
    # brace escaping — but this string is returned as-is (never formatted)
    # and sent directly as message content, so the model would receive
    # literal double braces. Single braces restore the intended template.
    final_answer = textwrap.dedent(
        """\
        Review your previous reasoning steps and synthesize them into a final answer.
        Your response should:
        1. Clearly state your final conclusion.
        2. Summarize the key reasoning and evidence from previous steps.
        3. Address any remaining uncertainties or alternative perspectives.
        4. Note any relevant caveats or limitations to your conclusion.
        Ensure the response is concise, well-structured, and avoids unnecessary repetition.
        Do not include explicit confidence levels or probabilities.
        Format your response as valid JSON:
        {
            "title": "Final Answer",
            "content": "Your synthesized conclusion and explanation here.",
            "next_action": "final_answer"
        }
        """
    ).strip()

    return final_answer
29 changes: 29 additions & 0 deletions tests/prompting/shared/test_get_prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from shared.prompts.test_time_inference import final_answer_prompt, intro_prompt, system_acceptance_prompt


def test_intro_prompt():
    """intro_prompt returns a string containing every major prompt section."""
    prompt = intro_prompt()
    assert isinstance(prompt, str)
    expected_sections = (
        "You are a world-class expert in analytical reasoning",
        "OUTPUT FORMAT:",
        "REASONING PROCESS:",
        "REQUIREMENTS:",
        "CRITICAL THINKING CHECKLIST:",
    )
    for section in expected_sections:
        assert section in prompt


def test_system_acceptance_prompt():
    """system_acceptance_prompt returns the acknowledgement string."""
    result = system_acceptance_prompt()
    assert isinstance(result, str)
    expected_fragment = "I understand. I will now analyze the problem systematically"
    assert expected_fragment in result


def test_final_answer_prompt():
    """final_answer_prompt returns the synthesis prompt with JSON fields."""
    prompt = final_answer_prompt()
    assert isinstance(prompt, str)
    required_fragments = [
        "Review your previous reasoning steps",
        "Format your response as valid JSON",
        '"title":',
        '"content":',
    ]
    assert all(fragment in prompt for fragment in required_fragments)
30 changes: 12 additions & 18 deletions validator_api/gpt_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,13 @@ async def completions(request: CompletionsRequest, api_key: str = Depends(valida

# Choose between regular inference, test time inference, and mixture of miners.
if body.get("test_time_inference", False):
return await test_time_inference(request)
test_time_request = TestTimeInferenceRequest(
messages=request.messages,
model=request.model,
uids=uids if uids else None,
json_format=request.json_format,
)
return await test_time_inference(test_time_request)
elif body.get("mixture", False):
return await mixture_of_miners(body, uids=uids)
else:
Expand Down Expand Up @@ -230,20 +236,6 @@ async def web_retrieval(
return WebRetrievalResponse(results=unique_results)


@router.post(
"/test_time_inference",
summary="Test time inference endpoint",
description="Provides step-by-step reasoning and thinking process during inference.",
response_description="Streaming response with reasoning steps",
status_code=status.HTTP_200_OK,
responses={
status.HTTP_200_OK: {
"description": "Successful streaming response with reasoning steps",
"content": {"text/event-stream": {}},
},
status.HTTP_500_INTERNAL_SERVER_ERROR: {"description": "Internal server error during streaming"},
},
)
async def test_time_inference(request: TestTimeInferenceRequest):
"""
Test time inference endpoint that provides step-by-step reasoning.
Expand Down Expand Up @@ -289,14 +281,16 @@ async def stream_steps():
i = 0
async for steps, thinking_time in create_response_stream(request):
i += 1
if request.json_format:
choice = Choice(index=i, delta=ChoiceDelta(content=json.dumps(steps[-1])))
else:
choice = Choice(index=i, delta=ChoiceDelta(content=f"## {steps[-1][0]}\n\n{steps[-1][1]}" + "\n\n"))
yield "data: " + ChatCompletionChunk(
id=str(uuid.uuid4()),
created=int(time.time()),
model=request.model or "None",
object="chat.completion.chunk",
choices=[
Choice(index=i, delta=ChoiceDelta(content=f"## {steps[-1][0]}\n\n{steps[-1][1]}" + "\n\n"))
],
choices=[choice],
).model_dump_json() + "\n\n"
except Exception as e:
logger.exception(f"Error during streaming: {e}")
Expand Down
2 changes: 2 additions & 0 deletions validator_api/scoring_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ async def run_step(self):
# Raise an exception so that the retry logic in the except block handles it.
raise Exception(f"Non-200 response: {response.status_code} for uids {uids}")
logger.debug(f"Forwarding response completed with status {response.status_code}")
except httpx.ConnectError as e:
logger.warning(f"Couldn't connect to validator {url} for Scoring {uids}. Exception: {e}")
except Exception as e:
if scoring_payload.retries < self.max_scoring_retries:
scoring_payload.retries += 1
Expand Down
2 changes: 2 additions & 0 deletions validator_api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class CompletionsRequest(BaseModel):
"do_sample": True,
},
)
json_format: bool = Field(default=False, description="Enable JSON format for the response.", example=True)


class WebRetrievalRequest(BaseModel):
Expand Down Expand Up @@ -107,6 +108,7 @@ class TestTimeInferenceRequest(BaseModel):
example=[{"role": "user", "content": "Solve the equation: 3x + 5 = 14"}],
)
model: Optional[str] = Field(default=None, description="Model identifier to use for inference.", example="gpt-4")
json_format: bool = Field(default=False, description="Enable JSON format for the response.", example=True)

def to_dict(self):
return self.model_dump().update({"messages": [m.model_dump() for m in self.messages]})
89 changes: 4 additions & 85 deletions validator_api/test_time_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from prompting.llms.apis.llm_messages import LLMMessage, LLMMessages
from prompting.llms.apis.llm_wrapper import LLMWrapper
from shared.prompts.test_time_inference import final_answer_prompt, intro_prompt, system_acceptance_prompt
from shared.timer import Timer
from validator_api.chat_completion import chat_completion

Expand Down Expand Up @@ -168,81 +169,14 @@ async def generate_response(
messages = [
{
"role": "system",
"content": """You are a world-class expert in analytical reasoning and problem-solving. Your task is to break down complex problems through rigorous step-by-step analysis, carefully examining each aspect before moving forward. For each reasoning step:
OUTPUT FORMAT:
Return a JSON object with these required fields:
{
"title": "Brief, descriptive title of current reasoning phase",
"content": "Detailed explanation of your analysis",
"next_action": "continue" or "final_answer"
}
REASONING PROCESS:
1. Initial Analysis
- Break down the problem into core components
- Identify key constraints and requirements
- List relevant domain knowledge and principles
2. Multiple Perspectives
- Examine the problem from at least 3 different angles
- Consider both conventional and unconventional approaches
- Identify potential biases in initial assumptions
3. Exploration & Validation
- Test preliminary conclusions against edge cases
- Apply domain-specific best practices
- Quantify confidence levels when possible (e.g., 90% certain)
- Document key uncertainties or limitations
4. Critical Review
- Actively seek counterarguments to your reasoning
- Identify potential failure modes
- Consider alternative interpretations of the data/requirements
- Validate assumptions against provided context
5. Synthesis & Refinement
- Combine insights from multiple approaches
- Strengthen weak points in the reasoning chain
- Address identified edge cases and limitations
- Build towards a comprehensive solution
REQUIREMENTS:
- Each step must focus on ONE specific aspect of reasoning
- Explicitly state confidence levels and uncertainty
- When evaluating options, use concrete criteria
- Include specific examples or scenarios when relevant
- Acknowledge limitations in your knowledge or capabilities
- Maintain logical consistency across steps
- Build on previous steps while avoiding redundancy
CRITICAL THINKING CHECKLIST:
✓ Have I considered non-obvious interpretations?
✓ Are my assumptions clearly stated and justified?
✓ Have I identified potential failure modes?
✓ Is my confidence level appropriate given the evidence?
✓ Have I adequately addressed counterarguments?
Remember: Quality of reasoning is more important than speed. Take the necessary steps to build a solid analytical foundation before moving to conclusions.
Example:
User Query: How many piano tuners are in New York City?
Expected Answer:
{
"title": "Estimating the Number of Piano Tuners in New York City",
"content": "To estimate the number of piano tuners in NYC, we need to break down the problem into core components. Key factors include the total population of NYC, the number of households with pianos, the average number of pianos per household, and the frequency of piano tuning. We should also consider the number of professional piano tuners and their workload.",
"next_action": "continue"
}
""",
"content": intro_prompt(),
}
]
messages += original_messages
messages += [
{
"role": "assistant",
"content": "I understand. I will now analyze the problem systematically, following the structured reasoning process while maintaining high standards of analytical rigor and self-criticism.",
"content": system_acceptance_prompt(),
}
]

Expand All @@ -257,7 +191,6 @@ async def generate_response(
total_thinking_time += thinking_time

steps.append((f"Step {step_count}: {step_data['title']}", step_data["content"], thinking_time))

messages.append({"role": "assistant", "content": json.dumps(step_data)})

if step_data["next_action"] == "final_answer" or not step_data.get("next_action"):
Expand All @@ -266,24 +199,10 @@ async def generate_response(
step_count += 1
yield steps, None

final_answer_prompt = """Based on your thorough analysis, please provide your final answer. Your response should:
1. Clearly state your conclusion
2. Summarize the key supporting evidence
3. Acknowledge any remaining uncertainties
4. Include relevant caveats or limitations
5. Synthesis & Refinement
Ensure your response uses the correct json format as follows:
{
"title": "Final Answer",
"content": "Conclusion and detailed explanation of your answer",
}"""

messages.append(
{
"role": "user",
"content": final_answer_prompt,
"content": final_answer_prompt(),
}
)

Expand Down

0 comments on commit 46fe23a

Please sign in to comment.