Basic implementation of a plugin system for OA #2765

Merged · 19 commits · May 2, 2023 · Changes shown from 5 commits
@@ -79,12 +79,15 @@ async def abort_work(self, message_id: str, reason: str) -> models.DbMessage:
await self.session.refresh(message)
return message

async def complete_work(self, message_id: str, content: str) -> models.DbMessage:
async def complete_work(
self, message_id: str, content: str, work_parameters: inference.WorkParameters
) -> models.DbMessage:
logger.debug(f"Completing work on message {message_id}")
message = await self.get_assistant_message_by_id(message_id)
message.state = inference.MessageState.complete
message.work_end_at = datetime.datetime.utcnow()
message.content = content
message.work_parameters = work_parameters
await self.session.commit()
logger.debug(f"Completed work on message {message_id}")
await self.session.refresh(message)
1 change: 1 addition & 0 deletions inference/server/oasst_inference_server/routes/chats.py
@@ -102,6 +102,7 @@ async def create_assistant_message(
work_parameters = inference.WorkParameters(
model_config=model_config,
sampling_parameters=request.sampling_parameters,
plugins=request.plugins,
)
assistant_message = await ucr.initiate_assistant_message(
parent_id=request.parent_id,
74 changes: 74 additions & 0 deletions inference/server/oasst_inference_server/routes/configs.py
@@ -1,9 +1,23 @@
import json

import fastapi
import pydantic
import requests
import yaml
from loguru import logger
from oasst_inference_server.settings import settings
from oasst_shared import model_configs
from oasst_shared.schemas import inference

# NOTE: Replace this with plugins that we will provide out of the box
DUMMY_PLUGINS = [
inference.PluginEntry(
url="http://192.168.0.35:8085/ai-plugin.json",
enabled=False,
trusted=True,
),
]

router = fastapi.APIRouter(
prefix="/configs",
tags=["configs"],
@@ -73,3 +87,63 @@ async def get_model_configs() -> list[ModelConfigInfo]:
for model_config_name in model_configs.MODEL_CONFIGS
if (settings.allowed_model_config_names == "*" or model_config_name in settings.allowed_model_config_names_list)
]


@router.post("/plugin_config")
async def get_plugin_config(plugin: inference.PluginEntry) -> inference.PluginEntry | fastapi.HTTPException:
plugin_config = None
try:
response = requests.get(plugin.url)
response.raise_for_status()
except requests.exceptions.RequestException:
return fastapi.HTTPException(status_code=404, detail="Plugin not found")

config = {}
try:
content_type = response.headers.get("Content-Type")
if "application/json" in content_type or plugin.url.endswith(".json"):
config = json.loads(response.text)
elif (
"application/yaml" in content_type
or "application/x-yaml" in content_type
or plugin.url.endswith(".yaml")
or plugin.url.endswith(".yml")
):
config = yaml.safe_load(response.text)
else:
raise Exception(f"Unsupported content type: {content_type}. Only JSON and YAML are supported.")

plugin_config = inference.PluginConfig(**config)
except Exception as e:
return fastapi.HTTPException(status_code=404, detail="Failed to parse plugin config, error: " + str(e))

return inference.PluginEntry(url=plugin.url, enabled=plugin.enabled, plugin_config=plugin_config)
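
For reference, here is a minimal sketch of how a client might call this endpoint; the base URL and the manifest URL are assumptions for illustration, not part of this PR:

```python
import requests

BASE_URL = "http://localhost:8000"  # hypothetical inference server address

# PluginEntry payload as defined in this PR: url, enabled, trusted.
payload = {
    "url": "https://www.klarna.com/.well-known/ai-plugin.json",
    "enabled": True,
    "trusted": False,
}

# The router is mounted under the /configs prefix, so the full path is /configs/plugin_config.
resp = requests.post(f"{BASE_URL}/configs/plugin_config", json=payload)
print(resp.json())  # on success: the PluginEntry echoed back with its parsed plugin_config attached
```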


@router.get("/builtin_plugins")
async def get_builtin_plugins() -> list[inference.PluginEntry] | fastapi.HTTPException:
plugins = []

for plugin in DUMMY_PLUGINS:
try:
response = requests.get(plugin.url)
response.raise_for_status()
except requests.exceptions.RequestException:
logger.warning(f"Failed to fetch plugin config from {plugin.url}")
continue

try:
plugin_config = inference.PluginConfig(**response.json())
except ValueError:
logger.warning(f"Failed to parse plugin config from {plugin.url}")
continue

final_plugin: inference.PluginEntry = inference.PluginEntry(
url=plugin.url,
enabled=plugin.enabled,
trusted=plugin.trusted,
plugin_config=plugin_config,
)
plugins.append(final_plugin)

return plugins
3 changes: 3 additions & 0 deletions inference/server/oasst_inference_server/routes/workers.py
@@ -340,9 +340,12 @@ async def handle_generated_text_response(
message_id = work_response_container.message_id
async with deps.manual_create_session() as session:
cr = chat_repository.ChatRepository(session=session)
work_parameters = work_response_container.work_request.parameters
work_parameters = work_parameters.copy(update={"used_plugin": response.used_plugin})
**Collaborator:** Does "used" here mean that the model actually chose to use it? If so, could we store this somewhere other than in the work parameters? It is more of a response value than a request value.

**Collaborator (author):** The user turns the plugin on in the frontend, and the model/system may or may not use it. But we always want `used_plugin` back at the frontend, because it contains all the important data, such as the inner monologue, so users/devs can see the full details of how the plugin was used. For example, even if a plugin was turned on but the model didn't use it, `used_plugin` still holds valuable info for the user/dev. Currently that is not rendered in the UI, but it could be.

**Collaborator (author):** Yes, it is a response value. I saw that a contributor (abdbarho, if I remember correctly) added `work_parameters` to the `MessageRead` type so all info can be rendered in the final message, and I just added two more fields to the work parameters for plugins. I could move it elsewhere if you have a suggestion.

**Collaborator (author):** Just to add, we only want that value on the final message; it's not needed before the stream ends.

**Collaborator:** Yes, the work parameters are meant as input, i.e. if I send them somewhere else I should get the same output, so we don't want response values in here. I see that this makes it easy to show in the frontend, but we'll have to find another way of doing that. I think we could store it directly in the message object and default to None, similar to the safety properties that have been added recently (a sketch of this idea follows the diff below).

message = await cr.complete_work(
message_id=message_id,
content=response.text,
work_parameters=work_parameters,
)
logger.info(f"Completed work for {message_id=}")
message_packet = inference.InternalFinishedMessageResponse(
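Following up on the review thread above, here is a minimal sketch of the suggested alternative: keep `used_plugin` out of the work parameters and store it on the message read model instead, defaulting to None. The schema and field names below are hypothetical and not part of this PR.

```python
import pydantic

from oasst_shared.schemas import inference


class MessageRead(pydantic.BaseModel):
    """Hypothetical read model: work_parameters stays a pure request-side input,
    while the response-side plugin result lives in its own nullable field."""

    id: str
    content: str | None = None
    work_parameters: inference.WorkParameters | None = None
    used_plugin: inference.PluginUsed | None = None  # populated only when work completes


# complete_work would then take the plugin result separately, e.g.:
# await cr.complete_work(message_id=message_id, content=response.text, used_plugin=response.used_plugin)
```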
2 changes: 2 additions & 0 deletions inference/server/oasst_inference_server/schemas/chat.py
@@ -14,6 +14,8 @@ class CreateAssistantMessageRequest(pydantic.BaseModel):
parent_id: str
model_config_name: str
sampling_parameters: inference.SamplingParameters = pydantic.Field(default_factory=inference.SamplingParameters)
plugins: list[inference.PluginEntry] = pydantic.Field(default_factory=list[inference.PluginEntry])
used_plugin: inference.PluginUsed | None = None


class PendingResponseEvent(pydantic.BaseModel):
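For illustration, a minimal sketch of building a request with the new plugins field; the IDs, URLs, and model config name are placeholders, not part of this PR:

```python
from oasst_inference_server.schemas.chat import CreateAssistantMessageRequest
from oasst_shared.schemas import inference

# Hypothetical example: one enabled, untrusted third-party plugin attached to the request.
plugin = inference.PluginEntry(
    url="https://example.com/.well-known/ai-plugin.json",  # placeholder manifest URL
    enabled=True,
    trusted=False,
)

request = CreateAssistantMessageRequest(
    parent_id="00000000-0000-0000-0000-000000000000",  # placeholder parent message id
    model_config_name="distilgpt2",  # placeholder model config name
    plugins=[plugin],
)
```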
68 changes: 68 additions & 0 deletions inference/worker/PLUGINS.md
@@ -0,0 +1,68 @@
# Plugin system for OA

This is a basic implementation of support for external augmentation and OpenAI/ChatGPT
plugins in Open-Assistant. In its current state it is more of a proof of concept and
should be used behind an experimental flag.

## Architecture

There is now a middleware layer between work.py (the worker) and the final prompt that
is passed to the inference server for generation and streaming. This middleware checks
whether a plugin is enabled in the UI; if so, it takes over the job of creating curated
pre-prompts for plugin usage and of issuing the subsequent LLM calls (inner monologues)
needed to produce the final, externally **augmented** prompt, which is passed back to
the worker and then on to the inference server for the final LLM generation/streaming
of tokens to the frontend.
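As a rough illustration of that flow, a minimal sketch of the decision the middleware makes is shown below; the helper names are hypothetical, and the real logic lives in chat_chain.py:

```python
from oasst_shared.schemas import inference


def build_final_prompt(parameters: inference.WorkParameters, original_prompt: str) -> str:
    """Hypothetical sketch of the plugin middleware between the worker and inference."""
    enabled = [p for p in parameters.plugins if p.enabled]
    if not enabled:
        # No plugin enabled in the UI: the prompt goes to the inference server unchanged.
        return original_prompt
    # A plugin is enabled: build curated pre-prompts from the plugin's OpenAPI description,
    # run the intermediate LLM calls (inner monologue) to pick and call the external API,
    # and fold the result into the augmented prompt handed back to the worker.
    return run_chat_chain(enabled, original_prompt)


def run_chat_chain(plugins: list[inference.PluginEntry], prompt: str) -> str:
    # Placeholder for the real inner-monologue loop implemented in chat_chain.py.
    return prompt
```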

## Plugins

Plugins are, in essence, thin wrappers around APIs that help the LLM use those APIs
more precisely and reliably, which makes them quite useful and powerful augmentation
tools for Open-Assistant. A plugin has two main parts: the ai-plugin.json file, which
is the plugin's main descriptor, and the OpenAPI specification of the plugin's APIs.
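
For illustration, an abridged ai-plugin.json descriptor is shown below as a Python dict; the values are made up, but the field names follow the OpenAI plugin manifest, and api.url is what points the system at the plugin's OpenAPI specification:

```python
# Abridged, illustrative ai-plugin.json manifest (values are placeholders).
example_manifest = {
    "schema_version": "v1",
    "name_for_human": "Calculator",
    "name_for_model": "calculator",
    "description_for_human": "Evaluate basic math expressions.",
    "description_for_model": "Use this tool to evaluate arithmetic expressions.",
    "auth": {"type": "none"},  # only non-authenticated plugins are supported for now
    "api": {
        "type": "openapi",
        "url": "http://localhost:8085/openapi.json",  # the plugin's OpenAPI specification
    },
    "logo_url": "http://localhost:8085/logo.png",
    "contact_email": "contact@example.com",
    "legal_info_url": "http://example.com/legal",
}
```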

The OpenAI plugin
[specification](https://platform.openai.com/docs/plugins/getting-started) is
currently partially supported by this system.

For now, only non-authentication-based plugins are supported. Some of them are:

- https://www.klarna.com/.well-known/ai-plugin.json
- https://www.joinmilo.com/.well-known/ai-plugin.json

Quite a few more can be found at the
[wellknown.ai plugin "store"](https://www.wellknown.ai/).

One of the ideas behind the plugin system is that we can provide some internal OA
plugins out of the box, while an unlimited number of third-party, community-developed
plugins can be added as well.

One Python-based plugin, the **calculator**, is included for now as a proof of concept
and as learning material showing how to create your own plugins.

### Notes on reliability, performance, and limitations of the plugin system

Performance can vary a lot depending on the models and plugins used; some work better
than others, and this should improve as better models become available. The biggest
limitations at the moment are context size and instruction-following capability. These
are mitigated with prompt tricks, truncation of the plugin OpenAPI descriptions, and
dynamic inclusion/exclusion of parts of the prompts during the internal processing that
generates the intermediate texts (inner monologues). More of the limitations and
possible alternatives are explained in code comments.
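
As one concrete example of the truncation mentioned above, here is a minimal sketch; the character budget and helper name are assumptions, not the actual implementation in chat_chain_utils.py:

```python
def truncate_description(description: str, max_chars: int = 512) -> str:
    """Hypothetical helper: clip a plugin/endpoint description so the assembled
    pre-prompt stays within the model's limited context window."""
    if len(description) <= max_chars:
        return description
    return description[: max_chars - 3].rstrip() + "..."
```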

The current approach is somewhat hybrid and relies on the zero-shot capabilities of a
model. There will be another branch of the plugin system that takes a somewhat
different approach, utilizing smaller embedding transformer models and vector stores,
so that we can A/B test the system alongside new OA model releases.

## Relevant files for the inference side of the plugin system

- chat_chain.py
- chat_chain_utils.py _(tweaking the tools/plugin description string generation can
  help for some models)_
- chat_chain_prompts.py _(tweaking the prompts can also help)_